Training in progress, step 3100

- adapter_model.safetensors +1 -1
- train.log +358 -0

adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:44f8a7d14f22c01a14ae42fef575ccd9bfbced1fa7a387e0f0fe630a34c22899
 size 1204780872
train.log CHANGED
@@ -16276,3 +16276,361 @@ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device=
| 16276 |
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16277 |
{'train/tv_loss': None, 'train/lm_loss': 0.15130637884140016, 'train/info_loss': 0.1600653976202011, 'train/ref_loss': None, 'train/uncertainty_loss': -9.910131338983775e-05, 'train/video_loss': 0.15996628999710083, 'train/total_loss': 0.31127268075942993}
|
| 16278 |
tensor(0.0755, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16279 |
+
[Rank 3] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
|
| 16280 |
+
[Rank 2] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}[Rank 0] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
|
| 16281 |
+
[Rank 1] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
|
| 16282 |
+
|
| 16283 |
+
{'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09, 'epoch': 0.99}
|
| 16284 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16285 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16286 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16287 |
+
{'train/tv_loss': 0.0001784276915714145, 'train/lm_loss': 1.5091327077243478e-05, 'train/info_loss': 1.3351262168725953e-05, 'train/ref_loss': 0.10094004124403, 'train/uncertainty_loss': -6.672072340734303e-05, 'train/video_loss': 0.10231409221887589, 'train/total_loss': 0.10232918709516525}
|
| 16288 |
+
tensor(0.0191, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16289 |
+
tensor(0.1469, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16290 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16291 |
+
tensor(0.4153, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16292 |
+
tensor(0.1078, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16293 |
+
{'train/tv_loss': 0.0001733818091452122, 'train/lm_loss': 2.2076342429500076e-05, 'train/info_loss': 1.5854584489716217e-05, 'train/ref_loss': 0.2894924581050873, 'train/uncertainty_loss': 0.010783981531858444, 'train/video_loss': 0.3016793429851532, 'train/total_loss': 0.3017014265060425}
|
| 16294 |
+
[Rank 1] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
|
| 16295 |
+
[Rank 2] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
|
| 16296 |
+
[Rank 0] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}[Rank 3] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
|
| 16297 |
+
|
| 16298 |
+
{'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09, 'epoch': 0.99}
|
| 16299 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16300 |
+
tensor(0.0729, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16301 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16302 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16303 |
+
{'train/tv_loss': 0.0004507274366915226, 'train/lm_loss': 2.8608183492906394e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.16604295372962952, 'train/uncertainty_loss': -7.334401598200203e-05, 'train/video_loss': 0.16959282755851746, 'train/total_loss': 0.16962143778800964}
|
| 16304 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16305 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.15873619318008425, 'train/info_loss': 0.20619021356105804, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012108908267691732, 'train/video_loss': 0.20606912672519684, 'train/total_loss': 0.36480534076690674}
|
| 16306 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16307 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16308 |
+
tensor(0.0074, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16309 |
+
[Rank 3] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}[Rank 2] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
|
| 16310 |
+
|
| 16311 |
+
[Rank 1] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
|
| 16312 |
+
[Rank 0] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
|
| 16313 |
+
{'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09, 'epoch': 0.99}
|
| 16314 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16315 |
+
tensor(0.4648, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16316 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16317 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16318 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.31027505397796634, 'train/info_loss': 0.33556777238845825, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012801478151232005, 'train/video_loss': 0.3354397714138031, 'train/total_loss': 0.6457148194313049}
|
| 16319 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16320 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16321 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16322 |
+
{'train/tv_loss': 0.0001363527961075306, 'train/lm_loss': 2.822676906362176e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.13075466454029083, 'train/uncertainty_loss': -6.930269300937652e-05, 'train/video_loss': 0.1317932903766632, 'train/total_loss': 0.13182151317596436}
|
| 16323 |
+
tensor(0.0040, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16324 |
+
[Rank 2] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
|
| 16325 |
+
[Rank 0] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}[Rank 3] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
|
| 16326 |
+
|
| 16327 |
+
{'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09, 'epoch': 0.99}
|
| 16328 |
+
[Rank 1] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
|
| 16329 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16330 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16331 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16332 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16333 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.30461447238922124, 'train/info_loss': 0.1613609790802002, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011446572607383133, 'train/video_loss': 0.16124650835990906, 'train/total_loss': 0.4658609926700592}
|
| 16334 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16335 |
+
tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16336 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2111635446548462, 'train/info_loss': 0.14900483191013336, 'train/ref_loss': None, 'train/uncertainty_loss': -8.154477691277862e-05, 'train/video_loss': 0.14892329275608063, 'train/total_loss': 0.3600868582725525}
|
| 16337 |
+
tensor(0.1748, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16338 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16339 |
+
[Rank 1] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
|
| 16340 |
+
[Rank 3] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
|
| 16341 |
+
[Rank 0] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}[Rank 2] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
|
| 16342 |
+
|
| 16343 |
+
{'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09, 'epoch': 0.99}
|
| 16344 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16345 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16346 |
+
tensor(0.0091, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16347 |
+
tensor(0.0925, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16348 |
+
{'train/tv_loss': 0.00016606017015874386, 'train/lm_loss': 7.689904887229205e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.2799209952354431, 'train/uncertainty_loss': 0.009252391010522843, 'train/video_loss': 0.29052385687828064, 'train/total_loss': 0.2906007468700409}
|
| 16349 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16350 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16351 |
+
tensor(0.1732, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16352 |
+
tensor(0.1812, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16353 |
+
{'train/tv_loss': 0.00012968671508133413, 'train/lm_loss': 3.659390495158732e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.3349378705024719, 'train/uncertainty_loss': 0.018121950328350067, 'train/video_loss': 0.35411736369132996, 'train/total_loss': 0.35415396094322205}
|
| 16354 |
+
[Rank 3] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
|
| 16355 |
+
[Rank 2] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
|
| 16356 |
+
[Rank 0] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}[Rank 1] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
|
| 16357 |
+
|
| 16358 |
+
{'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09, 'epoch': 0.99}
|
| 16359 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16360 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(0.0697, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16361 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16362 |
+
{'train/tv_loss': 0.00016026009107008579, 'train/lm_loss': 1.6855483409017324e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.26810598373413086, 'train/uncertainty_loss': 0.006967854499816895, 'train/video_loss': 0.2763703167438507, 'train/total_loss': 0.27638718485832214}
|
| 16363 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16364 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16365 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16366 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.10554903745651245, 'train/info_loss': 0.11926790326833725, 'train/ref_loss': None, 'train/uncertainty_loss': -9.559270110912622e-05, 'train/video_loss': 0.11917231231927872, 'train/total_loss': 0.22472134232521057}
|
| 16367 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16368 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16369 |
+
[Rank 3] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}[Rank 1] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}[Rank 0] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}
|
| 16370 |
+
|
| 16371 |
+
[Rank 2] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}
|
| 16372 |
+
|
| 16373 |
+
{'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09, 'epoch': 0.99}
|
| 16374 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16375 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.10073459148406982, 'train/info_loss': 0.14998847246170044, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012555404100567102, 'train/video_loss': 0.14986291527748108, 'train/total_loss': 0.2505975067615509}
|
| 16376 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16377 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16378 |
+
tensor(0.2004, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16379 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16380 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16381 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.39838287830352787, 'train/info_loss': 0.1479235589504242, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012364789145067335, 'train/video_loss': 0.14779990911483765, 'train/total_loss': 0.5461827516555786}
|
| 16382 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16383 |
+
tensor(-0.0006, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16384 |
+
[Rank 1] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}[Rank 2] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
|
| 16385 |
+
[Rank 3] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
|
| 16386 |
+
|
| 16387 |
+
[Rank 0] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
|
| 16388 |
+
{'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09, 'epoch': 0.99}
|
| 16389 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16390 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16391 |
+
{'train/tv_loss': 0.00012514020781964063, 'train/lm_loss': 1.3207952724769713e-05, 'train/info_loss': 1.2516818969743326e-05, 'train/ref_loss': 0.19032391905784607, 'train/uncertainty_loss': -6.848637713119388e-05, 'train/video_loss': 0.19126906991004944, 'train/total_loss': 0.1912822723388672}
|
| 16392 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16393 |
+
tensor(0.2011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16394 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16395 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.21502296924591066, 'train/info_loss': 0.1758623719215393, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011166655458509922, 'train/video_loss': 0.1757507026195526, 'train/total_loss': 0.3907736539840698}
|
| 16396 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16397 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16398 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16399 |
+
[Rank 2] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}[Rank 0] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}[Rank 3] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}
|
| 16400 |
+
|
| 16401 |
+
|
| 16402 |
+
[Rank 1] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}
|
| 16403 |
+
{'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09, 'epoch': 1.0}
|
| 16404 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16405 |
+
tensor(0.0269, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16406 |
+
{'train/tv_loss': 0.00012117947917431593, 'train/lm_loss': 2.856050559785217e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.23599904775619507, 'train/uncertainty_loss': 0.0026872064918279648, 'train/video_loss': 0.23967339098453522, 'train/total_loss': 0.23970195651054382}
|
| 16407 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16408 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16409 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16410 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2651496648788452, 'train/info_loss': 0.16941337287425995, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012172098504379392, 'train/video_loss': 0.1692916452884674, 'train/total_loss': 0.43444132804870605}
|
| 16411 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16412 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16413 |
+
tensor(0.0431, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16414 |
+
[Rank 2] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}[Rank 3] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}[Rank 0] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}
|
| 16415 |
+
|
| 16416 |
+
|
| 16417 |
+
[Rank 1] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}
|
| 16418 |
+
{'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09, 'epoch': 1.0}
|
| 16419 |
+
tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16420 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16421 |
+
tensor(0.1725, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16422 |
+
tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16423 |
+
{'train/tv_loss': 0.00032057708594948056, 'train/lm_loss': 1.3088752166368068e-05, 'train/info_loss': 1.3112849956087302e-05, 'train/ref_loss': 0.06266696751117706, 'train/uncertainty_loss': -7.520327344536782e-05, 'train/video_loss': 0.06516949087381363, 'train/total_loss': 0.06518258154392242}
|
| 16424 |
+
tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16425 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16426 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.35767147541046146, 'train/info_loss': 0.1781483143568039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011188344797119498, 'train/video_loss': 0.17803643643856049, 'train/total_loss': 0.5357078909873962}
|
| 16427 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16428 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16429 |
+
[Rank 2] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}[Rank 3] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}[Rank 0] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}
|
| 16430 |
+
|
| 16431 |
+
|
| 16432 |
+
[Rank 1] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}
|
| 16433 |
+
{'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10, 'epoch': 1.0}
|
| 16434 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16435 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16436 |
+
tensor(0.1212, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16437 |
+
{'train/tv_loss': 0.00014093549689278006, 'train/lm_loss': 1.683164300629869e-05, 'train/info_loss': 1.4185704458213877e-05, 'train/ref_loss': 0.2974424362182617, 'train/uncertainty_loss': 0.012122622132301331, 'train/video_loss': 0.3107067346572876, 'train/total_loss': 0.31072357296943665}
|
| 16438 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16439 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16440 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16441 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16442 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16443 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.05586314797401429, 'train/info_loss': 0.11093997955322266, 'train/ref_loss': None, 'train/uncertainty_loss': -8.851074380800128e-05, 'train/video_loss': 0.1108514666557312, 'train/total_loss': 0.166714608669281}
|
| 16444 |
+
[Rank 0] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}[Rank 1] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}[Rank 3] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}
|
| 16445 |
+
|
| 16446 |
+
|
| 16447 |
+
[Rank 2] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}
|
| 16448 |
+
{'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10, 'epoch': 1.0}
|
| 16449 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16450 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16451 |
+
tensor(0.1352, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16452 |
+
{'train/tv_loss': 0.00017385379178449514, 'train/lm_loss': 6.800925475545228e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.30475878715515137, 'train/uncertainty_loss': 0.013520647585391999, 'train/video_loss': 0.3196900188922882, 'train/total_loss': 0.31975802779197693}
|
| 16453 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16454 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16455 |
+
tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16456 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16457 |
+
{'train/tv_loss': 0.00015056305564939976, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.18287095427513123, 'train/uncertainty_loss': -7.037441246211529e-05, 'train/video_loss': 0.18402278423309326, 'train/total_loss': 0.18404754996299744}
|
| 16458 |
+
tensor(0.1576, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16459 |
+
[Rank 3] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
|
| 16460 |
+
[Rank 0] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}[Rank 1] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
|
| 16461 |
+
|
| 16462 |
+
[Rank 2] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
|
| 16463 |
+
{'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10, 'epoch': 1.0}
|
| 16464 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16465 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16466 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16467 |
+
tensor(0.0547, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16468 |
+
{'train/tv_loss': 9.430212085135282e-05, 'train/lm_loss': 1.3041071360930802e-05, 'train/info_loss': 1.293404056923464e-05, 'train/ref_loss': 0.24797077476978302, 'train/uncertainty_loss': 0.005471675470471383, 'train/video_loss': 0.2542097866535187, 'train/total_loss': 0.2542228400707245}
|
| 16469 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16470 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16471 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16472 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16473 |
+
{'train/tv_loss': 0.0001686649862676859, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4662527973996475e-05, 'train/ref_loss': 0.16706281900405884, 'train/uncertainty_loss': -7.104419637471437e-05, 'train/video_loss': 0.16835574805736542, 'train/total_loss': 0.1683724820613861}
|
| 16474 |
+
[Rank 3] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
|
| 16475 |
+
[Rank 0] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}[Rank 1] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
|
| 16476 |
+
|
| 16477 |
+
[Rank 2] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
|
| 16478 |
+
{'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10, 'epoch': 1.0}
|
| 16479 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16480 |
+
tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16481 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2963602066040039, 'train/info_loss': 0.15512116253376007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010410062968730928, 'train/video_loss': 0.15501706302165985, 'train/total_loss': 0.451377272605896}
|
| 16482 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16483 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16484 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16485 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16486 |
+
{'train/tv_loss': 0.00016122745582833887, 'train/lm_loss': 1.504364627180621e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.12243026494979858, 'train/uncertainty_loss': -7.243495201691985e-05, 'train/video_loss': 0.12366142123937607, 'train/total_loss': 0.12367646396160126}
|
| 16487 |
+
tensor(0.0812, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16488 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16489 |
+
[Rank 0] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}[Rank 1] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
|
| 16490 |
+
[Rank 2] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
|
| 16491 |
+
|
| 16492 |
+
[Rank 3] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
|
| 16493 |
+
{'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10, 'epoch': 1.0}
|
| 16494 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16495 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16496 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.035262671113014225, 'train/info_loss': 0.207004576921463, 'train/ref_loss': None, 'train/uncertainty_loss': -8.50230921059847e-05, 'train/video_loss': 0.20691955089569092, 'train/total_loss': 0.24218222498893738}
|
| 16497 |
+
tensor(0.1316, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16498 |
+
tensor(0.0744, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16499 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16500 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.08468132019042969, 'train/info_loss': 0.13620348274707794, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001168315066024661, 'train/video_loss': 0.13608665764331818, 'train/total_loss': 0.22076797485351562}
|
| 16501 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16502 |
+
tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16503 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16504 |
+
[Rank 2] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}[Rank 3] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
|
| 16505 |
+
|
| 16506 |
+
[Rank 1] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
|
| 16507 |
+
[Rank 0] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
|
| 16508 |
+
{'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10, 'epoch': 1.0}
|
| 16509 |
+
tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16510 |
+
tensor(0.1708, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16511 |
+
tensor(0.1118, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16512 |
+
{'train/tv_loss': 0.0001698122243396938, 'train/lm_loss': 1.723692112136632e-05, 'train/info_loss': 1.293404056923464e-05, 'train/ref_loss': 0.2866149842739105, 'train/uncertainty_loss': 0.01117597669363022, 'train/video_loss': 0.2991624176502228, 'train/total_loss': 0.29917964339256287}
|
| 16513 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16514 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16515 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3133418560028076, 'train/info_loss': 0.17231473326683044, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012505786726251246, 'train/video_loss': 0.17218968272209167, 'train/total_loss': 0.4855315387248993}
|
| 16516 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16517 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16518 |
+
tensor(0.0181, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16519 |
+
[Rank 2] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
| 16520 |
+
[Rank 3] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
| 16521 |
+
[Rank 1] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
| 16522 |
+
[Rank 0] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
|
| 16523 |
+
{'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10, 'epoch': 1.0}
|
| 16524 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16525 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16526 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16527 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16528 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3884145736694336, 'train/info_loss': 0.23487679660320282, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012862115399912, 'train/video_loss': 0.23474816977977753, 'train/total_loss': 0.6231627464294434}
|
| 16529 |
+
tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16530 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16531 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16532 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.010719782114028931, 'train/info_loss': 0.19857865571975708, 'train/ref_loss': None, 'train/uncertainty_loss': -8.820317452773452e-05, 'train/video_loss': 0.1984904557466507, 'train/total_loss': 0.20921023190021515}
|
| 16533 |
+
tensor(0.0573, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16534 |
+
[Rank 3] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
|
| 16535 |
+
[Rank 1] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}[Rank 0] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
|
| 16536 |
+
[Rank 2] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
|
| 16537 |
+
|
| 16538 |
+
{'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10, 'epoch': 1.0}
|
| 16539 |
+
tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16540 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16541 |
+
{'train/tv_loss': 0.0002269430086016655, 'train/lm_loss': 1.3041071360930802e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.15769097208976746, 'train/uncertainty_loss': -7.11314962245524e-05, 'train/video_loss': 0.15944914519786835, 'train/total_loss': 0.15946218371391296}
|
| 16542 |
+
tensor(0.1142, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16543 |
+
tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16544 |
+
tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16545 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16546 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16547 |
+
{'train/tv_loss': 0.00013790428638458252, 'train/lm_loss': 2.784535172395408e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.2233009785413742, 'train/uncertainty_loss': -7.168206502683461e-05, 'train/video_loss': 0.22434939444065094, 'train/total_loss': 0.22437724471092224}
|
| 16548 |
+
tensor(0.1906, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16549 |
+
[Rank 3] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}[Rank 0] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}[Rank 2] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}
|
| 16550 |
+
|
| 16551 |
+
[Rank 1] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}
|
| 16552 |
+
|
| 16553 |
+
{'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10, 'epoch': 1.0}
|
| 16554 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16555 |
+
tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16556 |
+
tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16557 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.1099284052848816, 'train/info_loss': 0.18190373480319977, 'train/ref_loss': None, 'train/uncertainty_loss': -9.252233430743218e-05, 'train/video_loss': 0.18181121349334717, 'train/total_loss': 0.2917396128177643}
|
| 16558 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16559 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16560 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16561 |
+
tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16562 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16563 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.2040557146072388, 'train/info_loss': 0.19522923231124878, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010584730189293623, 'train/video_loss': 0.19512338936328888, 'train/total_loss': 0.3991791009902954}
|
| 16564 |
+
[Rank 1] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
|
| 16565 |
+
[Rank 0] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}[Rank 3] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
|
| 16566 |
+
[Rank 2] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
|
| 16567 |
+
|
| 16568 |
+
{'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11, 'epoch': 1.0}
|
| 16569 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16570 |
+
tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16571 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16572 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16573 |
+
{'train/tv_loss': 0.00012163397623226047, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.6629419405944645e-05, 'train/ref_loss': 0.0847846046090126, 'train/uncertainty_loss': -7.099361391738057e-05, 'train/video_loss': 0.0857033059000969, 'train/total_loss': 0.08572807908058167}
|
| 16574 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16575 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.30223650932312013, 'train/info_loss': 0.21647511422634125, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013338448479771614, 'train/video_loss': 0.21634173393249512, 'train/total_loss': 0.518578290939331}
|
| 16576 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16577 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16578 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16579 |
+
[Rank 3] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}[Rank 0] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}[Rank 2] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}
|
| 16580 |
+
|
| 16581 |
+
|
| 16582 |
+
{'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11, 'epoch': 1.0}
|
| 16583 |
+
[Rank 1] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}
|
| 16584 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16585 |
+
tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16586 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16587 |
+
{'train/tv_loss': 0.00012253506574779748, 'train/lm_loss': 1.4829085557721555e-05, 'train/info_loss': 1.2755231182381976e-05, 'train/ref_loss': 0.06293053925037384, 'train/uncertainty_loss': -6.927629001438618e-05, 'train/video_loss': 0.06385429948568344, 'train/total_loss': 0.06386912614107132}
|
| 16588 |
+
tensor(0.3175, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16589 |
+
tensor(0.2061, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16590 |
+
tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16591 |
+
tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16592 |
+
{'train/tv_loss': 0.00018769102171063424, 'train/lm_loss': 3.144493966829032e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.18825295567512512, 'train/uncertainty_loss': -6.965706706978381e-05, 'train/video_loss': 0.18970192968845367, 'train/total_loss': 0.1897333711385727}
|
| 16593 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16594 |
+
[Rank 1] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}[Rank 0] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}[Rank 2] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}
|
| 16595 |
+
|
| 16596 |
+
[Rank 3] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}
|
| 16597 |
+
|
| 16598 |
+
{'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11, 'epoch': 1.0}
|
| 16599 |
+
tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16600 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.08768467903137207, 'train/info_loss': 0.19097542762756348, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011287924135103822, 'train/video_loss': 0.19086255133152008, 'train/total_loss': 0.2785472273826599}
|
| 16601 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16602 |
+
tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16603 |
+
tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16604 |
+
tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16605 |
+
tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16606 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.3417798757553101, 'train/info_loss': 0.12373294681310654, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011930712498724461, 'train/video_loss': 0.123613640666008, 'train/total_loss': 0.4653935432434082}
|
| 16607 |
+
tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16608 |
+
tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16609 |
+
[Rank 1] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}[Rank 2] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}[Rank 3] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}
|
| 16610 |
+
|
| 16611 |
+
|
| 16612 |
+
[Rank 0] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}
|
| 16613 |
+
{'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12, 'epoch': 1.0}
|
| 16614 |
+
tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16615 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.22061138153076174, 'train/info_loss': 0.19586126506328583, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013994580367580057, 'train/video_loss': 0.1957213133573532, 'train/total_loss': 0.4163326919078827}
|
| 16616 |
+
tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16617 |
+
tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16618 |
+
tensor(0.3543, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16619 |
+
tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
|
| 16620 |
+
tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
|
| 16621 |
+
{'train/tv_loss': None, 'train/lm_loss': 0.5868082523345948, 'train/info_loss': 0.14800412952899933, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013187677832320334, 'train/video_loss': 0.14787225425243378, 'train/total_loss': 0.7346805334091187}
|
| 16622 |
+
tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
|
| 16623 |
+
tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
|
| 16624 |
+
[Rank 1] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}[Rank 2] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}[Rank 3] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}
|
| 16625 |
+
|
| 16626 |
+
|
| 16627 |
+
[Rank 0] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}
|
| 16628 |
+
{'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0, 'epoch': 1.0}
|
| 16629 |
+
[Rank 1] Trainer log: {'train_runtime': 29823.9547, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}[Rank 2] Trainer log: {'train_runtime': 29822.0432, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}[Rank 3] Trainer log: {'train_runtime': 29817.5825, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}
|
| 16630 |
+
|
| 16631 |
+
|
| 16632 |
+
[Rank 0] Trainer log: {'train_runtime': 29828.2573, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}
|
| 16633 |
+
{'train_runtime': 29828.2573, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'train_loss': 0.11426909348176371, 'epoch': 1.0}
|
| 16634 |
+
Finished TrainingFinished Training
|
| 16635 |
+
Finished Training
|
| 16636 |
+
|