[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
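(Editor's note: these three lines report PyTorch's TF32 switches; a minimal sketch of the standard calls that produce exactly this state:)

```python
import torch

# Enable TF32 tensor cores for fp32 convolutions and matmuls.
torch.set_float32_matmul_precision('high')    # -> get_float32_matmul_precision() == 'high'
torch.backends.cudnn.allow_tf32 = True        # [ conv ]
torch.backends.cuda.matmul.allow_tf32 = True  # [matmul]
```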
[10-29 12:06:17] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[10-29 12:06:17] (/home/user/VAR/train.py , line 38)=> initial args:
{
  data_path : /mnt/localssd/ImageNet2012/
  exp_name : text
  vae_ckpt : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast : 2
  tfast : 2
  depth : 17
  ini : -1
  hd : 0.02
  aln : 0.5
  alng : 0.0001
  fp16 : 1
  tblr : 8e-05
  tlr : 0.00024000000000000003
  twd : 0.05
  twde : 0.05
  tclip : 2.0
  ls : 0.0
  bs : 768
  batch_size : 24
  glb_batch_size : 768
  ac : 1
  ep : 350
  wp : 7.0
  wp0 : 0.005
  wpe : 0.01
  sche : lin0
  opt : adamw
  afuse : True
  saln : False
  anorm : True
  fuse : True
  pn : 1_1_2_3_3_4_5_6_8_11
  patch_size : 11
  patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso : 256
  mid_reso : 1.125
  hflip : False
  workers : 12
  pg : 0.0
  pg0 : 4
  pgwp : 1.1666666666666667
  cmd : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean : None
  acc_tail : None
  L_mean : None
  L_tail : None
  vacc_mean : None
  vacc_tail : None
  vL_mean : None
  vL_tail : None
  grad_norm : None
  cur_lr : None
  cur_wd : None
  cur_it :
  cur_ep :
  remain_time :
  finish_time :
  local_out_dir_path : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32 : True
  seed : None
  codebook_size : 4096
  codebook_embed_dim : 32
  codebook_l2_norm : True
  codebook_show_usage : True
  commit_loss_beta : 0.25
  entropy_loss_ratio : 0.0
  test_model : True
  encoder_ch_mult : [1, 1, 2, 2, 4]
  decoder_ch_mult : [1, 1, 2, 2, 4]
  z_channels : 256
  dropout_p : 0.0
  v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type : dinov2
  dec_type : dinov2
  semantic_guide : dinov2
  num_latent_tokens : 121
  encoder_model : vit_large_patch14_dinov2.lvd142m
  decoder_model : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed : True
  share_quant_resi : 4
  product_quant : 2
  half_sem : True
  p_drop : 0.0
  joint_sample : False
  infer_ckpt :
  masking_method : uniform
  same_seed_for_all_ranks : 0
  local_debug : False
  dbg_nan : False
  cfg : [3.5, 3.5]
  top_k : 900
  top_p : 0.95
  commit_msg : fix bug
  commit_id : d9be612da9c1a0f8350fd7614e16337787b4640e
  branch : main
}
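(Editor's note: tlr appears to be tblr linearly scaled by the global batch size under the usual bs/256 rule; the arithmetic reproduces the value in the dump:)

```python
tblr, glb_batch_size = 8e-05, 768
tlr = tblr * glb_batch_size / 256   # 8e-05 * 3 -> 2.4e-4
print(tlr)                          # ~0.00024, as in "tlr : 0.00024000000000000003" above
```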
[10-29 12:06:17] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   RandomCrop(size=(256, 256), padding=None)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   <function normalize_01_into_pm1 at 0x7f134cae31c0>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   CenterCrop(size=(256, 256))
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 51)=>   <function normalize_01_into_pm1 at 0x7f134cae31c0>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
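(Editor's note: both pipelines match a plain torchvision composition: resize the short side to round(mid_reso * data_load_reso) = round(1.125 * 256) = 288 with Lanczos, crop to 256, then rescale [0, 1] into [-1, 1]. A sketch, assuming normalize_01_into_pm1 is the x * 2 - 1 rescaling its name suggests:)

```python
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode

def normalize_01_into_pm1(x):            # assumed: map ToTensor()'s [0, 1] into [-1, 1]
    return x.mul(2).sub_(1)

mid_reso, reso = 1.125, 256
resize = round(mid_reso * reso)          # 288, as in Resize(size=288) above

train_aug = T.Compose([
    T.Resize(resize, interpolation=InterpolationMode.LANCZOS, antialias=True),
    T.RandomCrop((reso, reso)),
    T.ToTensor(),
    normalize_01_into_pm1,
])
val_aug = T.Compose([
    T.Resize(resize, interpolation=InterpolationMode.LANCZOS, antialias=True),
    T.CenterCrop((reso, reso)),
    T.ToTensor(),
    normalize_01_into_pm1,
])
```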
[10-29 12:06:20] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:06:20] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[10-29 12:06:20] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ... [dataloader multi processing](*) finished! (47.39s)
[dataloader multi processing](*) finished! (47.94s)
[dataloader multi processing](*) finished! (51.25s)
[10-29 12:07:07] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[dataloader multi processing](*) finished! (51.98s)
[10-29 12:07:08] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:07:25] (e/user/VAR/models/var.py, line 103)=>
[constructor] ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ====
[VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))
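(Editor's note: the [drop ratios] tensor is a linear stochastic-depth ramp over the 17 blocks, from 0 at block 0 up to drop_path_rate at the last block, which is why block (0)'s DropPath collapses to Identity() in the module dump below. The printed tensor is reproduced by:)

```python
import torch

depth, drop_path_rate = 17, 0.0708333           # endpoint as logged above
dpr = torch.linspace(0, drop_path_rate, depth)  # one DropPath prob per AdaLNSelfAttn block
print(dpr)  # tensor([0.0000, 0.0044, 0.0089, ..., 0.0664, 0.0708])
```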
[10-29 12:07:11] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1667, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:07:12] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:07:25] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 12:08:29] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)
[10-29 12:08:29] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 12:08:29] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26
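(Editor's note: the dumped sizes are internally consistent: embed_dim=1088 and num_heads=17 follow from depth=17, apparently under a width-64-per-layer rule, and the VAR=375.26 figure is the per-rank numel below divided by 1e6. A quick check:)

```python
depth = 17
embed_dim, num_heads = depth * 64, depth  # 1088 and 17, matching the [VAR config] line
print(f'{375258593 / 1e6:.2f}')           # '375.26' -> [INIT][#para] VAR=375.26
```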
[10-29 12:08:29] (/VAR/utils/lr_control.py, line 99)=> [get_param_groups] param_groups =
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}
[10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=214, numel=375258593
| [10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=214, numel=375258593 | |
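[note] The 'D'/'ND' groups above are a standard decay/no-decay split: matrix weights get wd_sc=1.0, while biases, the scale_mul_1H11 gains, and the positional/level embeddings get wd_sc=0.0. A minimal sketch that reproduces the grouping from the names in the dump (lr_control.py itself is not shown in this log, so the no-decay key list is an inference):
    import torch.nn as nn

    NOWD_KEYS = ('pos_start', 'pos_1LC', 'lvl_embed', 'scale_mul')  # inferred from the dump above

    def split_param_groups(model: nn.Module) -> dict:
        decay, no_decay = [], []
        for name, p in model.named_parameters():
            if not p.requires_grad:
                continue
            # 1-D tensors, biases, and name-matched keys go to the no-decay group
            if p.ndim <= 1 or name.endswith('bias') or any(k in name for k in NOWD_KEYS):
                no_decay.append(p)
            else:
                decay.append(p)
        return {'D':  {'params': decay,    'lr_sc': 1.0, 'wd_sc': 1.0},
                'ND': {'params': no_decay, 'lr_sc': 1.0, 'wd_sc': 0.0}}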
| [10-29 12:07:30] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035 | |
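[note] The printed init_std is consistent with sqrt(1 / embed_dim / 3) for embed_dim=1088 (an inference from the value; var.py's formula is not printed in this log):
    import math
    print(math.sqrt(1 / 1088 / 3))  # 0.0175035..., matching [init_weights] init_std=0.0175035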
| [10-29 12:08:29] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule( | |
| (_orig_mod): VAR( | |
| drop_path_rate=0.0708333 | |
| (word_embed): Linear(in_features=64, out_features=1088, bias=True) | |
| (class_emb): Embedding(1001, 1088) | |
| (lvl_embed): Embedding(10, 1088) | |
| (shared_ada_lin): Identity() | |
| (blocks): ModuleList( | |
| (0): AdaLNSelfAttn( | |
| shared_aln=False | |
| (drop_path): Identity() | |
| (attn): SelfAttention( | |
| (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False) | |
| (proj): Linear(in_features=1088, out_features=1088, bias=True) | |
| (proj_drop): Identity() | |
| ) | |
| (ffn): FFN( | |
| fused_mlp_func=False | |
| (fc1): Linear(in_features=1088, out_features=4352, bias=True) | |
| (act): GELU(approximate='tanh') | |
| (fc2): Linear(in_features=4352, out_features=1088, bias=True) | |
| (drop): Identity() | |
| ) | |
| (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False) | |
| (ada_lin): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1088, out_features=6528, bias=True) | |
| ) | |
| ) | |
| (1-16): 16 x AdaLNSelfAttn( | |
| shared_aln=False | |
| (drop_path): DropPath((drop_prob=...)) | |
| (attn): SelfAttention( | |
| (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False) | |
| (proj): Linear(in_features=1088, out_features=1088, bias=True) | |
| (proj_drop): Identity() | |
| ) | |
| (ffn): FFN( | |
| fused_mlp_func=False | |
| (fc1): Linear(in_features=1088, out_features=4352, bias=True) | |
| (act): GELU(approximate='tanh') | |
| (fc2): Linear(in_features=4352, out_features=1088, bias=True) | |
| (drop): Identity() | |
| ) | |
| (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False) | |
| (ada_lin): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1088, out_features=6528, bias=True) | |
| ) | |
| ) | |
| ) | |
| (head_nm): AdaLNBeforeHead( | |
| (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False) | |
| (ada_lin): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1088, out_features=2176, bias=True) | |
| ) | |
| ) | |
| (head): Linear(in_features=1088, out_features=8192, bias=True) | |
| ) | |
| ) | |
| [10-29 12:08:29] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34 | |
| [10-29 12:08:29] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26 | |
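[note] The two parameter counts in the log agree: the per-rank get_param_groups lines report numel=375258593, and train.py prints the same figure in millions:
    numel = 375_258_593
    print(f'VAR={numel / 1e6:.2f}M')  # VAR=375.26M, as in the [INIT][#para] line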
| [10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=214, numel=375258593 | |
| [10-29 12:08:30] (/VAR/utils/lr_control.py, line 105)=> | |
| [10-29 12:08:30] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0} | |
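[note] Roughly how the logged optimizer line is assembled (a sketch, not the actual train.py): functools.partial pins betas and the fused CUDA kernel, and the lr/weight_decay passed at construction are base values that the warmup/decay schedule overwrites per group each step.
    import functools
    import torch
    import torch.nn as nn

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = nn.Linear(8, 8, device=device)  # stand-in for the compiled VAR
    opt_ctor = functools.partial(torch.optim.AdamW, betas=(0.9, 0.95),
                                 fused=(device == 'cuda'))  # fused AdamW needs CUDA params
    optimizer = opt_ctor(model.parameters(), lr=0.00024000000000000003, weight_decay=0)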
| [10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 12 days, 20:52:31 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.04 (0.04) Acct: 0.02 (0.02) time: 666.2379 data: 0.0006 | |
| [10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 12 days, 20:53:48 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.01 (0.01) Acct: 0.02 (0.02) time: 666.2845 data: 0.0005 | |
| [10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1667] eta: 12 days, 20:31:44 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.04 (0.04) Acct: 0.05 (0.05) time: 666.2895 data: 0.0004 | |
| [10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 12 days, 20:31:03 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.04 (0.04) Acct: 0.03 (0.03) time: 665.4663 data: 0.0006 | |
| [10-29 12:23:32] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 416/1667] eta: 0:45:08 tlr: 9.7e-06 tnm: 0.06 Lm: 8.271 (8.271) Lt: 8.260 (8.260) Accm: 0.08 (0.08) Acct: 0.09 (0.09) time: 0.3480 data: 0.0002 | |
| [10-29 12:23:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 0:45:02 tlr: 9.7e-06 tnm: 0.06 Lm: 8.272 (8.272) Lt: 8.267 (8.267) Accm: 0.08 (0.08) Acct: 0.08 (0.08) time: 0.3481 data: 0.0002 | |
| [10-29 12:23:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 0:45:05 tlr: 9.7e-06 tnm: 0.06 Lm: 8.273 (8.273) Lt: 8.267 (8.267) Accm: 0.09 (0.09) Acct: 0.13 (0.13) time: 0.3481 data: 0.0002 | |
| [10-29 12:23:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 0:45:05 tlr: 9.7e-06 tnm: 0.06 Lm: 8.273 (8.273) Lt: 8.262 (8.262) Accm: 0.07 (0.07) Acct: 0.05 (0.05) time: 0.3481 data: 0.0002 | |
| [10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 833/1667] eta: 0:17:28 tlr: 1.8e-05 tnm: 0.08 Lm: 8.224 (8.207) Lt: 8.203 (8.200) Accm: 0.12 (0.14) Acct: 0.12 (0.15) time: 0.3484 data: 0.0002 | |
| [10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:17:28 tlr: 1.8e-05 tnm: 0.08 Lm: 8.228 (8.214) Lt: 8.206 (8.199) Accm: 0.11 (0.12) Acct: 0.09 (0.11) time: 0.3485 data: 0.0002 | |
| [10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:17:28 tlr: 1.8e-05 tnm: 0.08 Lm: 8.228 (8.214) Lt: 8.216 (8.207) Accm: 0.17 (0.14) Acct: 0.24 (0.21) time: 0.3485 data: 0.0002 | |
| [10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:17:27 tlr: 1.8e-05 tnm: 0.08 Lm: 8.227 (8.213) Lt: 8.216 (8.203) Accm: 0.13 (0.14) Acct: 0.12 (0.15) time: 0.3485 data: 0.0002 | |
| [10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1249/1667] eta: 0:06:38 tlr: 2.7e-05 tnm: 0.14 Lm: 8.152 (8.117) Lt: 8.142 (8.115) Accm: 0.19 (0.29) Acct: 0.20 (0.34) time: 0.3490 data: 0.0002 | |
| [10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:06:38 tlr: 2.7e-05 tnm: 0.12 Lm: 8.162 (8.131) Lt: 8.140 (8.127) Accm: 0.16 (0.25) Acct: 0.16 (0.23) time: 0.3491 data: 0.0002 | |
| [10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:06:38 tlr: 2.7e-05 tnm: 0.12 Lm: 8.162 (8.130) Lt: 8.151 (8.130) Accm: 0.21 (0.25) Acct: 0.30 (0.29) time: 0.3491 data: 0.0002 | |
| [10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:06:38 tlr: 2.7e-05 tnm: 0.12 Lm: 8.161 (8.129) Lt: 8.145 (8.128) Accm: 0.20 (0.27) Acct: 0.22 (0.27) time: 0.3491 data: 0.0005 | |
| [10-29 12:30:47] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1666/1667] eta: 0:00:00 tlr: 3.5e-05 tnm: 0.35 Lm: 8.081 (8.025) Lt: 8.081 (8.007) Accm: 0.26 (0.39) Acct: 0.28 (0.45) time: 0.3499 data: 0.0014 | |
| [10-29 12:30:47] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [ 0/350] Total time: 0:22:17 (0.803 s / it) | |
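[note] The ~666 s first iteration (vs ~0.35 s/it afterwards) is the pattern torch.compile warmup produces on the first step, which is also why the iteration-0 ETA of "12 days" collapses to ~45 min once compiled kernels are cached. A rough consistency check against the "Total time: 0:22:17 (0.803 s / it)" line:
    iters, first_it, steady = 1669, 666.2, 0.349
    total = first_it + (iters - 1) * steady
    print(total, total / iters)  # ~1248 s, ~0.75 s/it; the logged 1337 s / 0.803 s/it
                                 # includes overhead not visible in these lines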
| ======================================================= RESTART [10-29 12:58:25] ======================================================= | |
| ======================================================= RESTART [10-29 12:58:25] ======================================================= | |
| ======================================================= RESTART [10-29 12:58:25] ======================================================= | |
| ======================================================= RESTART [10-29 12:58:25] ======================================================= | |
| [10-29 12:58:25] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high | |
| [10-29 12:58:25] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True | |
| [10-29 12:58:25] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True | |
| [10-29 12:58:27] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24 | |
| [10-29 12:58:27] (/home/user/VAR/train.py , line 38)=> initial args: | |
| { | |
| data_path : /mnt/localssd/ImageNet2012/ | |
| exp_name : text | |
| vae_ckpt : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt | |
| vfast : 2 | |
| tfast : 2 | |
| depth : 17 | |
| ini : -1 | |
| hd : 0.02 | |
| aln : 0.5 | |
| alng : 0.0001 | |
| fp16 : 1 | |
| tblr : 8e-05 | |
| tlr : 0.00024000000000000003 | |
| twd : 0.05 | |
| twde : 0.05 | |
| tclip : 2.0 | |
| ls : 0.0 | |
| bs : 768 | |
| batch_size : 24 | |
| glb_batch_size : 768 | |
| ac : 1 | |
| ep : 350 | |
| wp : 7.0 | |
| wp0 : 0.005 | |
| wpe : 0.01 | |
| sche : lin0 | |
| opt : adamw | |
| afuse : True | |
| saln : False | |
| anorm : True | |
| fuse : True | |
| pn : 1_1_2_3_3_4_5_6_8_11 | |
| patch_size : 11 | |
| patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11) | |
| resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121) | |
| data_load_reso : 256 | |
| mid_reso : 1.125 | |
| hflip : False | |
| workers : 12 | |
| pg : 0.0 | |
| pg0 : 4 | |
| pgwp : 1.1666666666666667 | |
| cmd : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True | |
| acc_mean : None | |
| acc_tail : None | |
| L_mean : None | |
| L_tail : None | |
| vacc_mean : None | |
| vacc_tail : None | |
| vL_mean : None | |
| vL_tail : None | |
| grad_norm : None | |
| cur_lr : None | |
| cur_wd : None | |
| cur_it : | |
| cur_ep : | |
| remain_time : | |
| finish_time : | |
| local_out_dir_path : /sensei-fs/users/xiangl/exp113_d17/ | |
| tb_log_dir_path : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05 | |
| log_txt_path : /sensei-fs/users/xiangl/exp113_d17/log.txt | |
| last_ckpt_path : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth | |
| tf32 : True | |
| seed : None | |
| codebook_size : 4096 | |
| codebook_embed_dim : 32 | |
| codebook_l2_norm : True | |
| codebook_show_usage : True | |
| commit_loss_beta : 0.25 | |
| entropy_loss_ratio : 0.0 | |
| test_model : True | |
| encoder_ch_mult : [1, 1, 2, 2, 4] | |
| decoder_ch_mult : [1, 1, 2, 2, 4] | |
| z_channels : 256 | |
| dropout_p : 0.0 | |
| v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11] | |
| enc_type : dinov2 | |
| dec_type : dinov2 | |
| semantic_guide : dinov2 | |
| num_latent_tokens : 121 | |
| encoder_model : vit_large_patch14_dinov2.lvd142m | |
| decoder_model : vit_large_patch14_dinov2.lvd142m | |
| abs_pos_embed : True | |
| share_quant_resi : 4 | |
| product_quant : 2 | |
| half_sem : True | |
| p_drop : 0.0 | |
| joint_sample : False | |
| infer_ckpt : | |
| masking_method : uniform | |
| same_seed_for_all_ranks: 0 | |
| local_debug : False | |
| dbg_nan : False | |
| cfg : [3.5, 3.5] | |
| top_k : 900 | |
| top_p : 0.95 | |
| branch : main | |
| commit_msg : fix bug | |
| commit_id : d9be612da9c1a0f8350fd7614e16337787b4640e | |
| } | |
| [10-29 12:58:27] (/home/user/VAR/train.py , line 42)=> [build PT data] ... | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000 | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 48)=> Transform [train] = | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True) | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None) | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> ToTensor() | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f3ce89a7370> | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 54)=> --------------------------- | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 48)=> Transform [val] = | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True) | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256)) | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> ToTensor() | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f3ce89a7370> | |
| [10-29 12:58:30] (e/user/VAR/utils/data.py, line 54)=> --------------------------- | |
| [10-29 12:58:30] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth | |
| [10-29 12:58:30] (/home/user/VAR/train.py , line 65)=> [auto_resume quit] | |
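[note] A sketch of the auto-resume probe implied by the two lines above (train.py's real logic is not shown; names here are assumptions):
    import glob, os

    ckpts = sorted(glob.glob('/sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth'),
                   key=os.path.getmtime)
    if not ckpts:
        print('[auto_resume] no ckpt found, training from scratch')
    # otherwise: torch.load the newest one and restore model/optimizer/epoch state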
[10-29 12:58:30] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ... [dataloader multi processing](*) finished! (46.34s) | |
| [dataloader multi processing](*) finished! (47.47s) | |
| [dataloader multi processing](*) finished! (47.76s) | |
| [dataloader multi processing](*) finished! (51.68s) | |
| [10-29 12:59:16] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder') | |
| [10-29 12:59:29] (e/user/VAR/models/var.py, line 103)=> | |
| [constructor] ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== | |
| [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0 | |
| [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354, | |
| 0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708])) | |
| [10-29 12:59:18] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder') | |
| [10-29 12:59:17] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1667, types(tr, va)=('DatasetFolder', 'DatasetFolder') | |
| [10-29 12:59:22] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder') | |
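[note] Why one rank reports iters_train=1667 while the others report 1669: iterations per epoch are consistent with ceil(len(train_set) / glb_batch_size), and that rank's otherwise identical startup dump reported len(train_set)=1279867 instead of 1281167:
    import math
    for n in (1281167, 1279867):
        print(math.ceil(n / 768))  # 1669, 1667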
| [10-29 12:59:31] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035 | |
| [10-29 13:00:35] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule( | |
| (_orig_mod): VAR( | |
| drop_path_rate=0.0708333 | |
| (word_embed): Linear(in_features=64, out_features=1088, bias=True) | |
| (class_emb): Embedding(1001, 1088) | |
| (lvl_embed): Embedding(10, 1088) | |
| (shared_ada_lin): Identity() | |
| (blocks): ModuleList( | |
| (0): AdaLNSelfAttn( | |
| shared_aln=False | |
| (drop_path): Identity() | |
| (attn): SelfAttention( | |
| (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False) | |
| (proj): Linear(in_features=1088, out_features=1088, bias=True) | |
| (proj_drop): Identity() | |
| ) | |
| (ffn): FFN( | |
| fused_mlp_func=False | |
| (fc1): Linear(in_features=1088, out_features=4352, bias=True) | |
| (act): GELU(approximate='tanh') | |
| (fc2): Linear(in_features=4352, out_features=1088, bias=True) | |
| (drop): Identity() | |
| ) | |
| (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False) | |
| (ada_lin): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1088, out_features=6528, bias=True) | |
| ) | |
| ) | |
| (1-16): 16 x AdaLNSelfAttn( | |
| shared_aln=False | |
| (drop_path): DropPath((drop_prob=...)) | |
| (attn): SelfAttention( | |
| (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False) | |
| (proj): Linear(in_features=1088, out_features=1088, bias=True) | |
| (proj_drop): Identity() | |
| ) | |
| (ffn): FFN( | |
| fused_mlp_func=False | |
| (fc1): Linear(in_features=1088, out_features=4352, bias=True) | |
| (act): GELU(approximate='tanh') | |
| (fc2): Linear(in_features=4352, out_features=1088, bias=True) | |
| (drop): Identity() | |
| ) | |
| (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False) | |
| (ada_lin): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1088, out_features=6528, bias=True) | |
| ) | |
| ) | |
| ) | |
| (head_nm): AdaLNBeforeHead( | |
| (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False) | |
| (ada_lin): Sequential( | |
| (0): SiLU() | |
| (1): Linear(in_features=1088, out_features=2176, bias=True) | |
| ) | |
| ) | |
| (head): Linear(in_features=1088, out_features=8192, bias=True) | |
| ) | |
| ) | |
| [10-29 13:00:35] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34 | |
| [10-29 13:00:35] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26 | |
| [10-29 13:00:35] (/VAR/utils/lr_control.py, line 99)=> [get_param_groups] param_groups = | |
| { 'D': { 'lr_sc': 1.0, | |
| 'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n" | |
| " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')", | |
| 'wd_sc': 1.0}, | |
| 'ND': { 'lr_sc': 1.0, | |
| 'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n" | |
| " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n" | |
| " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n" | |
| " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n" | |
| " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n" | |
| " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n" | |
| " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n" | |
| " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n" | |
| " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n" | |
| " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n" | |
| " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n" | |
| " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n" | |
| " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n" | |
| " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n" | |
| " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n" | |
| " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n" | |
| " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n" | |
| " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n" | |
| " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n" | |
| " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n" | |
| " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n" | |
| " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n" | |
| " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n" | |
| " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n" | |
| " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')", | |
| 'wd_sc': 0.0}} | |
| [10-29 13:00:35] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=214, numel=375258593 | |
| " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n" | |
| " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n" | |
| " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n" | |
| " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n" | |
| " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n" | |
| " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n" | |
| " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n" | |
| " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n" | |
| " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n" | |
| " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n" | |
| " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n" | |
| " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n" | |
| " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n" | |
| " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n" | |
| " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n" | |
| " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n" | |
| " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n" | |
| " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n" | |
| " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')", | |
| 'wd_sc': 0.0}} | |
| [10-29 13:00:35] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=214, numel=375258593 | |
| [10-29 13:00:35] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=214, numel=375258593 | |
| [10-29 13:00:36] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=214, numel=375258593 | |
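All ranks report the same count=214 tensors and numel=375,258,593, i.e. the 375.26 M printed by [INIT][#para]. The dump shows the standard decay split: matrix weights go to 'D' with 'wd_sc': 1.0, while biases, pos_start/pos_1LC, lvl_embed.weight, scale_mul_1H11 and the q/v biases go to 'ND' with 'wd_sc': 0.0. A minimal sketch of how such groups are usually built; lr_control's actual code is not shown in this log, and since the printed 'ND' group also contains multi-dimensional tensors, the rule below has to match names as well as dimensionality:

```python
import torch.nn as nn

# name suffixes taken from the 'ND' group printed above
NO_DECAY_SUFFIXES = ('bias', 'pos_start', 'pos_1LC', 'lvl_embed.weight', 'scale_mul_1H11')

def sketch_param_groups(model: nn.Module, weight_decay: float = 0.05):
    """Split trainable parameters into a decayed ('D', wd_sc=1.0) and an
    undecayed ('ND', wd_sc=0.0) group, mirroring the dump above."""
    decay, no_decay = [], []
    for name, p in model.named_parameters():
        if not p.requires_grad:
            continue
        if p.ndim < 2 or name.endswith(NO_DECAY_SUFFIXES):
            no_decay.append(p)    # biases, gains, positional tensors
        else:
            decay.append(p)       # linear/embedding weight matrices
    return [
        {'params': decay, 'weight_decay': weight_decay},
        {'params': no_decay, 'weight_decay': 0.0},
    ]
```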
| [10-29 13:00:36] (/VAR/utils/lr_control.py, line 105)=> | |
| [10-29 13:00:36] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0} | |
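The optimizer is handed around as a functools.partial factory: betas and the fused flag are frozen at construction, and lr / weight_decay arrive later via opt_kw when the factory is applied to the parameter groups. A minimal reproduction of the pattern with a stand-in model (the run itself uses fused=True, which requires CUDA-resident parameters):

```python
import functools
import torch
import torch.nn as nn

# factory with the fixed hyper-parameters from the [INIT] line above
opt_clz = functools.partial(torch.optim.AdamW, betas=(0.9, 0.95), fused=False)

model = nn.Linear(64, 1088)                # stand-in for the compiled VAR model
optimizer = opt_clz(model.parameters(), lr=2.4e-4, weight_decay=0)
print(optimizer.defaults['betas'])          # (0.9, 0.95)
```

The top-level weight_decay=0 in opt_kw is consistent with per-group decay: torch.optim lets each param group override the defaults, which is presumably how the 'D'/'ND' wd_sc split takes effect.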
| [10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 4 days, 5:18:02 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.04 (0.04) Acct: 0.03 (0.03) time: 218.5037 data: 0.0006 | |
| [10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 4 days, 5:40:05 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.01 (0.01) Acct: 0.02 (0.02) time: 219.2963 data: 0.0005 | |
| [10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 4 days, 5:40:06 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.02 (0.02) Acct: 0.00 (0.00) time: 219.2969 data: 0.0005 | |
| [10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1667] eta: 4 days, 5:34:41 tlr: 1.2e-06 tnm: 0.06 Lm: 8.318 (8.318) Lt: 8.318 (8.318) Accm: 0.03 (0.03) Acct: 0.02 (0.02) time: 219.3650 data: 0.0006 | |
| [10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 416/1667] eta: 0:22:50 tlr: 9.7e-06 tnm: 0.06 Lm: 8.271 (8.271) Lt: 8.262 (8.262) Accm: 0.09 (0.09) Acct: 0.11 (0.11) time: 0.3478 data: 0.0002 | |
| [10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 0:22:49 tlr: 9.7e-06 tnm: 0.05 Lm: 8.274 (8.274) Lt: 8.268 (8.268) Accm: 0.08 (0.08) Acct: 0.04 (0.04) time: 0.3479 data: 0.0002 | |
| [10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 0:22:49 tlr: 9.7e-06 tnm: 0.05 Lm: 8.268 (8.268) Lt: 8.258 (8.258) Accm: 0.11 (0.11) Acct: 0.13 (0.13) time: 0.3479 data: 0.0002 | |
| [10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 0:22:47 tlr: 9.7e-06 tnm: 0.05 Lm: 8.268 (8.268) Lt: 8.258 (8.258) Accm: 0.08 (0.08) Acct: 0.08 (0.08) time: 0.3479 data: 0.0002 | |
| [10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 833/1667] eta: 0:10:02 tlr: 1.8e-05 tnm: 0.08 Lm: 8.224 (8.209) Lt: 8.207 (8.199) Accm: 0.15 (0.13) Acct: 0.21 (0.18) time: 0.3485 data: 0.0002 | |
| [10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:10:01 tlr: 1.8e-05 tnm: 0.08 Lm: 8.219 (8.211) Lt: 8.199 (8.196) Accm: 0.13 (0.13) Acct: 0.12 (0.15) time: 0.3487 data: 0.0002 | |
| [10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:10:02 tlr: 1.8e-05 tnm: 0.08 Lm: 8.230 (8.210) Lt: 8.219 (8.200) Accm: 0.15 (0.15) Acct: 0.09 (0.16) time: 0.3487 data: 0.0002 | |
| [10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:10:02 tlr: 1.8e-05 tnm: 0.08 Lm: 8.218 (8.212) Lt: 8.198 (8.199) Accm: 0.20 (0.18) Acct: 0.24 (0.21) time: 0.3487 data: 0.0002 | |
| [10-29 13:13:02] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1249/1667] eta: 0:04:09 tlr: 2.7e-05 tnm: 0.13 Lm: 8.154 (8.118) Lt: 8.140 (8.114) Accm: 0.18 (0.30) Acct: 0.27 (0.32) time: 0.3483 data: 0.0002 | |
| [10-29 13:13:03] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:04:09 tlr: 2.7e-05 tnm: 0.20 Lm: 8.159 (8.129) Lt: 8.140 (8.125) Accm: 0.26 (0.31) Acct: 0.30 (0.35) time: 0.3485 data: 0.0002 | |
| [10-29 13:13:03] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:04:09 tlr: 2.7e-05 tnm: 0.20 Lm: 8.157 (8.122) Lt: 8.136 (8.119) Accm: 0.18 (0.27) Acct: 0.22 (0.25) time: 0.3485 data: 0.0002 | |
| [10-29 13:13:03] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:04:09 tlr: 2.7e-05 tnm: 0.20 Lm: 8.156 (8.125) Lt: 8.140 (8.123) Accm: 0.22 (0.26) Acct: 0.24 (0.25) time: 0.3485 data: 0.0002 | |
| [10-29 13:15:28] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1666/1667] eta: 0:00:00 tlr: 3.5e-05 tnm: 0.35 Lm: 8.085 (8.022) Lt: 8.074 (8.004) Accm: 0.20 (0.44) Acct: 0.33 (0.50) time: 0.3518 data: 0.0016 | |
| [10-29 13:15:28] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [ 0/350] Total time: 0:14:51 (0.535 s / it) | |
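Several ranks print interleaved progress, which is why each [Ep] step appears up to four times with slightly different running statistics. Reading one line: tlr is the current (still warming-up) transformer learning rate, tnm the gradient norm, and Lm/Lt and Accm/Acct look like mean and tail (final-scale) loss/accuracy, each shown as "recent (running average)"; those field meanings are inferred from the values, not from misc.py. The ~219 s first iteration followed by ~0.35 s/it is the usual torch.compile warm-up signature for an OptimizedModule. A throwaway parser for these lines:

```python
import re

EP_RE = re.compile(
    r"\[Ep\]: \[ *(?P<ep>\d+)/(?P<eps>\d+)\] +\[ *(?P<it>\d+)/(?P<its>\d+)\].*?"
    r"tlr: (?P<tlr>[\d.e+-]+) +tnm: (?P<tnm>[\d.]+) +"
    r"Lm: (?P<Lm>[\d.]+) \((?P<Lm_avg>[\d.]+)\) +"
    r"Lt: (?P<Lt>[\d.]+) \((?P<Lt_avg>[\d.]+)\)"
)

def parse_ep(line: str) -> dict | None:
    """Extract step counters, lr, grad norm and losses from one [Ep] log line."""
    m = EP_RE.search(line)
    return {k: float(v) for k, v in m.groupdict().items()} if m else None

sample = ("[Ep]: [  0/350] [1666/1667]  eta: 0:00:00  tlr: 3.5e-05  tnm: 0.35  "
          "Lm: 8.085 (8.022)  Lt: 8.074 (8.004)  Accm: 0.20 (0.44)  Acct: 0.33 (0.50)")
print(parse_ep(sample))
```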