| train_file: /home/share/jiaofangkai/wiki_erica_path/v7/union/train_distant.path_v7.train.0.pkl | |
| dev_file: /home/share/jiaofangkai/wiki_erica_path/v7/union/train_distant.path_v7.dev.pkl | |
| test_file: null | |
| model: | |
| _target_: models.albert_baseline.AlbertForMultipleChoicePreTrain.from_pretrained | |
| mlp_hidden_size: 8192 | |
| read_tensor: | |
| _target_: dataset.wiki_entity_path_v8_2.convert_examples_into_features | |
| max_neg_num: 3 | |
| aug_num: 1 | |
| max_seq_length: 256 | |
| shuffle_context: true | |
| min_rep_num: 5 | |
| geo_p: 0.4 | |
| deduct_ratio: 1.0 | |
| context_ratio: 1.0 | |
| num_workers: 64 | |
| extended_vocab: null | |
| collator: | |
| _target_: dataset.wiki_entity_path_v8.WikiPathDatasetCollatorWithContext | |
| max_seq_length: 256 | |
| tokenizer: pretrained-models/albert-xxlarge-v2 | |
| mlm_probability: 0.15 | |
| max_option_num: 4 | |
| swap: true | |
| num_workers: 8 | |
| prefetch_factor: 2 | |
| model_name_or_path: pretrained-models/albert-xxlarge-v2 | |
| pretrain: null | |
| output_dir: experiments/albert.xxlarge.path.v7_v8.2.2.1aug.ctx.TeslaT4 | |
| do_train: true | |
| evaluate_during_training: true | |
| do_eval: false | |
| eval_sub_path: null | |
| do_preprocess: false | |
| per_gpu_train_batch_size: 1 | |
| per_gpu_eval_batch_size: 1 | |
| learning_rate: 5.0e-05 | |
| gradient_accumulation_steps: 2048 | |
| weight_decay: 0.01 | |
| adam_epsilon: 1.0e-06 | |
| adam_betas: (0.9, 0.98) | |
| max_grad_norm: 5.0 | |
| num_train_epochs: 1 | |
| max_steps: 100 | |
| warmup_proportion: 0.2 | |
| warmup_steps: 0 | |
| optimizer: lamb | |
| use_nvlamb: true | |
| logging_steps: 1 | |
| save_steps: 50 | |
| eval_steps: 50 | |
| no_cuda: false | |
| seed: 42 | |
| local_rank: 0 | |
| fp16: true | |
| fp16_opt_level: O1 | |
| reshard_after_forward: false | |
| cpu_offload: false | |
| move_grads_to_cpu: false | |
| move_params_to_cpu: false | |
| n_gpu: 1 | |
| device: cuda:0 | |
| train_batch_size: 1 | |
| eval_batch_size: 1 | |
| note: null | |