pi-cotraining / config.json
shrg7's picture
Upload folder using huggingface_hub
7273038 verified
{
"_name_or_path": "final-hf/paligemma-3b-pt-224-main",
"architectures": [
"PaliGemmaForConditionalGeneration"
],
"bos_token_id": 2,
"eos_token_id": 1,
"hidden_size": 2048,
"ignore_index": -100,
"image_token_index": 257152,
"model_type": "paligemma",
"pad_token_id": 0,
"projection_dim": 2048,
"text_config": {
"hidden_size": 2048,
"intermediate_size": 16384,
"model_type": "gemma",
"num_attention_heads": 8,
"num_hidden_layers": 18,
"num_image_tokens": 256,
"num_key_value_heads": 1,
"torch_dtype": "float32",
"vocab_size": 257216
},
"torch_dtype": "float32",
"transformers_version": "4.41.0.dev0",
"vision_config": {
"hidden_size": 1152,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"num_image_tokens": 256,
"patch_size": 14,
"projection_dim": 2048,
"projector_hidden_act": "gelu_fast",
"vision_use_head": false
},
"vocab_size": 257216,
"model": {
"llm_backbone_id": "gemma-2b",
"llm_max_length": 2048,
"model_id": "paligemma-224px+3b",
"type": "paligemma-224px+3b",
"vision_backbone_id": "siglip-vit-so400m",
"arch_specifier": "linear",
"image_resize_strategy": "resize-naive",
"reduce_in_full_precision": false,
"align_epochs": 1,
"align_global_batch_size": 256,
"align_learning_rate": 0.001,
"align_lr_scheduler_type": "linear-warmup+cosine-decay",
"align_max_grad_norm": 1.0,
"align_max_steps": null,
"align_per_device_batch_size": 16,
"align_train_strategy": "fsdp-shard-grad-op",
"align_warmup_ratio": 0.03,
"align_weight_decay": 0.0,
"enable_gradient_checkpointing": true,
"enable_mixed_precision_training": true
}
}