from transformers import CLIPVisionConfig, PretrainedConfig, Qwen2Config


class InfMLLMUnifiedHDChatConfig(PretrainedConfig):
    """Configuration for the InfMLLM unified HD chat model: bundles the CLIP
    vision encoder config, the Qwen2 language model config, and the LoRA,
    image/video encoder, and conversation hyperparameters."""

    def __init__(
        self,
        vision_config=None,
        lm_config=None,
        lm_model="",
        lm_tokenizer="",
        lora_modules="",
        lora_llm=False,
        lora_r=128,
        lora_alpha=256,
        lora_dropout=0,
        encoder_img="",
        image_size_img=336,
        lora_encoder_img=False,
        hd_num=9,
        encoder_video="",
        max_txt_len=4096,
        conv_style="qwen-7b-chat",
        precision="bf16",
        **kwargs,
    ):
        # Language model checkpoint/tokenizer paths and LoRA hyperparameters.
        self.lm_model = lm_model
        self.lm_tokenizer = lm_tokenizer
        self.lora_modules = lora_modules
        self.lora_llm = lora_llm
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout

        # Image encoder settings: checkpoint, input resolution, optional
        # LoRA, and the HD tiling count.
        self.encoder_img = encoder_img
        self.image_size_img = image_size_img
        self.lora_encoder_img = lora_encoder_img
        self.hd_num = hd_num

        # Video encoder checkpoint.
        self.encoder_video = encoder_video

        # Text length limit, conversation template, and compute precision.
        self.max_txt_len = max_txt_len
        self.conv_style = conv_style
        self.precision = precision

        # Sub-configs may arrive as plain dicts (e.g. when reloaded from a
        # serialized config); promote them to their proper config classes.
        if isinstance(vision_config, dict):
            self.vision_config = CLIPVisionConfig(**vision_config)
        else:
            self.vision_config = vision_config

        if isinstance(lm_config, dict):
            self.lm_config = Qwen2Config(**lm_config)
        else:
            self.lm_config = lm_config

        super().__init__(**kwargs)
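

# Usage sketch (illustrative, not part of the original module): builds the
# config with dict-form sub-configs, which __init__ promotes to
# CLIPVisionConfig / Qwen2Config instances. The checkpoint name and
# hyperparameter values below are assumptions for demonstration only.
if __name__ == "__main__":
    config = InfMLLMUnifiedHDChatConfig(
        vision_config=CLIPVisionConfig().to_dict(),  # dict form, as if loaded from JSON
        lm_config=Qwen2Config().to_dict(),
        lm_model="Qwen/Qwen2-7B-Instruct",  # assumed checkpoint name
        lora_llm=True,
        hd_num=9,
        precision="bf16",
    )
    # The dict sub-configs were rebuilt into typed config objects.
    assert isinstance(config.vision_config, CLIPVisionConfig)
    assert isinstance(config.lm_config, Qwen2Config)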