---
license: apache-2.0
datasets:
- Inst-IT/Inst-IT-Dataset
- lmms-lab/LLaVA-NeXT-Data
language:
- en
metrics:
- accuracy
base_model:
- liuhaotian/llava-v1.6-vicuna-7b
pipeline_tag: video-text-to-text
tags:
- multimodal
- fine-grained
- instance-understanding
model-index:
- name: LLaVA-Next-Inst-It-Vicuna-7B
  results:
  - task:
      type: multimodal
    dataset:
      name: Inst-IT-Bench-I-OE
      type: Open-Ended
    metrics:
    - type: accuracy
      value: 68.6
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: Inst-IT-Bench-I-MC
      type: Multi-Choice
    metrics:
    - type: accuracy
      value: 63
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: AI2D
      type: ai2d
    metrics:
    - type: accuracy
      value: 71
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: MMMU
      type: mmmu
    metrics:
    - type: accuracy
      value: 37.4
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: POPE
      type: pope
    metrics:
    - type: accuracy
      value: 87.2
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: GQA
      type: gqa
    metrics:
    - type: accuracy
      value: 65.9
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: MM-Vet
      type: mm-vet
    metrics:
    - type: accuracy
      value: 38.1
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: Inst-IT-Bench-V-OE
      type: Open-Ended
    metrics:
    - type: accuracy
      value: 49.3
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: Inst-IT-Bench-V-MC
      type: Multi-Choice
    metrics:
    - type: accuracy
      value: 42.1
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: ActNet-QA
      type: actnet-qa
    metrics:
    - type: accuracy
      value: 53.7
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: EgoSchema
      type: egoschema
    metrics:
    - type: accuracy
      value: 57.8
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: NextQA
      type: nextqa
    metrics:
    - type: accuracy
      value: 70.2
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: VideoMME
      type: videomme
    metrics:
    - type: accuracy
      value: 44.3
      name: accuracy
      verified: true
  - task:
      type: multimodal
    dataset:
      name: TempoCompass
      type: tempocompass
    metrics:
    - type: accuracy
      value: 59.8
      name: accuracy
      verified: true
---

# LLaVA-Next-Inst-It-Vicuna-7B
[**Homepage**](https://inst-it.github.io/) | [**Code**](https://github.com/inst-it/inst-it) | [**Paper**](https://huggingface.co/papers/2412.03565) | [**arXiv**](https://arxiv.org/abs/2412.03565)

LLaVA-Next-Inst-It-Vicuna-7B is a multimodal model that excels at instance-level understanding. It was introduced in the paper [Inst-IT: Boosting Multimodal Instance Understanding via Explicit Visual Prompt Instruction Tuning](https://huggingface.co/papers/2412.03565).

* **Architecture**: clip-vit-large-patch14-336 + Vicuna-7B
* **Initialized Model**: LLaVA-NeXT
* **Data**: LLaVA-NeXT-Data / Inst-IT-Dataset
* **Precision**: bfloat16

## Quick Start
**Install**

Our code is based on LLaVA-NeXT. Before running, please install LLaVA-NeXT to set up the environment:
```shell
pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
```
**Load Model**
```python
from llava.model.builder import load_pretrained_model
from llava.constants import (
    DEFAULT_IMAGE_TOKEN,
    IMAGE_TOKEN_INDEX,
)
from llava.mm_utils import (
    KeywordsStoppingCriteria,
    get_model_name_from_path,
    tokenizer_image_token,
    process_images
)
from llava.conversation import SeparatorStyle, conv_templates

overwrite_config = {}
overwrite_config["mm_spatial_pool_stride"] = 2
overwrite_config["mm_spatial_pool_mode"] = 'bilinear'
overwrite_config["mm_pooling_position"] = 'after'
overwrite_config["mm_newline_position"] = 'no_token'

model_path = "Inst-IT/LLaVA-Next-Inst-It-Vicuna-7B"
model_name = get_model_name_from_path(model_path)

tokenizer, model, image_processor, max_length = load_pretrained_model(
    model_path=model_path,
    model_base=None,
    model_name=model_name,
    device_map="auto",
    torch_dtype='bfloat16',
    overwrite_config=overwrite_config,
    attn_implementation='sdpa')
```
**Image Inference**

<details>
<summary>Inference without SoMs</summary>

Our model can perform inference on images without [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts. In this case, it can be used in the same way as its base model, [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT).

```python
import torch
import requests
from PIL import Image

img_url = "https://github.com/inst-it/inst-it/blob/main/assets/demo/image.jpg?raw=true"
image = Image.open(requests.get(img_url, stream=True).raw)
image_tensor = process_images([image], image_processor, model.config).bfloat16()
image_sizes = [image.size]

question = "Describe this image."
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=image_tensor,
        attention_mask=attention_masks,
        modalities="image",
        image_sizes=image_sizes,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>

<details>
<summary>Inference with SoMs</summary>

Our model performs more fine-grained understanding when [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts are provided.
You can refer to the instances that you are interested in using their IDs.
Compared to the previous inference code, the following code has no modifications except for the input image, which is visually prompted with Set-of-Marks.
Refer to [SoM](https://github.com/microsoft/SoM) to learn how to generate SoMs for an image.
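As a simplified illustration of what such a marked image looks like, the sketch below overlays numeric IDs on hand-specified bounding boxes using PIL. This is not the official SoM pipeline (which derives regions automatically with segmentation models); the file path and boxes are hypothetical.

```python
# Illustration only: overlay numeric instance IDs on an image, assuming you
# already have bounding boxes. The official SoM pipeline linked above obtains
# the regions automatically with segmentation models.
from PIL import Image, ImageDraw

def mark_instances(image, boxes):
    """Draw a numbered box for each (x0, y0, x1, y1) in `boxes`."""
    marked = image.copy()
    draw = ImageDraw.Draw(marked)
    for idx, (x0, y0, x1, y1) in enumerate(boxes, start=1):
        draw.rectangle([x0, y0, x1, y1], outline="red", width=3)
        draw.text((x0 + 4, y0 + 4), str(idx), fill="red")
    return marked

src = Image.open("your_image.jpg")  # hypothetical local image
image_som = mark_instances(src, [(50, 60, 200, 240), (260, 80, 420, 300)])  # hypothetical boxes
image_som.save("image_som.jpg")
```

The inference code itself is unchanged; you simply pass the marked image instead of the original one, as in the demo below.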
					
						

```python
import torch
import requests
from PIL import Image

img_url = "https://github.com/inst-it/inst-it/blob/main/assets/demo/image_som.jpg?raw=true"
image = Image.open(requests.get(img_url, stream=True).raw)
image_tensor = process_images([image], image_processor, model.config).bfloat16()
image_sizes = [image.size]

# You can use [id] to refer to the instances that you are interested in
question = "Describe [8] in detail."
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=image_tensor,
        attention_mask=attention_masks,
        modalities="image",
        image_sizes=image_sizes,
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>

**Video Inference**

For videos, we organize the frames into a list. You can use the format \<t\> to refer to a specific timestamp (e.g. \<1\> for the first frame).
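The demos below download eight pre-extracted frames. If you start from a raw video file instead, a minimal sketch like the following can sample frames evenly into such a list (OpenCV and the local path are assumptions, not requirements of the model):

```python
# Minimal sketch: evenly sample N frames from a local video into a list of
# PIL images, matching the frame-list format used below. OpenCV and the
# file path are assumptions, not part of the original instructions.
import cv2
from PIL import Image

def sample_frames(video_path, num_frames=8):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = [round(i * (total - 1) / max(num_frames - 1, 1)) for i in range(num_frames)]
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            break
        # OpenCV returns BGR arrays; convert to RGB PIL images.
        frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    cap.release()
    return frames

video_frames = sample_frames("your_video.mp4", num_frames=8)  # hypothetical path
```

The resulting list of PIL images can then be passed to `image_processor.preprocess` exactly as in the examples below.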
					
						

<details>
<summary>Inference without SoMs</summary>

Our model can perform inference on videos without [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts. In this case, it can be used in the same way as its base model, [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT).

```python
import torch
import requests
from PIL import Image

frame_urls = [
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_1.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_2.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_3.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_4.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_5.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_6.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_7.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/frame_8.jpg?raw=true"
]
video = [Image.open(requests.get(frame_url, stream=True).raw) for frame_url in frame_urls]
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda()
video = video.bfloat16()
videos = [video]

question = "Describe the video."  # overall video caption
question = "What happens at frame <1>?"  # caption a specific moment
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=videos,
        attention_mask=attention_masks,
        modalities="video",
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>

<details>
<summary>Inference with SoMs</summary>

Our model performs more fine-grained understanding when [Set-of-Marks](https://arxiv.org/abs/2310.11441) visual prompts are provided.
You can refer to the instances that you are interested in using their IDs.
Compared to the previous inference code, the following code has no modifications except for the input video, which is visually prompted with Set-of-Marks.
Refer to [SAM2](https://github.com/facebookresearch/sam2) and [SoM](https://github.com/microsoft/SoM) to learn how to generate SoMs for a video.
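Once the frames are marked, instance IDs and timestamps can be combined in a single question. The strings below are only illustrative phrasings built from the `[id]` and `<t>` conventions described above; apart from the first one, which appears in the demo, they have not been verified against the model.

```python
# Illustrative question formats combining [id] (instance) and <t> (timestamp)
# references; only the first phrasing comes from the demo below.
questions = [
    "Is [3] visible at <1>?",                  # instance at a specific timestamp
    "Describe [2] in detail.",                 # single instance over the whole video
    "What is [1] doing between <2> and <4>?",  # instance over a time span
    "How does [5] interact with [6] at <3>?",  # instance-instance interaction
]
```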
					
						

```python
import torch
import requests
from PIL import Image

frame_urls = [
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_1.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_2.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_3.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_4.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_5.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_6.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_7.jpg?raw=true",
    "https://github.com/inst-it/inst-it/blob/main/assets/demo/som_frame_8.jpg?raw=true"
]
video = [Image.open(requests.get(frame_url, stream=True).raw) for frame_url in frame_urls]
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda()
video = video.bfloat16()
videos = [video]

# You can use [id] to refer to the instances that you are interested in
question = "Is [3] visible at <1>?"
question = DEFAULT_IMAGE_TOKEN + "\n" + question

conv_template = 'vicuna_v1'
conv = conv_templates[conv_template].copy()
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()

pad_token_ids = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_masks = input_ids.ne(pad_token_ids).long().cuda()

stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

with torch.inference_mode():
    output_ids = model.generate(
        inputs=input_ids,
        images=videos,
        attention_mask=attention_masks,
        modalities="video",
        use_cache=True,
        stopping_criteria=[stopping_criteria],
        max_new_tokens=4096
    )

pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(pred)
```
</details>

## Contact
Feel free to contact us if you have any questions or suggestions:
- Email (Wujian Peng): [email protected]
- Email (Lingchen Meng): [email protected]

## Citation
```bibtex
@article{peng2024inst,
  title={Inst-IT: Boosting Multimodal Instance Understanding via Explicit Visual Prompt Instruction Tuning},
  author={Peng, Wujian and Meng, Lingchen and Chen, Yitong and Xie, Yiweng and Liu, Yang and Gui, Tao and Xu, Hang and Qiu, Xipeng and Wu, Zuxuan and Jiang, Yu-Gang},
  journal={arXiv preprint arXiv:2412.03565},
  year={2024}
}
```