Add library_name to metadata (#11)
- Add library_name to metadata (84e272b26f7f71cd69c8eea0e3133e1276ff4383)
Co-authored-by: Niels Rogge <[email protected]>
README.md CHANGED
```diff
@@ -1,13 +1,14 @@
 ---
-license: apache-2.0
 base_model:
 - Qwen/Qwen2.5-7B-Instruct
-
+datasets:
+- HuggingFaceFV/finevideo
 language:
 - en
 - zh
-
-
+license: apache-2.0
+pipeline_tag: any-to-any
+library_name: transformers
 ---
 
 # Ola-7B
```
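For context, metadata-only changes like this one can be opened programmatically. Below is a minimal sketch using `huggingface_hub.metadata_update`; the repo id `THUdyh/Ola-7b` is an assumption for illustration, not taken from this commit:

```python
# Hedged sketch: open a metadata PR similar to this one.
# Assumption: the repo id below; substitute the real one. Requires a
# write-scoped token (HF_TOKEN env var or the `token=` argument).
from huggingface_hub import metadata_update

metadata_update(
    "THUdyh/Ola-7b",                   # assumed repo id
    {"library_name": "transformers"},  # the key this PR adds
    create_pr=True,                    # open a PR instead of pushing to main
)
```

`library_name: transformers` is the key that tells the Hub which library's loading snippet to surface on the model page.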
````diff
@@ -30,7 +31,7 @@ Ola offers an on-demand solution to seamlessly and efficiently process visual in
 
 We provide a simple generation process for using our model. For more details, please refer to our [Github Repo](https://github.com/Ola-Omni/Ola)
 
-```
+```python
 import os
 os.environ['LOWRES_RESIZE'] = '384x32'
 os.environ['HIGHRES_BASE'] = '0x32'
````
```diff
@@ -177,11 +178,15 @@ def ola_inference(multimodal, audio_path):
     else:
         qs = ''
     if USE_SPEECH and audio_path:
-        qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
+        qs = DEFAULT_IMAGE_TOKEN + "
+" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '
+'
     elif USE_SPEECH:
-        qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
+        qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "
+" + qs
     else:
-        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+        qs = DEFAULT_IMAGE_TOKEN + "
+" + qs
 
     conv = conv_templates[conv_mode].copy()
     conv.append_message(conv.roles[0], qs)
```
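Note that after this hunk the `qs` string literals contain raw newlines, which is why each assignment now spans several physical lines in the diff. For reference, a sketch of the same three-branch prompt construction with the newlines written as explicit `\n` escapes; the token values below are placeholders, in the real script they come from the Ola codebase:

```python
# Same three-branch prompt construction with explicit "\n" escapes.
# Placeholder values: the real DEFAULT_* tokens, USE_SPEECH, audio_path,
# and qs come from the surrounding ola_inference code.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_SPEECH_TOKEN = "<speech>"
USE_SPEECH, audio_path = True, "question.wav"
qs = "What is happening in this video?"

if USE_SPEECH and audio_path:
    # image plus the user's question asked in speech
    qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + "\n"
elif USE_SPEECH:
    # speech and image tokens prefixed to the text question
    qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
else:
    # image token prefixed to the text question
    qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
```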
```diff
@@ -220,6 +225,7 @@ def ola_inference(multimodal, audio_path):
     for visual in image:
         image_tensor_, image_highres_tensor_ = process_anyres_highres_image_genli(visual, image_processor)
         image_tensor.append(image_tensor_)
+
         image_highres_tensor.append(image_highres_tensor_)
     if all(x.shape == image_tensor[0].shape for x in image_tensor):
         image_tensor = torch.stack(image_tensor, dim=0)
```
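The loop in this hunk collects one tensor per input image; the guard afterwards stacks them into a single batch tensor only when every entry has the same shape. A self-contained sketch of that pattern:

```python
# Stand-alone sketch of the shape-guarded stacking pattern above: stack
# per-image tensors into one batch tensor only if all shapes match;
# otherwise they stay as a list of differently sized tensors.
import torch

image_tensor = [torch.randn(3, 384, 384) for _ in range(2)]
if all(x.shape == image_tensor[0].shape for x in image_tensor):
    image_tensor = torch.stack(image_tensor, dim=0)  # shape: (2, 3, 384, 384)
```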
|