add AIBOM
#25
by RiccardoDav - opened

HuggingFaceTB_SmolVLM2-2.2B-Instruct.json ADDED
@@ -0,0 +1,1254 @@
{
  "bomFormat": "CycloneDX",
  "specVersion": "1.6",
  "serialNumber": "urn:uuid:2f96c755-4887-4315-83f9-22de2e18b301",
  "version": 1,
  "metadata": {
    "timestamp": "2025-06-05T09:37:43.782903+00:00",
    "component": {
      "type": "machine-learning-model",
      "bom-ref": "HuggingFaceTB/SmolVLM2-2.2B-Instruct-7235ccb9-479b-53f2-94fa-6512d4edebb3",
      "name": "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
      "externalReferences": [
        {
          "url": "https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct",
          "type": "documentation"
        }
      ],
      "modelCard": {
        "modelParameters": {
          "task": "image-text-to-text",
          "architectureFamily": "smolvlm",
          "modelArchitecture": "SmolVLMForConditionalGeneration",
          "datasets": [
            {
              "ref": "HuggingFaceM4/the_cauldron-0b60b937-29a7-5f0c-9fa6-ec10bf894687"
            },
            {
              "ref": "HuggingFaceM4/Docmatix-1623432e-1ae3-5888-a9c9-23e566c5a05a"
            },
            {
              "ref": "lmms-lab/LLaVA-OneVision-Data-5dedde7e-a93e-51b5-99ad-6e589118a200"
            },
            {
              "ref": "lmms-lab/M4-Instruct-Data-67437f1e-35b0-5a44-8de0-217c202cacf0"
            },
            {
              "ref": "HuggingFaceFV/finevideo-b6414f8b-8137-53ce-bed4-674af789a978"
            },
            {
              "ref": "MAmmoTH-VL/MAmmoTH-VL-Instruct-12M-2c6bc921-c3e2-5525-97ec-d8782f1581e5"
            },
            {
              "ref": "lmms-lab/LLaVA-Video-178K-3489c331-beb5-56f7-87d9-67a7c5d6437c"
            },
            {
              "ref": "orrzohar/Video-STaR-7f15539e-ebd4-570c-92c7-ffffc5190a49"
            },
            {
              "ref": "Mutonix/Vript-4215585f-ed48-5399-8b8c-640ad24fcb01"
            },
            {
              "ref": "TIGER-Lab/VISTA-400K-ce87b7d4-1f05-5464-89f5-82ca09d29429"
            },
            {
              "ref": "Enxin/MovieChat-1K_train-fa5b461a-f000-517a-8048-3363781aa109"
            },
            {
              "ref": "ShareGPT4Video/ShareGPT4Video-4ef9dfce-9cf4-5a48-a489-ac850636b73d"
            }
          ]
        },
        "properties": [
          {
            "name": "library_name",
            "value": "transformers"
          },
          {
            "name": "base_model",
            "value": "HuggingFaceTB/SmolVLM-Instruct"
          }
        ],
        "consideration": {
          "useCases": "SmolVLM2 can be used for inference on multimodal (video / image / text) tasks where the input consists of text queries along with video or one or more images. Text and media files can be interleaved arbitrarily, enabling tasks like captioning, visual question answering, and storytelling based on visual content. The model does not support image or video generation.To fine-tune SmolVLM2 on a specific task, you can follow [the fine-tuning tutorial](https://github.com/huggingface/smollm/blob/main/vision/finetuning/Smol_VLM_FT.ipynb)."
        }
      },
      "authors": [
        {
          "name": "HuggingFaceTB"
        }
      ],
      "licenses": [
        {
          "license": {
            "id": "Apache-2.0",
            "url": "https://spdx.org/licenses/Apache-2.0.html"
          }
        }
      ],
      "description": "- **Developed by:** Hugging Face \ud83e\udd17- **Model type:** Multi-modal model (image/multi-image/video/text)- **Language(s) (NLP):** English- **License:** Apache 2.0- **Architecture:** Based on [Idefics3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (see technical summary)",
      "tags": [
        "transformers",
        "safetensors",
        "smolvlm",
        "image-text-to-text",
        "video-text-to-text",
        "conversational",
        "en",
        "dataset:HuggingFaceM4/the_cauldron",
        "dataset:HuggingFaceM4/Docmatix",
        "dataset:lmms-lab/LLaVA-OneVision-Data",
        "dataset:lmms-lab/M4-Instruct-Data",
        "dataset:HuggingFaceFV/finevideo",
        "dataset:MAmmoTH-VL/MAmmoTH-VL-Instruct-12M",
        "dataset:lmms-lab/LLaVA-Video-178K",
        "dataset:orrzohar/Video-STaR",
        "dataset:Mutonix/Vript",
        "dataset:TIGER-Lab/VISTA-400K",
        "dataset:Enxin/MovieChat-1K_train",
        "dataset:ShareGPT4Video/ShareGPT4Video",
        "arxiv:2504.05299",
        "base_model:HuggingFaceTB/SmolVLM-Instruct",
        "base_model:finetune:HuggingFaceTB/SmolVLM-Instruct",
        "license:apache-2.0",
        "endpoints_compatible",
        "region:us"
      ]
    }
  },
  "components": [
    {
      "type": "data",
      "bom-ref": "HuggingFaceM4/the_cauldron-0b60b937-29a7-5f0c-9fa6-ec10bf894687",
      "name": "HuggingFaceM4/the_cauldron",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "HuggingFaceM4/the_cauldron-0b60b937-29a7-5f0c-9fa6-ec10bf894687",
          "name": "HuggingFaceM4/the_cauldron",
          "contents": {
            "url": "https://huggingface.co/datasets/HuggingFaceM4/the_cauldron",
            "properties": [
              {
                "name": "configs",
                "value": "Name of the dataset subset: ai2d {\"split\": \"train\", \"path\": \"ai2d/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: aokvqa {\"split\": \"train\", \"path\": \"aokvqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: chart2text {\"split\": \"train\", \"path\": \"chart2text/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: chartqa {\"split\": \"train\", \"path\": \"chartqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: clevr {\"split\": \"train\", \"path\": \"clevr/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: clevr_math {\"split\": \"train\", \"path\": \"clevr_math/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: cocoqa {\"split\": \"train\", \"path\": \"cocoqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: datikz {\"split\": \"train\", \"path\": \"datikz/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: diagram_image_to_text {\"split\": \"train\", \"path\": \"diagram_image_to_text/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: docvqa {\"split\": \"train\", \"path\": \"docvqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: dvqa {\"split\": \"train\", \"path\": \"dvqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: figureqa {\"split\": \"train\", \"path\": \"figureqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: finqa {\"split\": \"train\", \"path\": \"finqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: geomverse {\"split\": \"train\", \"path\": \"geomverse/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: hateful_memes {\"split\": \"train\", \"path\": \"hateful_memes/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: hitab {\"split\": \"train\", \"path\": \"hitab/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: iam {\"split\": \"train\", \"path\": \"iam/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: iconqa {\"split\": \"train\", \"path\": \"iconqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: infographic_vqa {\"split\": \"train\", \"path\": \"infographic_vqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: intergps {\"split\": \"train\", \"path\": \"intergps/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: localized_narratives {\"split\": \"train\", \"path\": \"localized_narratives/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: mapqa {\"split\": \"train\", \"path\": \"mapqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: mimic_cgd {\"split\": \"train\", \"path\": \"mimic_cgd/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: multihiertt {\"split\": \"train\", \"path\": \"multihiertt/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: nlvr2 {\"split\": \"train\", \"path\": \"nlvr2/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: ocrvqa {\"split\": \"train\", \"path\": \"ocrvqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: okvqa {\"split\": \"train\", \"path\": \"okvqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: plotqa {\"split\": \"train\", \"path\": \"plotqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: raven {\"split\": \"train\", \"path\": \"raven/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: rendered_text {\"split\": \"train\", \"path\": \"rendered_text/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: robut_sqa {\"split\": \"train\", \"path\": \"robut_sqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: robut_wikisql {\"split\": \"train\", \"path\": \"robut_wikisql/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: robut_wtq {\"split\": \"train\", \"path\": \"robut_wtq/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: scienceqa {\"split\": \"train\", \"path\": \"scienceqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: screen2words {\"split\": \"train\", \"path\": \"screen2words/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: spot_the_diff {\"split\": \"train\", \"path\": \"spot_the_diff/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: st_vqa {\"split\": \"train\", \"path\": \"st_vqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: tabmwp {\"split\": \"train\", \"path\": \"tabmwp/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: tallyqa {\"split\": \"train\", \"path\": \"tallyqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: tat_qa {\"split\": \"train\", \"path\": \"tat_qa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: textcaps {\"split\": \"train\", \"path\": \"textcaps/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: textvqa {\"split\": \"train\", \"path\": \"textvqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: tqa {\"split\": \"train\", \"path\": \"tqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vistext {\"split\": \"train\", \"path\": \"vistext/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: visual7w {\"split\": \"train\", \"path\": \"visual7w/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: visualmrc {\"split\": \"train\", \"path\": \"visualmrc/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vqarad {\"split\": \"train\", \"path\": \"vqarad/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vqav2 {\"split\": \"train\", \"path\": \"vqav2/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vsr {\"split\": \"train\", \"path\": \"vsr/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: websight {\"split\": \"train\", \"path\": \"websight/train-*\"}"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "HuggingFaceM4",
                  "url": "https://huggingface.co/HuggingFaceM4"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tDataset Card for The Cauldron\n\t\n\n\n\n\t\n\t\t\n\t\tDataset description\n\t\n\nThe Cauldron is part of the Idefics2 release.\nIt is a massive collection of 50 vision-language datasets (training sets only) that were used for the fine-tuning of the vision-language model Idefics2.\n\n\t\n\t\t\n\t\tLoad the dataset\n\t\n\nTo load the dataset, install the library datasets with pip install datasets. Then,\nfrom datasets import load_dataset\nds = load_dataset(\"HuggingFaceM4/the_cauldron\", \"ai2d\")\n\nto download and load the\u2026 See the full description on the dataset page: https://huggingface.co/datasets/HuggingFaceM4/the_cauldron."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "HuggingFaceM4/Docmatix-1623432e-1ae3-5888-a9c9-23e566c5a05a",
      "name": "HuggingFaceM4/Docmatix",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "HuggingFaceM4/Docmatix-1623432e-1ae3-5888-a9c9-23e566c5a05a",
          "name": "HuggingFaceM4/Docmatix",
          "contents": {
            "url": "https://huggingface.co/datasets/HuggingFaceM4/Docmatix",
            "properties": [
              {
                "name": "task_categories",
                "value": "visual-question-answering"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "size_categories",
                "value": "1M<n<10M"
              },
              {
                "name": "pretty_name",
                "value": "Docmatix"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: images {\"split\": \"train\", \"path\": \"data/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: pdf {\"split\": \"train\", \"path\": \"pdf/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: zero-shot-exp {\"split\": \"train\", \"path\": \"zero-shot-exp/train-*\"}, {\"split\": \"test\", \"path\": \"zero-shot-exp/test-*\"}"
              },
              {
                "name": "license",
                "value": "mit"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "HuggingFaceM4",
                  "url": "https://huggingface.co/HuggingFaceM4"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tDataset Card for Docmatix\n\t\n\n\n\n\t\n\t\t\n\t\tDataset description\n\t\n\nDocmatix is part of the Idefics3 release (stay tuned).\nIt is a massive dataset for Document Visual Question Answering that was used for the fine-tuning of the vision-language model Idefics3.\n\n\t\n\t\t\n\t\tLoad the dataset\n\t\n\nTo load the dataset, install the library datasets with pip install datasets. Then,\nfrom datasets import load_dataset\nds = load_dataset(\"HuggingFaceM4/Docmatix\")\n\nIf you want the dataset to link to the pdf files\u2026 See the full description on the dataset page: https://huggingface.co/datasets/HuggingFaceM4/Docmatix."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "lmms-lab/LLaVA-OneVision-Data-5dedde7e-a93e-51b5-99ad-6e589118a200",
      "name": "lmms-lab/LLaVA-OneVision-Data",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "lmms-lab/LLaVA-OneVision-Data-5dedde7e-a93e-51b5-99ad-6e589118a200",
          "name": "lmms-lab/LLaVA-OneVision-Data",
          "contents": {
            "url": "https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data",
            "properties": [
              {
                "name": "language",
                "value": "en, zh"
              },
              {
                "name": "pretty_name",
                "value": "llava-onevision-data"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: CLEVR-Math(MathV360K) {\"split\": \"train\", \"path\": \"CLEVR-Math(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: Evol-Instruct-GPT4-Turbo {\"split\": \"train\", \"path\": \"Evol-Instruct-GPT4-Turbo/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: FigureQA(MathV360K) {\"split\": \"train\", \"path\": \"FigureQA(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: GEOS(MathV360K) {\"split\": \"train\", \"path\": \"GEOS(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: GeoQA+(MathV360K) {\"split\": \"train\", \"path\": \"GeoQA+(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: Geometry3K(MathV360K) {\"split\": \"train\", \"path\": \"Geometry3K(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: IconQA(MathV360K) {\"split\": \"train\", \"path\": \"IconQA(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: MapQA(MathV360K) {\"split\": \"train\", \"path\": \"MapQA(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: MathV360K_TQA {\"split\": \"train\", \"path\": \"MathV360K_TQA/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: MathV360K_VQA-AS {\"split\": \"train\", \"path\": \"MathV360K_VQA-AS/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: MathV360K_VQA-RAD {\"split\": \"train\", \"path\": \"MathV360K_VQA-RAD/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: PMC-VQA(MathV360K) {\"split\": \"train\", \"path\": \"PMC-VQA(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: Super-CLEVR(MathV360K) {\"split\": \"train\", \"path\": \"Super-CLEVR(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: TabMWP(MathV360K) {\"split\": \"train\", \"path\": \"TabMWP(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: UniGeo(MathV360K) {\"split\": \"train\", \"path\": \"UniGeo(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: VisualWebInstruct(filtered) {\"split\": \"train\", \"path\": \"VisualWebInstruct(filtered)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: VizWiz(MathV360K) {\"split\": \"train\", \"path\": \"VizWiz(MathV360K)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: ai2d(cauldron,llava_format) {\"split\": \"train\", \"path\": \"ai2d(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: ai2d(gpt4v) {\"split\": \"train\", \"path\": \"ai2d(gpt4v)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: ai2d(internvl) {\"split\": \"train\", \"path\": \"ai2d(internvl)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: allava_instruct_laion4v {\"split\": \"train\", \"path\": \"allava_instruct_laion4v/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: allava_instruct_vflan4v {\"split\": \"train\", \"path\": \"allava_instruct_vflan4v/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: aokvqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"aokvqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: chart2text(cauldron) {\"split\": \"train\", \"path\": \"chart2text(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: chartqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"chartqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: chrome_writting {\"split\": \"train\", \"path\": \"chrome_writting/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: clevr(cauldron,llava_format) {\"split\": \"train\", \"path\": \"clevr(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: diagram_image_to_text(cauldron) {\"split\": \"train\", \"path\": \"diagram_image_to_text(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: dvqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"dvqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: figureqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"figureqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: geo170k(align) {\"split\": \"train\", \"path\": \"geo170k(align)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: geo170k(qa) {\"split\": \"train\", \"path\": \"geo170k(qa)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: geo3k {\"split\": \"train\", \"path\": \"geo3k/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: geomverse(cauldron) {\"split\": \"train\", \"path\": \"geomverse(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: hateful_memes(cauldron,llava_format) {\"split\": \"train\", \"path\": \"hateful_memes(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: hitab(cauldron,llava_format) {\"split\": \"train\", \"path\": \"hitab(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: hme100k {\"split\": \"train\", \"path\": \"hme100k/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: iam(cauldron) {\"split\": \"train\", \"path\": \"iam(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: iconqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"iconqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: iiit5k {\"split\": \"train\", \"path\": \"iiit5k/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: image_textualization(filtered) {\"split\": \"train\", \"path\": \"image_textualization(filtered)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: infographic(gpt4v) {\"split\": \"train\", \"path\": \"infographic(gpt4v)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: infographic_vqa {\"split\": \"train\", \"path\": \"infographic_vqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: infographic_vqa_llava_format {\"split\": \"train\", \"path\": \"infographic_vqa_llava_format/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: intergps(cauldron,llava_format) {\"split\": \"train\", \"path\": \"intergps(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: k12_printing {\"split\": \"train\", \"path\": \"k12_printing/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: llava_wild_4v_12k_filtered {\"split\": \"train\", \"path\": \"llava_wild_4v_12k_filtered/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: llava_wild_4v_39k_filtered {\"split\": \"train\", \"path\": \"llava_wild_4v_39k_filtered/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: llavar_gpt4_20k {\"split\": \"train\", \"path\": \"llavar_gpt4_20k/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: lrv_chart {\"split\": \"train\", \"path\": \"lrv_chart/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: lrv_normal(filtered) {\"split\": \"train\", \"path\": \"lrv_normal(filtered)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: magpie_pro(l3_80b_mt) {\"split\": \"train\", \"path\": \"magpie_pro(l3_80b_mt)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: magpie_pro(l3_80b_st) {\"split\": \"train\", \"path\": \"magpie_pro(l3_80b_st)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: magpie_pro(qwen2_72b_st) {\"split\": \"train\", \"path\": \"magpie_pro(qwen2_72b_st)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: mapqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"mapqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: mathqa {\"split\": \"train\", \"path\": \"mathqa/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: mavis_math_metagen {\"split\": \"train\", \"path\": \"mavis_math_metagen/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: mavis_math_rule_geo {\"split\": \"train\", \"path\": \"mavis_math_rule_geo/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: multihiertt(cauldron) {\"split\": \"train\", \"path\": \"multihiertt(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: orand_car_a {\"split\": \"train\", \"path\": \"orand_car_a/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: raven(cauldron) {\"split\": \"train\", \"path\": \"raven(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: rendered_text(cauldron) {\"split\": \"train\", \"path\": \"rendered_text(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: robut_sqa(cauldron) {\"split\": \"train\", \"path\": \"robut_sqa(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: robut_wikisql(cauldron) {\"split\": \"train\", \"path\": \"robut_wikisql(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: robut_wtq(cauldron,llava_format) {\"split\": \"train\", \"path\": \"robut_wtq(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: scienceqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"scienceqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: scienceqa(nona_context) {\"split\": \"train\", \"path\": \"scienceqa(nona_context)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: screen2words(cauldron) {\"split\": \"train\", \"path\": \"screen2words(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: sharegpt4o {\"split\": \"train\", \"path\": \"sharegpt4o/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: sharegpt4v(coco) {\"split\": \"train\", \"path\": \"sharegpt4v(coco)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: sharegpt4v(knowledge) {\"split\": \"train\", \"path\": \"sharegpt4v(knowledge)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: sharegpt4v(llava) {\"split\": \"train\", \"path\": \"sharegpt4v(llava)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: sharegpt4v(sam) {\"split\": \"train\", \"path\": \"sharegpt4v(sam)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: sroie {\"split\": \"train\", \"path\": \"sroie/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: st_vqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"st_vqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: tabmwp(cauldron) {\"split\": \"train\", \"path\": \"tabmwp(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: tallyqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"tallyqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: textcaps {\"split\": \"train\", \"path\": \"textcaps/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: textocr(gpt4v) {\"split\": \"train\", \"path\": \"textocr(gpt4v)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: tqa(cauldron,llava_format) {\"split\": \"train\", \"path\": \"tqa(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: ureader_cap {\"split\": \"train\", \"path\": \"ureader_cap/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: ureader_ie {\"split\": \"train\", \"path\": \"ureader_ie/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vision_flan(filtered) {\"split\": \"train\", \"path\": \"vision_flan(filtered)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vistext(cauldron) {\"split\": \"train\", \"path\": \"vistext(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: visual7w(cauldron,llava_format) {\"split\": \"train\", \"path\": \"visual7w(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: visualmrc(cauldron) {\"split\": \"train\", \"path\": \"visualmrc(cauldron)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vqarad(cauldron,llava_format) {\"split\": \"train\", \"path\": \"vqarad(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vsr(cauldron,llava_format) {\"split\": \"train\", \"path\": \"vsr(cauldron,llava_format)/train-*\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: websight(cauldron) {\"split\": \"train\", \"path\": \"websight(cauldron)/train-*\"}"
              },
              {
                "name": "license",
                "value": "apache-2.0"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "lmms-lab",
                  "url": "https://huggingface.co/lmms-lab"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tDataset Card for LLaVA-OneVision\n\t\n\n[2024-09-01]: Uploaded VisualWebInstruct(filtered), it's used in OneVision Stage\n\nalmost all subsets are uploaded with HF's required format and you can use the recommended interface to download them and follow our code below to convert them. \n\n\nthe subset of ureader_kg and ureader_qa are uploaded with the processed jsons and tar.gz of image folders.\nYou may directly download them from the following url.\u2026 See the full description on the dataset page: https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "lmms-lab/M4-Instruct-Data-67437f1e-35b0-5a44-8de0-217c202cacf0",
      "name": "lmms-lab/M4-Instruct-Data",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "lmms-lab/M4-Instruct-Data-67437f1e-35b0-5a44-8de0-217c202cacf0",
          "name": "lmms-lab/M4-Instruct-Data",
          "contents": {
            "url": "https://huggingface.co/datasets/lmms-lab/M4-Instruct-Data",
            "properties": [
              {
                "name": "task_categories",
                "value": "visual-question-answering, question-answering"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "size_categories",
                "value": "100K<n<1M"
              },
              {
                "name": "pretty_name",
                "value": "M4-Instruct"
              },
              {
                "name": "license",
                "value": "cc-by-4.0"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "lmms-lab",
                  "url": "https://huggingface.co/lmms-lab"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tM4-Instruct Dataset Card\n\t\n\n\n\t\n\t\t\n\t\tDataset details\n\t\n\nDataset type:\nM4-Instruct is a set of multi-image datasets that are collected from public datasets or generated by the GPT-4V API.\nIt is constructed for training LMMs for their interleaved multi-image capbilities, e.g., LLaVA-NeXT-Interleave.\nDataset date:\nM4-Instruct was collected in April 2024, and released in June 2024.\nPaper or resources for more information:\nBlog:\u2026 See the full description on the dataset page: https://huggingface.co/datasets/lmms-lab/M4-Instruct-Data."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "HuggingFaceFV/finevideo-b6414f8b-8137-53ce-bed4-674af789a978",
      "name": "HuggingFaceFV/finevideo",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "HuggingFaceFV/finevideo-b6414f8b-8137-53ce-bed4-674af789a978",
          "name": "HuggingFaceFV/finevideo",
          "contents": {
            "url": "https://huggingface.co/datasets/HuggingFaceFV/finevideo",
            "properties": [
              {
                "name": "task_categories",
                "value": "visual-question-answering, video-text-to-text"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "size_categories",
                "value": "10K<n<100K"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: default {\"split\": \"train\", \"path\": \"data/train-*\"}"
              },
              {
                "name": "license",
                "value": "cc"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "HuggingFaceFV",
                  "url": "https://huggingface.co/HuggingFaceFV"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tFineVideo\n\t\n\n\n \n\n\n\nFineVideo\nDescription\nDataset Explorer\nRevisions\nDataset Distribution\n\n\nHow to download and use FineVideo\nUsing datasets\nUsing huggingface_hub\nLoad a subset of the dataset\n\n\nDataset StructureData Instances\nData Fields\n\n\nDataset Creation\nLicense CC-By\nConsiderations for Using the Data\nSocial Impact of Dataset\nDiscussion of Biases\n\n\nAdditional Information\nCredits\nFuture Work\nOpting out of FineVideo\nCitation Information\n\n\nTerms of use for FineVideo\u2026 See the full description on the dataset page: https://huggingface.co/datasets/HuggingFaceFV/finevideo."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "MAmmoTH-VL/MAmmoTH-VL-Instruct-12M-2c6bc921-c3e2-5525-97ec-d8782f1581e5",
      "name": "MAmmoTH-VL/MAmmoTH-VL-Instruct-12M",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "MAmmoTH-VL/MAmmoTH-VL-Instruct-12M-2c6bc921-c3e2-5525-97ec-d8782f1581e5",
          "name": "MAmmoTH-VL/MAmmoTH-VL-Instruct-12M",
          "contents": {
            "url": "https://huggingface.co/datasets/MAmmoTH-VL/MAmmoTH-VL-Instruct-12M",
            "properties": [
              {
                "name": "task_categories",
                "value": "visual-question-answering, question-answering"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "size_categories",
                "value": "10M<n<100M"
              },
              {
                "name": "license",
                "value": "apache-2.0"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "MAmmoTH-VL",
                  "url": "https://huggingface.co/MAmmoTH-VL"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tMAmmoTH-VL-Instruct-12M\n\t\n\n\ud83c\udfe0 Homepage | \ud83e\udd16 MAmmoTH-VL-8B | \ud83d\udcbb Code | \ud83d\udcc4 Arxiv | \ud83d\udcd5 PDF | \ud83d\udda5\ufe0f Demo\n\n\t\n\t\t\n\t\n\t\n\t\tIntroduction\n\t\n\nOur simple yet scalable visual instruction data rewriting pipeline consists of three steps: manual data source collection, rewriting using MLLMs/LLMs, and filtering via the same MLLM as a judge. Examples below illustrate transformations in math and science categories, showcasing detailed, step-by-step responses.\n\n\t\n\t\t\n\t\tThe data distribution of\u2026 See the full description on the dataset page: https://huggingface.co/datasets/MAmmoTH-VL/MAmmoTH-VL-Instruct-12M."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "lmms-lab/LLaVA-Video-178K-3489c331-beb5-56f7-87d9-67a7c5d6437c",
      "name": "lmms-lab/LLaVA-Video-178K",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "lmms-lab/LLaVA-Video-178K-3489c331-beb5-56f7-87d9-67a7c5d6437c",
          "name": "lmms-lab/LLaVA-Video-178K",
          "contents": {
            "url": "https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K",
            "properties": [
              {
                "name": "task_categories",
                "value": "visual-question-answering, video-text-to-text"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 0_30_s_academic_v0_1 {\"split\": \"caption\", \"path\": \"0_30_s_academic_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"0_30_s_academic_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"0_30_s_academic_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 0_30_s_youtube_v0_1 {\"split\": \"caption\", \"path\": \"0_30_s_youtube_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"0_30_s_youtube_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"0_30_s_youtube_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 0_30_s_activitynet {\"split\": \"open_ended\", \"path\": \"0_30_s_activitynet/*oe*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 0_30_s_perceptiontest {\"split\": \"multi_choice\", \"path\": \"0_30_s_perceptiontest/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 0_30_s_nextqa {\"split\": \"open_ended\", \"path\": \"0_30_s_nextqa/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"0_30_s_nextqa/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 30_60_s_academic_v0_1 {\"split\": \"caption\", \"path\": \"30_60_s_academic_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"30_60_s_academic_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"30_60_s_academic_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 30_60_s_youtube_v0_1 {\"split\": \"caption\", \"path\": \"30_60_s_youtube_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"30_60_s_youtube_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"30_60_s_youtube_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 30_60_s_activitynet {\"split\": \"open_ended\", \"path\": \"30_60_s_activitynet/*oe*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 30_60_s_perceptiontest {\"split\": \"multi_choice\", \"path\": \"30_60_s_perceptiontest/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 30_60_s_nextqa {\"split\": \"open_ended\", \"path\": \"30_60_s_nextqa/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"30_60_s_nextqa/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 1_2_m_youtube_v0_1 {\"split\": \"caption\", \"path\": \"1_2_m_youtube_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"1_2_m_youtube_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"1_2_m_youtube_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 1_2_m_academic_v0_1 {\"split\": \"caption\", \"path\": \"1_2_m_academic_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"1_2_m_academic_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"1_2_m_academic_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 1_2_m_activitynet {\"split\": \"open_ended\", \"path\": \"1_2_m_activitynet/*oe*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 1_2_m_nextqa {\"split\": \"open_ended\", \"path\": \"1_2_m_nextqa/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"1_2_m_nextqa/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 2_3_m_youtube_v0_1 {\"split\": \"caption\", \"path\": \"2_3_m_youtube_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"2_3_m_youtube_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"2_3_m_youtube_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 2_3_m_academic_v0_1 {\"split\": \"caption\", \"path\": \"2_3_m_academic_v0_1/*cap*.json\"}, {\"split\": \"open_ended\", \"path\": \"2_3_m_academic_v0_1/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"2_3_m_academic_v0_1/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 2_3_m_activitynet {\"split\": \"open_ended\", \"path\": \"2_3_m_activitynet/*oe*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: 2_3_m_nextqa {\"split\": \"open_ended\", \"path\": \"2_3_m_nextqa/*oe*.json\"}, {\"split\": \"multi_choice\", \"path\": \"2_3_m_nextqa/*mc*.json\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: llava_hound {\"split\": \"open_ended\", \"path\": \"llava_hound/sharegptvideo_qa_255k_processed.json\"}"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "lmms-lab",
                  "url": "https://huggingface.co/lmms-lab"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tDataset Card for LLaVA-Video-178K\n\t\n\n\n\t\n\t\t\n\t\tUses\n\t\n\nThis dataset is used for the training of the LLaVA-Video model. We only allow the use of this dataset for academic research and education purpose. For OpenAI GPT-4 generated data, we recommend the users to check the OpenAI Usage Policy.\n\n\t\n\t\t\n\t\tData Sources\n\t\n\nFor the training of LLaVA-Video, we utilized video-language data from five primary sources:\n\nLLaVA-Video-178K: This dataset includes 178,510 caption entries, 960,792 open-ended\u2026 See the full description on the dataset page: https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "orrzohar/Video-STaR-7f15539e-ebd4-570c-92c7-ffffc5190a49",
      "name": "orrzohar/Video-STaR",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "orrzohar/Video-STaR-7f15539e-ebd4-570c-92c7-ffffc5190a49",
          "name": "orrzohar/Video-STaR",
          "contents": {
            "url": "https://huggingface.co/datasets/orrzohar/Video-STaR",
            "properties": [
              {
                "name": "task_categories",
                "value": "visual-question-answering, question-answering"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "size_categories",
                "value": "100K<n<1M"
              },
              {
                "name": "pretty_name",
                "value": "VSTaR-1M dataset"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: Kinetics700 \"f\", \"t\", \"_\", \"j\", \"s\", \"o\", \"n\", \"/\", \"k\", \"i\", \"n\", \"e\", \"t\", \"i\", \"c\", \"s\", \"7\", \"0\", \"0\", \"_\", \"t\", \"u\", \"n\", \"e\", \"_\", \".\", \"j\", \"s\", \"o\", \"n\""
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: STAR-benchmark \"f\", \"t\", \"_\", \"j\", \"s\", \"o\", \"n\", \"/\", \"s\", \"t\", \"a\", \"r\", \"b\", \"_\", \"t\", \"u\", \"n\", \"e\", \"_\", \".\", \"j\", \"s\", \"o\", \"n\""
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: FineDiving \"f\", \"t\", \"_\", \"j\", \"s\", \"o\", \"n\", \"/\", \"f\", \"i\", \"n\", \"e\", \"d\", \"i\", \"v\", \"i\", \"n\", \"g\", \"_\", \"t\", \"u\", \"n\", \"e\", \"_\", \".\", \"j\", \"s\", \"o\", \"n\""
              },
              {
                "name": "license",
                "value": "apache-2.0"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "orrzohar",
                  "url": "https://huggingface.co/orrzohar"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tVideo-STaR 1M Dataset Card\n\t\n\n[\ud83d\udda5\ufe0f Website]\n[\ud83d\udcf0 Paper] \n[\ud83d\udcab Code]\n[\ud83e\udd17 Demo]\n\n\t\n\t\t\n\t\t\ud83c\udfa5 Dataset details\n\t\n\nDataset type:\nVSTaR-1M is a 1M instruction tuning dataset, created using Video-STaR, with the source datasets: \n\nKinetics700\nSTAR-benchmark\nFineDiving\n\nThe videos for VSTaR-1M can be found in the links above. \nVSTaR-1M is built off of diverse task with the goal of enhancing video-language alignment in Large Video-Language Models (LVLMs).\n\nkinetics700_tune_.json - Instruction tuning\u2026 See the full description on the dataset page: https://huggingface.co/datasets/orrzohar/Video-STaR."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "Mutonix/Vript-4215585f-ed48-5399-8b8c-640ad24fcb01",
      "name": "Mutonix/Vript",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "Mutonix/Vript-4215585f-ed48-5399-8b8c-640ad24fcb01",
          "name": "Mutonix/Vript",
          "contents": {
            "url": "https://huggingface.co/datasets/Mutonix/Vript",
            "properties": [
              {
                "name": "task_categories",
                "value": "video-classification, visual-question-answering, text-to-video, text-to-image, image-to-video"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "size_categories",
                "value": "100K<n<1M"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vript-long {\"split\": \"train\", \"path\": \"vript_captions/vript_long_videos_captions.jsonl\"}"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: vript-short {\"split\": \"train\", \"path\": \"vript_captions/vript_short_videos_captions.jsonl\"}"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "Mutonix",
                  "url": "https://huggingface.co/Mutonix"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\t\ud83c\udfac Vript: Refine Video Captioning into Video Scripting [Github Repo]\n\t\n\n\nWe construct a fine-grained video-text dataset with 12K annotated high-resolution videos (~400k clips). The annotation of this dataset is inspired by the video script. If we want to make a video, we have to first write a script to organize how to shoot the scenes in the videos. To shoot a scene, we need to decide the content, shot type (medium shot, close-up, etc), and how the camera moves (panning, tilting, etc).\u2026 See the full description on the dataset page: https://huggingface.co/datasets/Mutonix/Vript."
        }
      ]
    },
    {
      "type": "data",
      "bom-ref": "TIGER-Lab/VISTA-400K-ce87b7d4-1f05-5464-89f5-82ca09d29429",
      "name": "TIGER-Lab/VISTA-400K",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "TIGER-Lab/VISTA-400K-ce87b7d4-1f05-5464-89f5-82ca09d29429",
          "name": "TIGER-Lab/VISTA-400K",
          "contents": {
            "url": "https://huggingface.co/datasets/TIGER-Lab/VISTA-400K",
            "properties": [
              {
                "name": "task_categories",
                "value": "question-answering, video-text-to-text"
              },
              {
                "name": "license",
                "value": "mit"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "TIGER-Lab",
                  "url": "https://huggingface.co/TIGER-Lab"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tVISTA-400K\n\t\n\nThis repo contains all subsets for VISTA-400K. VISTA is a video spatiotemporal augmentation method that generates long-duration and high-resolution video instruction-following data to enhance the video understanding capabilities of video LMMs.\n\n\t\n\t\t\n\t\tThis repo is under construction. Please stay tuned.\n\t\n\n\ud83c\udf10 Homepage | \ud83d\udcd6 arXiv | \ud83d\udcbb GitHub | \ud83e\udd17 VISTA-400K | \ud83e\udd17 Models | \ud83e\udd17 HRVideoBench\n\n\t\n\t\n\t\n\t\tVideo Instruction Data Synthesis Pipeline\n\t\n\n\nVISTA leverages insights from\u2026 See the full description on the dataset page: https://huggingface.co/datasets/TIGER-Lab/VISTA-400K."
        }
      ]
    },
    null,
    {
      "type": "data",
      "bom-ref": "ShareGPT4Video/ShareGPT4Video-4ef9dfce-9cf4-5a48-a489-ac850636b73d",
      "name": "ShareGPT4Video/ShareGPT4Video",
      "data": [
        {
          "type": "dataset",
          "bom-ref": "ShareGPT4Video/ShareGPT4Video-4ef9dfce-9cf4-5a48-a489-ac850636b73d",
          "name": "ShareGPT4Video/ShareGPT4Video",
          "contents": {
            "url": "https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video",
            "properties": [
              {
                "name": "task_categories",
                "value": "visual-question-answering, question-answering"
              },
              {
                "name": "language",
                "value": "en"
              },
              {
                "name": "size_categories",
                "value": "1M<n"
              },
              {
                "name": "pretty_name",
                "value": "ShareGPT4Video Captions Dataset Card"
              },
              {
                "name": "configs",
                "value": "Name of the dataset subset: ShareGPT4Video \"s\", \"h\", \"a\", \"r\", \"e\", \"g\", \"p\", \"t\", \"4\", \"v\", \"i\", \"d\", \"e\", \"o\", \"_\", \"4\", \"0\", \"k\", \".\", \"j\", \"s\", \"o\", \"n\", \"l\""
              },
              {
                "name": "license",
                "value": "cc-by-nc-4.0"
              }
            ]
          },
          "governance": {
            "owners": [
              {
                "organization": {
                  "name": "ShareGPT4Video",
                  "url": "https://huggingface.co/ShareGPT4Video"
                }
              }
            ]
          },
          "description": "\n\t\n\t\t\n\t\tShareGPT4Video 4.8M Dataset Card\n\t\n\n\n\t\n\t\t\n\t\tDataset details\n\t\n\nDataset type:\nShareGPT4Video Captions 4.8M is a set of GPT4-Vision-powered multi-modal captions data of videos.\nIt is constructed to enhance modality alignment and fine-grained visual concept perception in Large Video-Language Models (LVLMs) and Text-to-Video Models (T2VMs). This advancement aims to bring LVLMs and T2VMs towards the capabilities of GPT4V and Sora.\n\nsharegpt4video_40k.jsonl is generated by GPT4-Vision\u2026 See the full description on the dataset page: https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video."
        }
      ]
    }
  ]
}
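For a quick review of the added AIBOM, a minimal sketch like the one below parses the file with Python's standard library and lists the declared model and dataset components. It assumes the JSON is saved under the file name used in this PR and deliberately skips the one null entry present in the `components` array; no CycloneDX-specific tooling is assumed.

```python
import json

# Load the CycloneDX 1.6 AIBOM added in this PR.
with open("HuggingFaceTB_SmolVLM2-2.2B-Instruct.json") as f:
    bom = json.load(f)

model = bom["metadata"]["component"]
print(f"Model: {model['name']} ({model['type']})")

# Enumerate dataset components; guard against the null entry in the array.
for comp in bom.get("components", []):
    if isinstance(comp, dict) and comp.get("type") == "data":
        print(f"  dataset: {comp['name']}")
```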