Spaces:

h2oai
/

h2ovl-mississippi-benchmarks

Running

App Files Files Community

Shanshan Wang committed on Sep 27, 2024

Commit

9581bcd

1 Parent(s): 50fb3ca

updated multimodal benchmarks

Browse files

Files changed (2) hide show

app.py +64 -9
filtered_opencompass.csv +16 -0

app.py CHANGED Viewed

@@ -4,8 +4,10 @@ import gradio as gr
 data_path = '0926-OCRBench-opensource.csv'
 data = pd.read_csv(data_path).fillna(0)
 # set the data types for the columns
 dtype_dict = {
     "Model": str,
@@ -69,18 +71,71 @@ def plot_metric(selected_metric):
     return fig
-# Gradio Blocks Interface
 def create_interface():
     with gr.Blocks() as interface:
-        with gr.Row():
-            with gr.Column(scale=4):  # Column for the plot (takes 4 parts of the total space)
-                plot = gr.Plot(value=plot_metric("OCRBench"), label="OCR Benchmark Metrics")  # default plot component initially
-            with gr.Column(scale=1):  # Column for the dropdown (takes 1 part of the total space)
-                metrics = list(data_valid.columns[5:-1])  # List of metric columns (excluding 'Model' and 'Parameter Size')
-                dropdown = gr.Dropdown(metrics, label="Select Metric", value="OCRBench")
-        # Update the plot when dropdown selection changes
-        dropdown.change(fn=plot_metric, inputs=dropdown, outputs=plot)
     return interface

 data_path = '0926-OCRBench-opensource.csv'
+data_mmlm_path = 'filtered_opencompass.csv'
 data = pd.read_csv(data_path).fillna(0)
+######## OCRBench ########
 # set the data types for the columns
 dtype_dict = {
     "Model": str,
     return fig
+####### OpenCompass ########
+data_mmlm = pd.read_csv(data_mmlm_path).fillna(0)  # Load the OpenCompass table; empty cells become 0
+data_mmlm.rename(columns={"Avg. Score (8 single image benchmarks)": "Average Score"}, inplace=True)  # Shorter column name, renamed in place
+metrics_column = list(data_mmlm.columns)[6:]  # Columns from index 6 onward ("Average Score" and the per-benchmark scores in filtered_opencompass.csv)
+def plot_metric_mmlm_grouped(category):  # Build and return a grouped bar chart of all metric columns for the models in `category`
+    # Keep only the rows of the selected category; .copy() leaves the module-level data_mmlm untouched
+    filtered_data = data_mmlm[data_mmlm["Category"] == category].copy()
+    # Reshape wide -> long: one row per (model, metric) pair, with "Metrics" and "Score" columns
+    melted_data = pd.melt(
+        filtered_data,
+        id_vars=["Models"],  # Keep the "Models" column as the identifier
+        value_vars=metrics_column,  # Melt all the metric columns (module-level list)
+        var_name="Metrics",  # Name for the new column containing the metric names
+        value_name="Score"  # Name for the new column containing the scores
+    )
+    # Generate a grouped bar chart: metrics on the x axis, one bar per model
+    fig = px.bar(
+        melted_data,
+        x="Metrics",
+        y="Score",
+        color="Models",  # One color per model
+        barmode="group",  # Bars of different models sit side by side within each metric
+        title=f"Scores for All Metrics in {category} Category"
+    )
+    fig.update_layout(
+        xaxis_title="Metrics",
+        yaxis_title="Score",
+        height=600,
+        margin=dict(t=50, l=50, r=100, b=50),  # Wider right margin than the other sides; presumably room for the legend
+    )
+    return fig
+# Gradio Blocks Interface with Tabs
 def create_interface():  # Assemble the two-tab Gradio Blocks UI and return it
     with gr.Blocks() as interface:
+        with gr.Tabs():
+            with gr.Tab("OCRBench"):
+                with gr.Row():
+                    with gr.Column(scale=4):  # Plot column (4 parts of the row width vs. 1 for the dropdown)
+                        plot = gr.Plot(value=plot_metric("OCRBench"), label="OCR Benchmark Metrics")  # Initially shows the "OCRBench" metric
+                    with gr.Column(scale=1):  # Dropdown column (1 part of the row width)
+                        metrics = list(data_valid.columns[5:-1])  # Metric choices: data_valid columns from index 5 up to, but not including, the last one (data_valid is defined outside this excerpt; confirm which columns that skips)
+                        dropdown = gr.Dropdown(metrics, label="Select Metric", value="OCRBench")
+                # Redraw the OCRBench plot whenever the selected metric changes
+                dropdown.change(fn=plot_metric, inputs=dropdown, outputs=plot)
+            with gr.Tab("8 Multi-modal Benchmarks"):
+                with gr.Row():
+                    # Category choices are the distinct "Category" values in data_mmlm; the first one is the default selection
+                    categories = data_mmlm["Category"].unique().tolist()
+                    category_dropdown = gr.Dropdown(categories, label="Select Category", value=categories[0])
+                with gr.Row():
+                    mm_plot = gr.Plot(value=plot_metric_mmlm_grouped(categories[0]), label="Grouped Metrics for Models")  # Initially shows the default category
+                # Redraw the grouped chart whenever the selected category changes
+                category_dropdown.change(fn=plot_metric_mmlm_grouped, inputs=category_dropdown, outputs=mm_plot)
     return interface

filtered_opencompass.csv ADDED Viewed

	@@ -0,0 +1,16 @@

+Category,Models,Type,Params (B),Language Model,Vision Model,Avg. Score (8 single image benchmarks),MMBench V1.1_TEST,MMStar,MMMU,Math Vista,Hallusion Bench Avg,AI2D_TEST,OCR Bench,MMVet
+Similar score models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
+Similar score models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
+Similar score models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
+Similar score models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
+Similar score models,Claude3-Opus - Anthropic,Closed,Unknown,,,54.4,59.1,45.7,54.9,45.8,37.8,70.6,69.4,51.7
+Similar score models,Claude3-Sonnet - Anthropic,Closed,Unknown,,,53.5,63.9,44.2,47.4,45.0,41.3,69.9,64.6,51.7
+Similar score models,Cambrian-13B,Open,13,Vicuna-v1.5-13B,CLIP ViT-L/14,53.3,67.5,47.1,41.6,47.4,39.4,73.6,61.0,48.9
+Similar score models,Qwen-VL-Plus - Alibaba,Closed,Unknown,,,52.2,66.2,39.7,39.8,37.6,40.6,65.7,72.6,55.7
+Similar size models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
+Similar size models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
+Similar size models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
+Similar size models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
+Similar size models,MiniCPM-V-2 ,Open,2.8,MiniCPM-2.4B,SigLip-400M,47.9,65.8,39.1,38.2,39.8,36.1,62.9,60.5,41.0
+Similar size models,PaliGemma-3B-mix-448 ,Open,3,Gemma-2B,SigLip-400M,46.6,65.6,48.3,34.9,28.7,32.2,68.3,61.4,33.1
+Similar size models,DeepSeek-VL-1.3B ,Open,2,DeepSeek-1B,SAM-B & SigLIP-L,39.6,63.8,39.9,33.8,29.8,27.6,51.5,41.3,29.2