Shanshan Wang
committed on
Commit
·
9581bcd
1
Parent(s):
50fb3ca
updated multimodal benchmarks
Browse files- app.py +64 -9
- filtered_opencompass.csv +16 -0
app.py
CHANGED
|
@@ -4,8 +4,10 @@ import gradio as gr
|
|
| 4 |
|
| 5 |
|
| 6 |
data_path = '0926-OCRBench-opensource.csv'
|
|
|
|
| 7 |
data = pd.read_csv(data_path).fillna(0)
|
| 8 |
|
|
|
|
| 9 |
# set the data types for the columns
|
| 10 |
dtype_dict = {
|
| 11 |
"Model": str,
|
|
@@ -69,18 +71,71 @@ def plot_metric(selected_metric):
|
|
| 69 |
|
| 70 |
return fig
|
| 71 |
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
def create_interface():
|
| 74 |
with gr.Blocks() as interface:
|
| 75 |
-
with gr.
|
| 76 |
-
with gr.
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
|
| 85 |
return interface
|
| 86 |
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
data_path = '0926-OCRBench-opensource.csv'
|
| 7 |
+
data_mmlm_path = 'filtered_opencompass.csv'
|
| 8 |
data = pd.read_csv(data_path).fillna(0)
|
| 9 |
|
| 10 |
+
######## OCRBench ########
|
| 11 |
# set the data types for the columns
|
| 12 |
dtype_dict = {
|
| 13 |
"Model": str,
|
|
|
|
| 71 |
|
| 72 |
return fig
|
| 73 |
|
| 74 |
+
|
| 75 |
+
####### OpenCompass ########
|
| 76 |
+
# Load the OpenCompass comparison table: empty cells are replaced with 0 and
# the verbose average-score header is shortened to "Average Score".
data_mmlm = (
    pd.read_csv(data_mmlm_path)
    .fillna(0)
    .rename(columns={"Avg. Score (8 single image benchmarks)": "Average Score"})
)
# Every column from position 6 onward is treated as a plottable metric
# (with the CSV shown in this commit that starts at "Average Score").
metrics_column = data_mmlm.columns[6:].tolist()
|
| 79 |
+
|
| 80 |
+
def plot_metric_mmlm_grouped(category):
    """Build a grouped bar chart of all metric scores for one category.

    Rows of the module-level ``data_mmlm`` table whose "Category" value
    equals *category* are reshaped to long form and drawn with one bar
    group per metric and one colour per model.

    Args:
        category: Value to match against the "Category" column.

    Returns:
        The figure created by ``px.bar`` after the layout has been adjusted.
    """
    in_category = data_mmlm["Category"] == category
    subset = data_mmlm[in_category].copy()

    # Long form: one (Models, Metrics, Score) row per model/metric pair.
    long_scores = subset.melt(
        id_vars=["Models"],
        value_vars=metrics_column,
        var_name="Metrics",
        value_name="Score",
    )

    chart = px.bar(
        long_scores,
        x="Metrics",
        y="Score",
        color="Models",  # one colour per model within each metric group
        barmode="group",
        title=f"Scores for All Metrics in {category} Category",
    )
    chart.update_layout(
        xaxis_title="Metrics",
        yaxis_title="Score",
        height=600,
        margin={"t": 50, "l": 50, "r": 100, "b": 50},
    )
    return chart
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# Gradio Blocks Interface with Tabs
|
| 113 |
def create_interface():
    """Assemble the two-tab Gradio dashboard and return the Blocks object.

    The "OCRBench" tab shows a plot next to a metric dropdown; the
    "8 Multi-modal Benchmarks" tab shows a category dropdown above a grouped
    plot. Each dropdown redraws its plot when its selection changes.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened diff
    # view in which indentation was lost — confirm against the real app.py.
    with gr.Blocks() as interface:
        with gr.Tabs():
            with gr.Tab("OCRBench"):
                with gr.Row():
                    # Plot column takes 4 parts of the row width.
                    with gr.Column(scale=4):
                        ocr_plot = gr.Plot(
                            value=plot_metric("OCRBench"),
                            label="OCR Benchmark Metrics",
                        )
                    # Dropdown column takes the remaining 1 part.
                    with gr.Column(scale=1):
                        metric_names = list(data_valid.columns[5:-1])
                        metric_picker = gr.Dropdown(
                            metric_names,
                            label="Select Metric",
                            value="OCRBench",
                        )

                # Redraw the OCR plot whenever another metric is selected.
                metric_picker.change(
                    fn=plot_metric,
                    inputs=metric_picker,
                    outputs=ocr_plot,
                )

            with gr.Tab("8 Multi-modal Benchmarks"):
                category_names = data_mmlm["Category"].unique().tolist()
                # The first category found in the table is shown initially.
                first_category = category_names[0]

                with gr.Row():
                    category_picker = gr.Dropdown(
                        category_names,
                        label="Select Category",
                        value=first_category,
                    )

                with gr.Row():
                    grouped_plot = gr.Plot(
                        value=plot_metric_mmlm_grouped(first_category),
                        label="Grouped Metrics for Models",
                    )

                # Redraw the grouped plot whenever another category is selected.
                category_picker.change(
                    fn=plot_metric_mmlm_grouped,
                    inputs=category_picker,
                    outputs=grouped_plot,
                )

    return interface
|
| 141 |
|
filtered_opencompass.csv
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Category,Models,Type,Params (B),Language Model,Vision Model,Avg. Score (8 single image benchmarks),MMBench V1.1_TEST,MMStar,MMMU,Math Vista,Hallusion Bench Avg,AI2D_TEST,OCR Bench,MMVet
|
| 2 |
+
Similar score models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
|
| 3 |
+
Similar score models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
|
| 4 |
+
Similar score models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
|
| 5 |
+
Similar score models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
|
| 6 |
+
Similar score models,Claude3-Opus - Anthropic,Closed,Unknown,,,54.4,59.1,45.7,54.9,45.8,37.8,70.6,69.4,51.7
|
| 7 |
+
Similar score models,Claude3-Sonnet- Anthropic,Closed,Unknown,,,53.5,63.9,44.2,47.4,45.0,41.3,69.9,64.6,51.7
|
| 8 |
+
Similar score models,Cambrian-13B,Open,13,Vicuna-v1.5-13B,CLIP ViT-L/14,53.3,67.5,47.1,41.6,47.4,39.4,73.6,61.0,48.9
|
| 9 |
+
Similar score models,Qwen-VL-Plus - Alibaba,Closed,Unknown,,,52.2,66.2,39.7,39.8,37.6,40.6,65.7,72.6,55.7
|
| 10 |
+
Similar size models,Qwen2-VL-2B,Open,2.1,Qwen2-1.5B,ViT-600M,57.3,72.2,47.5,42.2,47.8,42.4,74.7,79.7,51.5
|
| 11 |
+
Similar size models,H2O-Mississippi-2B,Open,2.1,Danube2 1.8B,InternViT-300M,54.5,64.8,49.6,35.2,56.8,36.4,69.9,78.2,44.7
|
| 12 |
+
Similar size models,InternVL2-2B,Open,2.1,InternLM2-1.8B,InternViT-300M,54.0,69.6,49.8,36.3,46.0,38.0,74.1,78.1,39.7
|
| 13 |
+
Similar size models,Phi-3-Vision - Microsoft,Open,4.2,Phi-3,CLIP ViT-L/14,53.6,65.2,47.7,46.1,44.6,39.0,78.4,63.7,44.1
|
| 14 |
+
Similar size models,MiniCPM-V-2 ,Open,2.8,MiniCPM-2.4B,SigLip-400M,47.9,65.8,39.1,38.2,39.8,36.1,62.9,60.5,41.0
|
| 15 |
+
Similar size models,PaliGemma-3B-mix-448 ,Open,3,Gemma-2B,SigLip-400M,46.6,65.6,48.3,34.9,28.7,32.2,68.3,61.4,33.1
|
| 16 |
+
Similar size models,DeepSeek-VL-1.3B ,Open,2,DeepSeek-1B,SAM-B & SigLIP-L,39.6,63.8,39.9,33.8,29.8,27.6,51.5,41.3,29.2
|