[email protected] committed
Commit fa0f3d4 · 1 Parent(s): 2c9911b

update

- app.py +9 -2
- src/about.py +5 -1
- src/display/utils.py +7 -1
- src/leaderboard/read_evals.py +1 -1
app.py
CHANGED
@@ -5,7 +5,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 import plotly.graph_objects as go
 import plotly.express as px
-from src.about import Tasks, AssetTasks
+from src.about import Tasks, AssetTasks, UncertaintyTasks
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -21,10 +21,12 @@ from src.display.utils import (
     ASSET_BENCHMARK_COLS,
     COLS,
     ASSET_COLS,
+    UNCERTAINTY_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
     AutoEvalColumnAsset,
+    AutoEvalColumnUncertainty,
     ModelType,
     fields,
     WeightType,
@@ -63,6 +65,8 @@ print(ASSET_COLS)
 
 ASSET_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ASSET_COLS, ASSET_BENCHMARK_COLS, AssetTasks)
 
+UNCERTAINTY_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, UNCERTAINTY_COLS, ASSET_BENCHMARK_COLS, UncertaintyTasks)
+
 
 (
     finished_eval_queue_df,
@@ -165,8 +169,11 @@ with demo:
 
         with gr.TabItem("🛠️ Asset Benchmark", elem_id="llm-benchmark-asset-tab-table", id=1):
            leaderboard = init_leaderboard(ASSET_LEADERBOARD_DF, AutoEvalColumnAsset)
+
+        with gr.TabItem("😵‍💫 Uncertainty Benchmark", elem_id="llm-benchmark-asset-tab-table", id=2):
+            leaderboard = init_leaderboard(UNCERTAINTY_LEADERBOARD_DF, AutoEvalColumnUncertainty)
 
-        with gr.TabItem("π Performance Plot", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("π Performance Plot", elem_id="llm-benchmark-tab-table", id=3):
            print(LEADERBOARD_DF.columns)
            # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
            perf_plot = gr.components.Plot(
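Note: the new UNCERTAINTY_LEADERBOARD_DF is built with ASSET_BENCHMARK_COLS rather than a benchmark-column list derived from UncertaintyTasks. If the asset and uncertainty tabs are meant to stay independent, a dedicated list along the lines below would avoid the reuse; UNCERTAINTY_BENCHMARK_COLS is an assumed name that this commit does not define, so treat this as a sketch, not part of the change.

    # Hypothetical sketch (not in this commit): a dedicated benchmark-column list
    # for the uncertainty tab, mirroring how src/display/utils.py builds
    # ASSET_BENCHMARK_COLS from AssetTasks.
    UNCERTAINTY_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]

    UNCERTAINTY_LEADERBOARD_DF = get_leaderboard_df(
        EVAL_RESULTS_PATH,
        EVAL_REQUESTS_PATH,
        UNCERTAINTY_COLS,
        UNCERTAINTY_BENCHMARK_COLS,  # instead of reusing ASSET_BENCHMARK_COLS
        UncertaintyTasks,
    )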
src/about.py
CHANGED
@@ -17,7 +17,6 @@ class Tasks(Enum):
     task2 = Task("acc_el", "acc_el", "Acc_El")
     task3 = Task("acc_perturb", "perturb_score", "Acc_Perturb")
     task4 = Task("score_consistency", "consist_score", "Consistency_Score")
-    task5 = Task("uncertainty", "uncertainty_score", "Uncertainty_Score")
 
 class AssetTasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
@@ -32,6 +31,11 @@ class AssetTasks(Enum):
     task8 = Task("acc_fan", "acc_fan", "acc_fan")
     task9 = Task("acc_power_transformer", "acc_power_transformer", "acc_power_transformer")
 
+class UncertaintyTasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("fmsr_ss", "fmsr_ss", "fmsr_ss")
+    task1 = Task("fmsr_coverage_rate", "fmsr_coverage_rate", "fmsr_coverage_rate")
+    task2 = Task("fmsr_uacc", "fmsr_uacc", "fmsr_uacc")
 
 # {
 # "acc_overall": {
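For reference, a minimal sketch of how the new enum is consumed elsewhere in this commit: each entry wraps Task(task_key, metric_key, display_name), per the comment in src/about.py, and src/display/utils.py collects display column names by iterating the enum. The expected list below follows directly from the values added above.

    # Minimal sketch: collecting display column names from UncertaintyTasks,
    # the same pattern src/display/utils.py uses for Tasks and AssetTasks.
    from src.about import UncertaintyTasks

    uncertainty_col_names = [t.value.col_name for t in UncertaintyTasks]
    # expected: ["fmsr_ss", "fmsr_coverage_rate", "fmsr_uacc"]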
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, AssetTasks
+from src.about import Tasks, AssetTasks, UncertaintyTasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -50,6 +50,9 @@ auto_eval_column_asset_dict = get_auto_eval_column_dict(AssetTasks)
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumnAsset = make_dataclass("AutoEvalColumnAsset", auto_eval_column_asset_dict, frozen=True)
 
+auto_eval_column_uncertainty_dict = get_auto_eval_column_dict(UncertaintyTasks)
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumnUncertainty = make_dataclass("AutoEvalColumnUncertainty", auto_eval_column_uncertainty_dict, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -111,10 +114,13 @@ class Precision(Enum):
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 ASSET_COLS = [c.name for c in fields(AutoEvalColumnAsset) if not c.hidden]
+UNCERTAINTY_COLS = [c.name for c in fields(AutoEvalColumnUncertainty) if not c.hidden]
+
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 ASSET_BENCHMARK_COLS = [t.value.col_name for t in AssetTasks]
+ASSET_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]
 
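Note: the last added line rebinds ASSET_BENCHMARK_COLS, so after this change the asset benchmark columns are silently replaced by the uncertainty task columns. If that overwrite is unintended, a separately named list keeps both sets; the name below is an assumption, not something this commit introduces.

    BENCHMARK_COLS = [t.value.col_name for t in Tasks]
    ASSET_BENCHMARK_COLS = [t.value.col_name for t in AssetTasks]
    # Assumed name, not defined in this commit:
    UNCERTAINTY_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]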
src/leaderboard/read_evals.py
CHANGED
@@ -114,7 +114,7 @@ class EvalResult:
     def to_dict(self, task_class):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         #ignore uncertainty for overall calculation
-        scores = [v for k, v in zip(self.results.keys(), self.results.values()) if v is not None
+        scores = [v for k, v in zip(self.results.keys(), self.results.values()) if v is not None]
        average = sum(scores) / len(scores)
        # average = sum([v for v in self.results.values() if v is not None]) / len(task_class)
        data_dict = {
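As rendered above, the only change is the closing bracket on the list comprehension. Since k is never used in the committed line, zipping self.results.keys() with self.results.values() is just a longer spelling of iterating the values; a tighter, equivalent sketch is shown below. The zero-guard on the denominator is an added assumption to avoid a ZeroDivisionError when every result is None, not part of the commit.

    # Equivalent sketch of the committed line, plus an optional guard:
    scores = [v for v in self.results.values() if v is not None]
    average = sum(scores) / len(scores) if scores else 0.0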