[email protected] committed on
Commit fa0f3d4 · 1 Parent(s): 2c9911b
app.py CHANGED
@@ -5,7 +5,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 import plotly.graph_objects as go
 import plotly.express as px
-from src.about import Tasks, AssetTasks
+from src.about import Tasks, AssetTasks, UncertaintyTasks
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -21,10 +21,12 @@ from src.display.utils import (
     ASSET_BENCHMARK_COLS,
     COLS,
     ASSET_COLS,
+    UNCERTAINTY_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
     AutoEvalColumnAsset,
+    AutoEvalColumnUncertainty,
     ModelType,
     fields,
     WeightType,
@@ -63,6 +65,8 @@ print(ASSET_COLS)
 
 ASSET_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ASSET_COLS, ASSET_BENCHMARK_COLS, AssetTasks)
 
+UNCERTAINTY_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, UNCERTAINTY_COLS, ASSET_BENCHMARK_COLS, UncertaintyTasks)
+
 
 (
     finished_eval_queue_df,
@@ -165,8 +169,11 @@ with demo:
 
         with gr.TabItem("🛠️ Asset Benchmark", elem_id="llm-benchmark-asset-tab-table", id=1):
             leaderboard = init_leaderboard(ASSET_LEADERBOARD_DF, AutoEvalColumnAsset)
+
+        with gr.TabItem("😵‍💫 Uncertainty Benchmark", elem_id="llm-benchmark-asset-tab-table", id=2):
+            leaderboard = init_leaderboard(UNCERTAINTY_LEADERBOARD_DF, AutoEvalColumnUncertainty)
 
-        with gr.TabItem("📊 Performance Plot", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("📊 Performance Plot", elem_id="llm-benchmark-tab-table", id=3):
             print(LEADERBOARD_DF.columns)
             # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             perf_plot = gr.components.Plot(
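
Two details in this last hunk are worth noting: the new Uncertainty tab reuses the Asset tab's elem_id ("llm-benchmark-asset-tab-table") and rebinds the same `leaderboard` variable, and `UNCERTAINTY_LEADERBOARD_DF` is built from `ASSET_BENCHMARK_COLS` rather than from a benchmark-column list derived from `UncertaintyTasks`. A minimal sketch of the presumably intended wiring, reusing app.py's existing names (`gr`, `init_leaderboard`, `get_leaderboard_df`, the eval paths) and the surrounding `with demo:` tabs block; `UNCERTAINTY_BENCHMARK_COLS`, the new elem_id, and `uncertainty_leaderboard` are hypothetical names, not part of this commit:

# Sketch only; assumes app.py's existing imports, helpers, and `with demo:` context.
# UNCERTAINTY_BENCHMARK_COLS, the elem_id, and uncertainty_leaderboard are hypothetical.
UNCERTAINTY_LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH,
    UNCERTAINTY_COLS, UNCERTAINTY_BENCHMARK_COLS, UncertaintyTasks,
)

with gr.TabItem("😵‍💫 Uncertainty Benchmark", elem_id="llm-benchmark-uncertainty-tab-table", id=2):
    uncertainty_leaderboard = init_leaderboard(UNCERTAINTY_LEADERBOARD_DF, AutoEvalColumnUncertainty)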
src/about.py CHANGED
@@ -17,7 +17,6 @@ class Tasks(Enum):
     task2 = Task("acc_el", "acc_el", "Acc_El")
     task3 = Task("acc_perturb", "perturb_score", "Acc_Perturb")
     task4 = Task("score_consistency", "consist_score", "Consistency_Score")
-    task5 = Task("uncertainty", "uncertainty_score", "Uncertainty_Score")
 
 class AssetTasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
@@ -32,6 +31,11 @@ class AssetTasks(Enum):
     task8 = Task("acc_fan", "acc_fan", "acc_fan")
     task9 = Task("acc_power_transformer", "acc_power_transformer", "acc_power_transformer")
 
+class UncertaintyTasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("fmsr_ss", "fmsr_ss", "fmsr_ss")
+    task1 = Task("fmsr_coverage_rate", "fmsr_coverage_rate", "fmsr_coverage_rate")
+    task2 = Task("fmsr_uacc", "fmsr_uacc", "fmsr_uacc")
 
 # {
 #     "acc_overall": {
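
Each member is a `Task(task_key, metric_key, col_name)` triple, so the uncertainty board will look up `fmsr_ss`, `fmsr_coverage_rate`, and `fmsr_uacc` in the eval-result JSON. A self-contained sketch of that lookup under the `{task_key: {metric_key: value}}` layout suggested by the commented-out "acc_overall" snippet; the `Task` stand-in, its field names, and the example values are illustrative only, not taken from this repository:

from dataclasses import dataclass

# Stand-in mirroring the (task_key, metric_key, col_name) triple described in the
# comment above; the real src.about.Task field names may differ.
@dataclass(frozen=True)
class Task:
    task_key: str
    metric_key: str
    col_name: str

UNCERTAINTY_TASKS = [
    Task("fmsr_ss", "fmsr_ss", "fmsr_ss"),
    Task("fmsr_coverage_rate", "fmsr_coverage_rate", "fmsr_coverage_rate"),
    Task("fmsr_uacc", "fmsr_uacc", "fmsr_uacc"),
]

# Assumed result layout, following the commented-out "acc_overall" example:
# {task_key: {metric_key: value}}. Values are placeholders.
example_results = {
    "fmsr_ss": {"fmsr_ss": 0.42},
    "fmsr_coverage_rate": {"fmsr_coverage_rate": 0.87},
    "fmsr_uacc": {"fmsr_uacc": 0.51},
}

row = {t.col_name: example_results[t.task_key][t.metric_key] for t in UNCERTAINTY_TASKS}
print(row)  # {'fmsr_ss': 0.42, 'fmsr_coverage_rate': 0.87, 'fmsr_uacc': 0.51}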
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, AssetTasks
+from src.about import Tasks, AssetTasks, UncertaintyTasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -50,6 +50,9 @@ auto_eval_column_asset_dict = get_auto_eval_column_dict(AssetTasks)
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumnAsset = make_dataclass("AutoEvalColumnAsset", auto_eval_column_asset_dict, frozen=True)
 
+auto_eval_column_uncertainty_dict = get_auto_eval_column_dict(UncertaintyTasks)
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumnUncertainty = make_dataclass("AutoEvalColumnUncertainty", auto_eval_column_uncertainty_dict, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -111,10 +114,13 @@ class Precision(Enum):
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 ASSET_COLS = [c.name for c in fields(AutoEvalColumnAsset) if not c.hidden]
+UNCERTAINTY_COLS = [c.name for c in fields(AutoEvalColumnUncertainty) if not c.hidden]
+
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 ASSET_BENCHMARK_COLS = [t.value.col_name for t in AssetTasks]
+ASSET_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]
 
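
The last added line rebinds `ASSET_BENCHMARK_COLS` to the `UncertaintyTasks` column names, overwriting the asset list computed on the line above; app.py imports that shadowed value and passes it to both the asset and the uncertainty `get_leaderboard_df` calls. The presumably intended shape, following the existing pattern in this file, is a separate constant; `UNCERTAINTY_BENCHMARK_COLS` below is a hypothetical name that the commit does not define:

# Sketch of the presumably intended constants, following the existing pattern in
# src/display/utils.py; UNCERTAINTY_BENCHMARK_COLS is a hypothetical name.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
ASSET_BENCHMARK_COLS = [t.value.col_name for t in AssetTasks]
UNCERTAINTY_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]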
 
src/leaderboard/read_evals.py CHANGED
@@ -114,7 +114,7 @@ class EvalResult:
     def to_dict(self, task_class):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         #ignore uncertainty for overall calculation
-        scores = [v for k, v in zip(self.results.keys(), self.results.values()) if v is not None and k != 'uncertainty']
+        scores = [v for k, v in zip(self.results.keys(), self.results.values()) if v is not None]
         average = sum(scores) / len(scores)
         # average = sum([v for v in self.results.values() if v is not None]) / len(task_class)
         data_dict = {
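
With the `k != 'uncertainty'` filter removed, the "#ignore uncertainty for overall calculation" comment above it is now stale, and `zip(self.results.keys(), self.results.values())` is equivalent to `self.results.items()`. The bare `sum(scores) / len(scores)` also raises `ZeroDivisionError` when a result has no non-None scores. A minimal, self-contained sketch of the same averaging with a guard; `average_score` is an illustrative helper, not part of this commit:

from typing import Optional

# Illustrative helper mirroring the averaging step in EvalResult.to_dict, with a
# guard for the case where every metric value is None.
def average_score(results: dict) -> Optional[float]:
    scores = [v for v in results.values() if v is not None]
    return sum(scores) / len(scores) if scores else None

print(average_score({"acc_overall": 0.7, "acc_el": None}))  # 0.7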