[email protected] committed
Commit 9592714 · 1 Parent(s): fa0f3d4

uncertainty benchmark

Files changed (3)
  1. app.py +5 -2
  2. src/display/utils.py +2 -2
  3. src/leaderboard/read_evals.py +2 -2
app.py CHANGED
@@ -19,6 +19,7 @@ from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     ASSET_BENCHMARK_COLS,
+    UNCERTAINTY_BENCHMARK_COLS,
     COLS,
     ASSET_COLS,
     UNCERTAINTY_COLS,
@@ -65,8 +66,10 @@ print(ASSET_COLS)
 
 ASSET_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ASSET_COLS, ASSET_BENCHMARK_COLS, AssetTasks)
 
-UNCERTAINTY_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, UNCERTAINTY_COLS, ASSET_BENCHMARK_COLS, UncertaintyTasks)
-
+UNCERTAINTY_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, UNCERTAINTY_COLS, UNCERTAINTY_BENCHMARK_COLS, UncertaintyTasks)
+missing_uncertainties = (UNCERTAINTY_LEADERBOARD_DF[UNCERTAINTY_BENCHMARK_COLS] == 0).all(axis=1)
+UNCERTAINTY_LEADERBOARD_DF = UNCERTAINTY_LEADERBOARD_DF[~missing_uncertainties]
+UNCERTAINTY_LEADERBOARD_DF = UNCERTAINTY_LEADERBOARD_DF.loc[:,~UNCERTAINTY_LEADERBOARD_DF.columns.duplicated()]
 
 (
     finished_eval_queue_df,
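Note on the app.py change: the added lines build the uncertainty leaderboard from the uncertainty benchmark columns (rather than the asset ones), drop models whose uncertainty columns are all 0, and discard duplicated column labels. A minimal sketch of the same pandas pattern on a toy frame; the column and model names here are illustrative, not the leaderboard's real ones:

import pandas as pd

# Toy stand-in for UNCERTAINTY_LEADERBOARD_DF (hypothetical columns and models).
UNCERTAINTY_BENCHMARK_COLS = ["unc_bench_a", "unc_bench_b"]
df = pd.DataFrame(
    {
        "model": ["m1", "m2", "m3"],
        "unc_bench_a": [0.71, 0.0, 0.55],
        "unc_bench_b": [0.64, 0.0, 0.0],
    }
)

# A row counts as missing only if every uncertainty column is exactly 0.
missing_uncertainties = (df[UNCERTAINTY_BENCHMARK_COLS] == 0).all(axis=1)
df = df[~missing_uncertainties]  # drops m2; m3 keeps its partial results

# Keep only the first occurrence of any duplicated column label.
df = df.loc[:, ~df.columns.duplicated()]
print(df)

One caveat of this heuristic: a model that genuinely scores 0 on every uncertainty benchmark would be dropped as well.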
src/display/utils.py CHANGED
@@ -53,7 +53,7 @@ AutoEvalColumnAsset = make_dataclass("AutoEvalColumnAsset", auto_eval_column_ass
 auto_eval_column_uncertainty_dict = get_auto_eval_column_dict(UncertaintyTasks)
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumnUncertainty = make_dataclass("AutoEvalColumnUncertainty", auto_eval_column_uncertainty_dict, frozen=True)
-
+AutoEvalColumnUncertainty.average.name = 'fmsr_uacc'
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -122,5 +122,5 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 ASSET_BENCHMARK_COLS = [t.value.col_name for t in AssetTasks]
-ASSET_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]
+UNCERTAINTY_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]
 
src/leaderboard/read_evals.py CHANGED
@@ -135,8 +135,8 @@ class EvalResult:
 
         for task in task_class:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
-            if task.value.col_name == 'Uncertainty_Score' and self.results[task.value.benchmark] == 0:
-                data_dict[task.value.col_name] = None
+            # if task.value.col_name == 'Uncertainty_Score' and self.results[task.value.benchmark] == 0:
+            #     data_dict[task.value.col_name] = None
         return data_dict
 
 
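Note on the read_evals.py change: the per-result masking that turned a 0 'Uncertainty_Score' into None is now commented out; the raw 0 stays in data_dict, and all-zero rows are filtered at the DataFrame level in app.py instead. Roughly what the removed lines did, shown in isolation with a hypothetical results dict:

# Hypothetical per-model results, keyed by benchmark name (illustrative keys).
results = {"uncertainty_bench": 0.0, "asset_bench": 0.62}

data_dict = {}
for benchmark, score in results.items():
    data_dict[benchmark] = score
    # Old behaviour (now commented out in the repo): a 0 uncertainty score was
    # treated as missing and replaced with None for that single entry.
    if benchmark == "uncertainty_bench" and score == 0:
        data_dict[benchmark] = None

print(data_dict)  # {'uncertainty_bench': None, 'asset_bench': 0.62}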