Commit · 5ca644e
Parent(s): 1e82ba8
update

Files changed:
- cli/analysis-cli.py +31 -29
- cli/eval-cli.py +3 -2
- cli/fever-upload-cli.py +73 -0
- src/backend/envs.py +1 -0
- src/backend/tasks/fever/fever11.yaml +17 -0
cli/analysis-cli.py
CHANGED
@@ -111,8 +111,6 @@ if data_map is None:
         for dataset_name, results_dict in data["results"].items():
             for metric_name, value in results_dict.items():
 
-                # print(model_name, dataset_name, metric_name, value)
-
                 if ',' in metric_name and '_stderr' not in metric_name \
                         and 'f1' not in metric_name \
                         and model_name_to_model_map[model_name]["likes"] > 128:
@@ -160,9 +158,8 @@ if data_map is None:
                     if 'fever' in dataset_name:
                         to_add = False
 
-                    if 'xsum' in dataset_name:
-
-                        pass
+                    if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' in dataset_name:
+                        to_add = False
 
                     if 'rouge' in metric_name:
                         value /= 100.0
@@ -186,8 +183,10 @@ if data_map is None:
 
     model_name_lst = [m for m in data_map.keys()]
 
+    nb_max_metrics = max(len(data_map[model_name]) for model_name in model_name_lst)
+
    for model_name in model_name_lst:
-        if len(data_map[model_name]) <
+        if len(data_map[model_name]) < nb_max_metrics - 5:
             del data_map[model_name]
 
 plot_type_lst = ['all', 'summ', 'qa', 'instr', 'detect', 'rc']
@@ -293,27 +292,30 @@ for plot_type in plot_type_lst:
 
     print('figsize', (fig_width, fig_height))
 
-    print(f'Generating clustermap for {plot_type}')
-
-    # fig = sns.clustermap(df, method='average', metric='cosine', cmap='coolwarm', figsize=(16, 12), annot=True)
-    fig = sns.clustermap(df,
-                         method='ward',
-                         metric='euclidean',
-                         cmap='coolwarm',
-                         figsize=(fig_width, fig_height),  # figsize=(24, 16),
-                         annot=True,
-                         mask=o_df.isnull(),
-                         dendrogram_ratio=dendrogram_ratio,
-                         fmt='.2f',
-                         col_cluster=col_cluster,
-                         row_cluster=row_cluster)
-
-    # Adjust the size of the cells (less wide)
-    plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
-    plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
-
-    # Save the clustermap to file
-    fig.savefig(f'plots/clustermap_{plot_type}.pdf')
-    fig.savefig(f'plots/clustermap_{plot_type}.png')
-
     o_df.to_json(f'plots/clustermap_{plot_type}.json', orient='split')
+
+    print(f'Generating the clustermaps for {plot_type}')
+
+    for cmap in [None, 'coolwarm', 'viridis']:
+        fig = sns.clustermap(df,
+                             method='ward',
+                             metric='euclidean',
+                             cmap=cmap,
+                             figsize=(fig_width, fig_height),  # figsize=(24, 16),
+                             annot=True,
+                             mask=o_df.isnull(),
+                             dendrogram_ratio=dendrogram_ratio,
+                             fmt='.2f',
+                             col_cluster=col_cluster,
+                             row_cluster=row_cluster)
+
+        # Adjust the size of the cells (less wide)
+        plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
+        plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
+
+        cmap_suffix = '' if cmap is None else f'_{cmap}'
+
+        # Save the clustermap to file
+        fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.pdf')
+        fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.png')
+        fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
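For context on the plotting change above: the old code rendered a single 'coolwarm' clustermap, while the new loop renders one clustermap per colormap and keys each output filename on a suffix. A minimal self-contained sketch of that pattern (the toy DataFrame and output names are placeholders, not from the repo):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Toy score matrix standing in for the real model-by-task DataFrame.
df = pd.DataFrame({'taskA': [0.1, 0.4, 0.9], 'taskB': [0.8, 0.2, 0.5]},
                  index=['model1', 'model2', 'model3'])

for cmap in [None, 'coolwarm', 'viridis']:
    fig = sns.clustermap(df, method='ward', metric='euclidean',
                         cmap=cmap, annot=True, fmt='.2f')
    plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
    plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)

    # An empty suffix keeps the default-colormap filenames unchanged.
    cmap_suffix = '' if cmap is None else f'_{cmap}'
    fig.savefig(f'clustermap_demo{cmap_suffix}.png')

Passing cmap=None lets seaborn fall back to its default colormap, which is why the empty suffix preserves the original filenames for that case.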
cli/eval-cli.py
CHANGED
@@ -36,7 +36,8 @@ def main():
     # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
     # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
     # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
-    my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
+    # my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
+    my_task = Task("fever11", "acc", "FEVER", 8)
 
     eval_logger = utils.eval_logger
     import logging
@@ -59,7 +60,7 @@ def main():
 
     # breakpoint()
     results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
-                                        batch_size=1, device="mps", use_cache=None, limit=
+                                        batch_size=1, device="mps", use_cache=None, limit=1000, write_out=True)
     print('AAA', results["results"])
 
     breakpoint()
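The edit above swaps the FaithDIAL smoke test for the new fever11 task and caps the run at 1000 instances. A hedged sketch of an equivalent standalone call through the lm-evaluation-harness API; the checkpoint name is a placeholder, and it assumes the custom fever11 YAML is registered on the harness's task path:

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder checkpoint, not from the repo
    tasks=["fever11"],
    num_fewshot=8,    # matches Task("fever11", "acc", "FEVER", 8)
    batch_size=1,
    limit=1000,       # same cap as in the CLI above
    write_out=True,   # dump rendered prompts for inspection
)
print(results["results"]["fever11"])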
cli/fever-upload-cli.py
ADDED
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+import glob
+import os
+
+import random
+from tqdm import tqdm
+
+from datasets import Dataset, DatasetDict, load_dataset
+
+
+def convert(list_of_dicts):
+    res = {}
+    for d in list_of_dicts:
+        for k, v in d.items():
+            res.setdefault(k, []).append(v)
+    return res
+
+
+v10 = load_dataset("fever", "v1.0")
+name_lst = ['train', 'labelled_dev']
+
+old_to_new_label_map = {
+    'SUPPORTS': 'supported',
+    'REFUTES': 'refuted'
+}
+
+data_map = {}
+
+for name in name_lst:
+    instance_lst = []
+
+    for entry in tqdm(v10[name]):
+        id_ = entry['id']
+        label = entry['label']
+        claim = entry['claim']
+
+        evidence_id = entry['evidence_id']
+        evidence_wiki_url = entry['evidence_wiki_url']
+
+        if evidence_id != -1:
+            assert label in {'SUPPORTS', 'REFUTES'}
+
+            instance = {'id': id_, 'label': old_to_new_label_map[label], 'claim': claim}
+            instance_lst.append(instance)
+
+    key = 'dev' if name in {'labelled_dev'} else name
+
+    instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d['claim'])
+
+    label_to_instance_lst = {}
+    for e in instance_lst:
+        if e['label'] not in label_to_instance_lst:
+            label_to_instance_lst[e['label']] = []
+        label_to_instance_lst[e['label']].append(e)
+
+    min_len = min(len(v) for k, v in label_to_instance_lst.items())
+
+    new_instance_lst = []
+    for k in sorted(label_to_instance_lst.keys()):
+        new_instance_lst += label_to_instance_lst[k][:min_len]
+
+    random.Random(42).shuffle(new_instance_lst)
+    data_map[key] = new_instance_lst
+
+ds_path = 'pminervini/hl-fever'
+
+task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
+ds_dict = DatasetDict(task_to_ds_map)
+
+ds_dict.push_to_hub(ds_path, "v1.0")
+
+# breakpoint()
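Since the script deduplicates, balances the two labels by downsampling to the smaller class, and shuffles with a fixed seed before pushing, a quick sanity check is to reload the published dataset and count labels per split. A small sketch assuming the push above succeeded (split names 'train' and 'dev' follow the script):

from collections import Counter
from datasets import load_dataset

ds = load_dataset("pminervini/hl-fever", "v1.0")

for split, data in ds.items():
    # The downsampling loop should leave 'supported' and 'refuted' balanced.
    print(split, Counter(data["label"]))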
src/backend/envs.py
CHANGED
@@ -46,6 +46,7 @@ class Tasks(Enum):
     task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
 
     # task15 = Task("fever10", "acc", "FEVER", 16)
+    task15_1 = Task("fever11", "acc", "FEVER", 8)
 
     task16 = Task("squadv2", "exact", "SQuADv2", 4)
 
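The Task record's field order here is inferred from how it is constructed throughout the diff (benchmark id, metric, display name, few-shot count) rather than from documented code; a tiny sketch of that reading:

from collections import namedtuple

# Hypothetical stand-in for the repo's Task type; field names are guesses
# based on usage like Task("fever11", "acc", "FEVER", 8).
Task = namedtuple("Task", ["benchmark", "metric", "col_name", "num_fewshot"])

task15_1 = Task("fever11", "acc", "FEVER", 8)
print(task15_1.benchmark, task15_1.num_fewshot)  # fever11 8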
src/backend/tasks/fever/fever11.yaml
ADDED
@@ -0,0 +1,17 @@
+group: fever
+task: fever11
+dataset_path: pminervini/hl-fever
+dataset_name: v1.0
+output_type: multiple_choice
+training_split: train
+validation_split: dev
+test_split: null
+doc_to_text: "Claim: {{claim}}\nLabel:"
+doc_to_choice: ["supported", "refuted"]
+doc_to_target: label
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
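To see what this config does at prompt time: under output_type: multiple_choice, lm-eval fills doc_to_text with the claim and scores each doc_to_choice string as a continuation, then takes accuracy against doc_to_target. A rough rendering sketch (the claim is invented, and the single-space joiner assumes the harness's default target delimiter):

# Hypothetical FEVER-style record, just for illustration.
doc = {"claim": "The Eiffel Tower is located in Berlin.", "label": "refuted"}

prompt = f"Claim: {doc['claim']}\nLabel:"   # doc_to_text
choices = ["supported", "refuted"]          # doc_to_choice

for choice in choices:
    print(repr(prompt + " " + choice))      # one scored continuation per choice
# acc: the higher-likelihood choice is compared against doc["label"].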