Spaces:
Runtime error
Runtime error
Commit
·
95cc038
1
Parent(s):
62679c8
update
Browse files
- cli/averitec-upload-cli.py +12 -0
- cli/halueval-cli.py +11 -2
- cli/submit-cli.py +7 -2
cli/averitec-upload-cli.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
|
| 5 |
+
path = 'pminervini/averitec'
|
| 6 |
+
|
| 7 |
+
ds = load_dataset("json",
|
| 8 |
+
data_files={
|
| 9 |
+
'train': '/Users/pasquale/workspace/AVeriTeC/data/train.json',
|
| 10 |
+
'dev': '/Users/pasquale/workspace/AVeriTeC/data/dev.json'
|
| 11 |
+
})
|
| 12 |
+
ds.push_to_hub(path)
|
cli/halueval-cli.py
CHANGED
|
@@ -33,7 +33,13 @@ def main():
|
|
| 33 |
eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
|
| 34 |
|
| 35 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
| 36 |
-
my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
TASKS_HARNESS = [my_task]
|
| 39 |
# task_names = ['triviaqa']
|
|
@@ -48,7 +54,10 @@ def main():
|
|
| 48 |
|
| 49 |
for task in TASKS_HARNESS:
|
| 50 |
print(f"Selected Tasks: [{task}]")
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
batch_size=1, device="mps", use_cache=None, limit=10, write_out=True)
|
| 53 |
print('AAA', results["results"])
|
| 54 |
|
|
|
|
| 33 |
eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
|
| 34 |
|
| 35 |
# my_task = Task("memo-trap", "acc", "memo-trap", 0)
|
| 36 |
+
# my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
|
| 37 |
+
# my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
|
| 38 |
+
my_task = Task("fever10", "acc", "FEVER", 5)
|
| 39 |
+
|
| 40 |
+
eval_logger = utils.eval_logger
|
| 41 |
+
import logging
|
| 42 |
+
eval_logger.setLevel(getattr(logging, "DEBUG"))
|
| 43 |
|
| 44 |
TASKS_HARNESS = [my_task]
|
| 45 |
# task_names = ['triviaqa']
|
|
|
|
| 54 |
|
| 55 |
for task in TASKS_HARNESS:
|
| 56 |
print(f"Selected Tasks: [{task}]")
|
| 57 |
+
import torch
|
| 58 |
+
|
| 59 |
+
# breakpoint()
|
| 60 |
+
results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
|
| 61 |
batch_size=1, device="mps", use_cache=None, limit=10, write_out=True)
|
| 62 |
print('AAA', results["results"])
|
| 63 |
|
cli/submit-cli.py
CHANGED
|
@@ -120,7 +120,10 @@ def main():
|
|
| 120 |
model_lst = [m for m in model_lst]
|
| 121 |
|
| 122 |
def custom_filter(m) -> bool:
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
|
| 126 |
|
|
@@ -138,6 +141,8 @@ def main():
|
|
| 138 |
|
| 139 |
requested_model_names = {e.model for e in eval_requests}
|
| 140 |
|
|
|
|
|
|
|
| 141 |
for i in range(min(200, len(filtered_model_lst))):
|
| 142 |
model = filtered_model_lst[i]
|
| 143 |
|
|
@@ -157,7 +162,7 @@ def main():
|
|
| 157 |
|
| 158 |
if 'mage' not in model.id:
|
| 159 |
add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
|
| 160 |
-
time.sleep(
|
| 161 |
else:
|
| 162 |
print(f'Model {model.id} already added, not adding it to the queue again.')
|
| 163 |
|
|
|
|
| 120 |
model_lst = [m for m in model_lst]
|
| 121 |
|
| 122 |
def custom_filter(m) -> bool:
|
| 123 |
+
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
|
| 124 |
+
# res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
|
| 125 |
+
res = 'mistralai/' in m.id
|
| 126 |
+
return res
|
| 127 |
|
| 128 |
filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
|
| 129 |
|
|
|
|
| 141 |
|
| 142 |
requested_model_names = {e.model for e in eval_requests}
|
| 143 |
|
| 144 |
+
breakpoint()
|
| 145 |
+
|
| 146 |
for i in range(min(200, len(filtered_model_lst))):
|
| 147 |
model = filtered_model_lst[i]
|
| 148 |
|
|
|
|
| 162 |
|
| 163 |
if 'mage' not in model.id:
|
| 164 |
add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
|
| 165 |
+
time.sleep(10)
|
| 166 |
else:
|
| 167 |
print(f'Model {model.id} already added, not adding it to the queue again.')
|
| 168 |
|