Commit 73d1e6e · "update" · Parent: b2aaed0
cli/fix-requests-cli.py  CHANGED

@@ -41,7 +41,7 @@ for path in json_files:
         data["model_type"] = "fine-tuned"
         to_overwrite = True
 
-    is_instruction_tuned = 'nstruct' in model_id
+    is_instruction_tuned = ('nstruct' in model_id) or ('chat' in model_id)
     if is_instruction_tuned:
         data["model_type"] = "instruction-tuned"
         to_overwrite = True
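The only functional change is the widened heuristic: a model id containing 'chat' now also counts as instruction-tuned, alongside ids matching 'nstruct' (which catches both "Instruct" and "instruct"). A standalone sketch of the predicate (the function name is illustrative, not part of the repository):

def classify_model_type(model_id: str) -> str:
    # 'nstruct' matches both "Instruct" and "instruct"; the commit adds
    # chat-style checkpoints to the instruction-tuned bucket.
    if ('nstruct' in model_id) or ('chat' in model_id):
        return "instruction-tuned"
    return "fine-tuned"

assert classify_model_type("mistralai/Mistral-7B-Instruct-v0.1") == "instruction-tuned"
assert classify_model_type("meta-llama/Llama-2-7b-chat-hf") == "instruction-tuned"
assert classify_model_type("bigscience/bloom-560m") == "fine-tuned"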
cli/halueval-cli.py  CHANGED

@@ -7,6 +7,8 @@ from src.backend.manage_requests import get_eval_requests
 from src.backend.manage_requests import EvalRequest
 from src.backend.run_eval_suite import run_evaluation
 
+from src.backend.tasks.xsum.task import XSum
+
 from lm_eval.tasks import initialize_tasks, include_task_folder
 from lm_eval import tasks, evaluator, utils
 
@@ -15,7 +17,7 @@ from src.envs import QUEUE_REPO
 
 
 def main():
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
+    # snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
     PENDING_STATUS = "PENDING"
     RUNNING_STATUS = "RUNNING"
 
@@ -28,7 +30,10 @@ def main():
     eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
     eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
 
-
+    # my_task = Task("memo-trap", "acc", "memo-trap", 0)
+    my_task = Task("xsum", "rougeLsum", "XSum", 2)
+
+    TASKS_HARNESS = [my_task]
     # task_names = ['triviaqa']
     # TASKS_HARNESS = [task.value for task in Tasks]
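This change disables the queue snapshot download and pins the CLI to a single XSum run with 2 few-shot examples. A hedged sketch of how such a single-task run could be driven through lm-eval, using the same registration helpers the script imports (exact signatures vary between harness versions; the model and folder arguments below are assumptions):

from lm_eval import evaluator
from lm_eval.tasks import initialize_tasks, include_task_folder

# Register the custom task folder before resolving task names,
# then evaluate the one pinned task.
include_task_folder("src/backend/tasks")
initialize_tasks("INFO")

results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=bigscience/bloom-560m",
    tasks=["xsum"],
    num_fewshot=2,   # mirrors Task("xsum", "rougeLsum", "XSum", 2)
    batch_size=1,
)
print(results["results"]["xsum"])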
src/backend/tasks/xsum/xsum.yaml.bak → snippets/xsum.yaml  RENAMED

File without changes
src/backend/envs.py  CHANGED

@@ -23,12 +23,18 @@ class Tasks(Enum):
     task0 = Task("nq_open", "em", "NQ Open", 64) # 64, as in the ATLAS paper
     task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64, as in the ATLAS paper
     # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
+
     # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
     task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
     task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)
+
     task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
+    # task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
+    # task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)
+
     # task6 = Task("xsum", "rougeL_acc", "XSum", 8)
-
+
+    task8 = Task("memo-trap", "acc", "memo-trap", 0)
 
     # NUM_FEWSHOT = 64 # Change with your few shot
 
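The Task entries above follow a (benchmark, metric, col_name, num_fewshot) pattern. A minimal definition consistent with that usage might look like the sketch below; the field names are assumptions, not taken from this commit:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str    # lm-eval task name, e.g. "halueval_qa"
    metric: str       # metric key to report, e.g. "acc" or "rougeLsum"
    col_name: str     # leaderboard column label, e.g. "HaluEval QA"
    num_fewshot: int  # few-shot examples used at evaluation time

class Tasks(Enum):
    task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
    task8 = Task("memo-trap", "acc", "memo-trap", 0)

# Iterating Tasks yields enum members; .value recovers each Task, which is
# what the commented-out `TASKS_HARNESS = [task.value for task in Tasks]`
# in cli/halueval-cli.py relies on.
for task in Tasks:
    print(task.value.benchmark, task.value.num_fewshot)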