Spaces:

hallucinations-leaderboard
/

leaderboard

Runtime error

App Files Files Community

pingnie committed on Feb 24, 2024

Commit

fdb7c69

1 Parent(s): b58e6fa

change model type and sync with open llm leaderboard on model type

Browse files

Files changed (6) hide show

app.py +12 -5
src/display/about.py +5 -4
src/display/utils.py +7 -7
src/envs.py +2 -0
src/leaderboard/read_evals.py +38 -3
src/populate.py +2 -1

app.py CHANGED Viewed

@@ -56,14 +56,21 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
-def init_space():
     dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
     if socket.gethostname() not in {'neuromancer'}:
-            ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
-            ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
-    raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
     return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+def init_space(update_model_type_with_open_llm=True):
     dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
     if socket.gethostname() not in {'neuromancer'}:
+        # download this leaderboard's own request queue and results datasets
+        ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+        ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+    # if EVAL_REQUESTS_PATH_OPEN_LLM == '' then model_type will not be updated from the open-llm-leaderboard
+    if update_model_type_with_open_llm:
+        from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
+        ui_snapshot_download(repo_id=QUEUE_REPO_OPEN_LLM, local_dir=EVAL_REQUESTS_PATH_OPEN_LLM, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+    else:
+        EVAL_REQUESTS_PATH_OPEN_LLM = ""
+    raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, EVAL_REQUESTS_PATH_OPEN_LLM, COLS, BENCHMARK_COLS)
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
     return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

src/display/about.py CHANGED Viewed

@@ -5,7 +5,7 @@ TITLE = """<h1 align="center" id="space-title">Hallucinations Leaderboard</h1>""
 INTRODUCTION_TEXT = """
 📐 The Hallucinations Leaderboard aims to track, rank and evaluate hallucinations in LLMs.
-It evaluates the propensity for hallucination in Large Language Models (LLMs) across a diverse array of tasks, including Closed-book Open-domain QA, Summarization, Reading Comprehension, Instruction Following, Fact-Checking, Hallucination Detection, and Self-Consistency. The evaluation encompasses a wide range of datasets such as NQ Open, TriviaQA, TruthfulQA, XSum, CNN/DM, RACE, SQuADv2, MemoTrap, IFEval, FEVER, FaithDial, True-False, HaluEval, and SelfCheckGPT, offering a comprehensive assessment of each model's performance in generating accurate and contextually relevant content.
 A more detailed explanation of the definition of hallucination and the leaderboard's motivation, tasks and dataset can be found on the "About" page and [The Hallucinations Leaderboard blog post](https://huggingface.co/blog/leaderboards-on-the-hub-hallucinations).
@@ -74,7 +74,7 @@ To reproduce our results, here is the commands you can run, using [this script](
 Alternatively, if you're interested in evaluating a specific task with a particular model, you can use the [EleutherAI LLM Evaluation Harness library](https://github.com/EleutherAI/lm-evaluation-harness/) as follows:
 `python main.py --model=hf-auto --model_args="pretrained=<your_model>,revision=<your_model_revision>,parallelize=True"`
-` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
 Note that the Hallucinations Library includes several tasks definitions that are not included in the Harness library -- you can find them at [this link](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/tree/main/src/backend/tasks)).
@@ -108,8 +108,9 @@ For all these evaluations, a higher score is a better score.
 - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
 - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
 Specific fine-tune subcategories (more adapted to chat):
-- {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
-- {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
 If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
 """

 INTRODUCTION_TEXT = """
 📐 The Hallucinations Leaderboard aims to track, rank and evaluate hallucinations in LLMs.
+It evaluates the propensity for hallucination in Large Language Models (LLMs) across a diverse array of tasks, including Closed-book Open-domain QA, Summarization, Reading Comprehension, Instruction Following, Fact-Checking, Hallucination Detection, and Self-Consistency. The evaluation encompasses a wide range of datasets such as NQ Open, TriviaQA, TruthfulQA, XSum, CNN/DM, RACE, SQuADv2, MemoTrap, IFEval, FEVER, FaithDial, True-False, HaluEval, and SelfCheckGPT, offering a comprehensive assessment of each model's performance in generating accurate and contextually relevant content.
 A more detailed explanation of the definition of hallucination and the leaderboard's motivation, tasks and dataset can be found on the "About" page and [The Hallucinations Leaderboard blog post](https://huggingface.co/blog/leaderboards-on-the-hub-hallucinations).
 Alternatively, if you're interested in evaluating a specific task with a particular model, you can use the [EleutherAI LLM Evaluation Harness library](https://github.com/EleutherAI/lm-evaluation-harness/) as follows:
 `python main.py --model=hf-auto --model_args="pretrained=<your_model>,revision=<your_model_revision>,parallelize=True"`
+` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
 Note that the Hallucinations Library includes several tasks definitions that are not included in the Harness library -- you can find them at [this link](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/tree/main/src/backend/tasks)).
 - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
 - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
 Specific fine-tune subcategories (more adapted to chat):
+- {ModelType.chat.to_str(" : ")} model: chat models (RLHF, DPO, IFT, ...).
+- {ModelType.merges.to_str(" : ")} model: base merges and moerges.
+- {ModelType.Unknown.to_str(" : ")} model: Unknown model type
 If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
 """

src/display/utils.py CHANGED Viewed

@@ -106,9 +106,9 @@ class ModelDetails:
 class ModelType(Enum):
     PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")
     def to_str(self, separator=" "):
@@ -120,10 +120,10 @@ class ModelType(Enum):
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
         return ModelType.Unknown

 class ModelType(Enum):
     PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
+    chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="", symbol="?")
     def to_str(self, separator=" "):
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
+        if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
+            return ModelType.chat
+        if "merge" in type or "🤝" in type:
+            return ModelType.merges
         return ModelType.Unknown

src/envs.py CHANGED Viewed

@@ -9,6 +9,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
 REPO_ID = "hallucinations-leaderboard/leaderboard"
 QUEUE_REPO = "hallucinations-leaderboard/requests"
 RESULTS_REPO = "hallucinations-leaderboard/results"
 PRIVATE_QUEUE_REPO = "hallucinations-leaderboard/private-requests"
@@ -20,6 +21,7 @@ CACHE_PATH = os.getenv("HF_HOME", ".")
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"

 REPO_ID = "hallucinations-leaderboard/leaderboard"
 QUEUE_REPO = "hallucinations-leaderboard/requests"
+QUEUE_REPO_OPEN_LLM = "open-llm-leaderboard/requests"
 RESULTS_REPO = "hallucinations-leaderboard/results"
 PRIVATE_QUEUE_REPO = "hallucinations-leaderboard/private-requests"
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_OPEN_LLM = os.path.join(CACHE_PATH, "eval-queue-open-llm")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"

src/leaderboard/read_evals.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import glob
 import json
 import os
 from dataclasses import dataclass
 import dateutil
@@ -125,6 +126,18 @@ class EvalResult:
         except Exception as e:
             print(f"Could not find request file for {self.org}/{self.model} -- path: {requests_path} -- {e}")
     def is_complete(self) -> bool:
         for task in Tasks:
             if task.value.benchmark not in self.results:
@@ -180,8 +193,29 @@ def get_request_file_for_model(requests_path, model_name, precision):
                 request_file = tmp_request_file
     return request_file
-def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
@@ -200,11 +234,12 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
             model_result_filepaths.append(os.path.join(root, file))
     eval_results = {}
-    for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
         eval_result.update_with_request_file(requests_path)
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():

 import glob
 import json
 import os
+from tqdm import tqdm
 from dataclasses import dataclass
 import dateutil
         except Exception as e:
             print(f"Could not find request file for {self.org}/{self.model} -- path: {requests_path} -- {e}")
+    def update_model_type_with_open_llm_request_file(self, open_llm_requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model_open_llm(open_llm_requests_path, self.full_model, self.precision.value.name)
+        if request_file:
+            try:
+                with open(request_file, "r") as f:
+                    request = json.load(f)
+                self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
+            except Exception as e:
+                pass
     def is_complete(self) -> bool:
         for task in Tasks:
             if task.value.benchmark not in self.results:
                 request_file = tmp_request_file
     return request_file
+def get_request_file_for_model_open_llm(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+    request_files = os.path.join(
+        requests_path,
+        f"{model_name}_eval_request_*.json",
+    )
+    request_files = glob.glob(request_files)
+    # Select correct request file (precision)
+    request_file = ""
+    request_files = sorted(request_files, reverse=True)
+    for tmp_request_file in request_files:
+        with open(tmp_request_file, "r") as f:
+            req_content = json.load(f)
+            if (
+                req_content["status"] in ["FINISHED"]
+                and req_content["precision"] == precision.split(".")[-1]
+            ):
+                request_file = tmp_request_file
+    return request_file
+def get_raw_eval_results(results_path: str, requests_path: str, requests_path_open_llm: str, is_backend: bool = False) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
             model_result_filepaths.append(os.path.join(root, file))
     eval_results = {}
+    for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
         eval_result.update_with_request_file(requests_path)
+        if requests_path_open_llm:
+            eval_result.update_model_type_with_open_llm_request_file(requests_path_open_llm)
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():

src/populate.py CHANGED Viewed

@@ -15,11 +15,12 @@ from src.display.utils import Tasks
 def get_leaderboard_df(results_path: str,
                        requests_path: str,
                        cols: list,
                        benchmark_cols: list,
                        is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
     # Returns a list of EvalResult
-    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path)
     all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]

 def get_leaderboard_df(results_path: str,
                        requests_path: str,
+                       requests_path_open_llm: str,
                        cols: list,
                        benchmark_cols: list,
                        is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
     # Returns a list of EvalResult
+    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
     all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]