Spaces:

saeedfarzi
/

MediLingua_Leaderboard

Running

App Files Files Community

Sfarzi committed on Sep 15

Commit

2de47c9

1 Parent(s): a3d9c20

Initial clone with modifications

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.ipynb_checkpoints/Gen_llm_eval_output-checkpoint.py +0 -0
.ipynb_checkpoints/get_model_info-checkpoint.py +129 -0
.ipynb_checkpoints/preprocess_models_output-checkpoint.py +250 -0
Gen_llm_eval_output.py +117 -0
csv_files/llm_scores_p1.xlsx +0 -0
csv_files/llm_scores_p2.xlsx +0 -0
csv_files/llm_scores_p3.xlsx +0 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__0shot.txt +11 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__10shot.txt +10 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__0shot.txt +11 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__10shot.txt +10 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__0shot.txt +11 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__10shot.txt +10 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__0shot.txt +11 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__10shot.txt +10 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__0shot.txt +11 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__10shot.txt +10 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__0shot.txt +11 -0
csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__10shot.txt +10 -0
csv_files/outputs/HiTZ__Medical-mT5-large__en__0shot.txt +11 -0
csv_files/outputs/HiTZ__Medical-mT5-large__en__10shot.txt +10 -0
csv_files/outputs/HiTZ__Medical-mT5-large__gr__0shot.txt +11 -0
csv_files/outputs/HiTZ__Medical-mT5-large__gr__10shot.txt +10 -0
csv_files/outputs/HiTZ__Medical-mT5-large__it__0shot.txt +10 -0
csv_files/outputs/HiTZ__Medical-mT5-large__it__10shot.txt +10 -0
csv_files/outputs/HiTZ__Medical-mT5-large__pl__0shot.txt +11 -0
csv_files/outputs/HiTZ__Medical-mT5-large__pl__10shot.txt +10 -0
csv_files/outputs/HiTZ__Medical-mT5-large__sk__0shot.txt +11 -0
csv_files/outputs/HiTZ__Medical-mT5-large__sk__10shot.txt +10 -0
csv_files/outputs/HiTZ__Medical-mT5-large__sl__0shot.txt +11 -0
csv_files/outputs/HiTZ__Medical-mT5-large__sl__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt +11 -0
csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt +10 -0
csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt +11 -0

.ipynb_checkpoints/Gen_llm_eval_output-checkpoint.py ADDED Viewed

File without changes

.ipynb_checkpoints/get_model_info-checkpoint.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""
+MODEL METADATA EXTRACTOR
+This script processes model evaluation output files (input_folder) from the lm-eval-harness library,
+extracts model identifiers, retrieves detailed metadata from HuggingFace
+and saves the information as structured JSON files (output_folder).
+Input: Directory containing .out files from lm-eval-harness
+Output: Directory with JSON files containing model metadata
+"""
+# Example input file format (lm-eval-harness output):
+'''
+hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
+|         Tasks          |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
+|------------------------|------:|------|-----:|--------|---|-----:|---|------|
+|evalita-mp              |      1|none  |      |acc     |↑  |0.5605|±  |0.0052|
+...
+Job completed
+'''
+# Example output JSON format:
+'''
+{
+    "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
+    "base_model": "LlamaForCausalLM",
+    "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
+    "submitted_time": "2024-04-29 09:34:12+00:00",
+    "num_params_billion": 8.030261248,
+    "language": "en_it"
+}
+'''
+import os
+import re
+import json
+from huggingface_hub import HfApi
# Configures the Hugging Face token (if needed)
# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
api = HfApi()
# Directory paths
# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics.
#input_folder = "../evalita_llm_models_output/"
input_folder = "/home/sfarzi/leaderboard/evalita_llm_leaderboard/task_result/"
# output_folder: Directory where JSON files with model characteristics will be saved.
output_folder = "/home/sfarzi/leaderboard/evalita_llm_leaderboard/e3c_llm_requests/"
# Creates the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)
# Regular expression to find the model name
model_pattern = re.compile(r"pretrained=([\w\-./]+)")
# Scans files in the input folder
for filename in os.listdir(input_folder):
    if not filename.endswith('.out'):
        continue
    file_path = os.path.join(input_folder, filename)
    # Reads the file content
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # Extracts the model name; files without a "pretrained=" header are ignored
    match = model_pattern.search(content)
    if not match:
        continue
    model_name = match.group(1)
    print(f"Processing model: {model_name}")
    # Separates the model_name into two parts: directory name and file name
    if "/" in model_name:
        dir_name, file_name = model_name.split("/", 1)
    else:
        dir_name, file_name = model_name, model_name  # If no "/", use the same name
    model_output_folder = os.path.join(output_folder, dir_name)
    output_file = os.path.join(model_output_folder, f"{file_name}.json")
    # Check for an existing result BEFORE querying the Hub, so re-runs do not
    # spend a network request on models that are already done.
    if os.path.exists(output_file):
        print(f"File {output_file} already exists. Skipping...")
        continue
    try:
        # Retrieves model information from HuggingFace
        model_info = api.model_info(model_name)
        # Calculates the number of parameters in billions, if available
        num_params = None
        if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
            num_params = model_info.safetensors.parameters["BF16"] / 1e9  # Convert to billions
        # Extracts and concatenates languages. The card may declare a single
        # language as a plain string (joining it directly would interleave "_"
        # between its characters) or leave the field empty.
        languages = model_info.card_data.get("language") if model_info.card_data else None
        if isinstance(languages, str):
            languages = [languages]
        language = "_".join(languages or [])
        # First declared architecture, or "" when the config has none
        architectures = model_info.config.get("architectures") if model_info.config else None
        base_model = architectures[0] if architectures else ""
        # Builds the dictionary with required metadata
        model_data = {
            "model": model_name,
            "base_model": base_model,
            "revision": model_info.sha,
            # "precision": "bfloat16",  # If available, replace with real value
            # "weight_type": "Original",
            # "status": "FINISHED",
            "submitted_time": str(model_info.created_at),
            # "model_type": "pretrained",
            # "likes": model_info.likes,
            # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
            # "license": model_info.license,
            # "private": model_info.private,
            "num_params_billion": num_params,  # Number of parameters in billions
            "language": language,  # Extracted language
        }
        # Creates the folder for saving the produced json files
        os.makedirs(model_output_folder, exist_ok=True)
        # Saves the JSON file in the appropriate folder
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(model_data, f, indent=4)
        print(f"Saved metadata for {model_name} in {output_file}")
    except Exception as e:
        # Best effort: one failing model (network error, missing repo, ...)
        # must not stop the remaining files from being processed.
        print(f"Error retrieving info for {model_name}: {e}")
# Printed once, after every file has been handled
print("Process finished!")

.ipynb_checkpoints/preprocess_models_output-checkpoint.py ADDED Viewed

	@@ -0,0 +1,250 @@

+"""
+EVALITA LLM EVALUATION PROCESSOR
+Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
+DATA PIPELINE OVERVIEW:
+1. Inputs:
+   - Evaluation Results: Raw .out files from lm-eval-harness
+   - Model Metadata: Pre-collected .json files from HuggingFace
+2. Output:
+   - Comprehensive evaluation reports in JSON format
+   - Ready for ingestion into the evaluation leaderboard
+--------------------------------------------------------------------
+INPUT SPECIFICATION
+Evaluation Results (.out format):
+   hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
+   | Task          | Metric | Value  | Stderr |
+   |---------------|--------|--------|--------|
+   | main-task     | acc    | 0.5605 | 0.0052 |
+   | - sub-task    | acc    | 0.4640 | 0.0088 |
+   |   - prompt-1  | acc    | 0.3720 | 0.0216 |
+Model Metadata (.json format):
+   {
+     "model": "model-org/model-name",
+     "base_model": "ModelArchitecture",
+     "revision": "git_commit_hash",
+     "submitted_time": "2024-04-29 09:34:12+00:00",
+     "num_params_billion": 8.03,
+     "language": "en_it"
+   }
+--------------------------------------------------------------------
+OUTPUT SPECIFICATION
+Evaluation Report (.json format):
+   {
+     "summary_metrics": {
+       "average_CPS": 41.74,
+       "num_tasks": 12
+     },
+     "model_config": {
+       "identifier": "model-org/model-name",
+       "architecture": "ModelArchitecture",
+       "parameters": 8.03,
+       "evaluation_settings": {
+         "fewshot": 5,
+         "batch_size": 1
+       }
+     },
+     "task_results": {
+       "task-name": {
+         "average_score": 52.60,
+         "best_prompt": {
+           "id": "prompt-6",
+           "score": 66.57
+         },
+         "prompt_analysis": [
+           {
+             "prompt_id": "prompt-1",
+             "score": 37.20,
+             "stderr": 0.0216
+           }
+         ]
+       }
+     }
+   }
+"""
+import json
+import os
+import re
def safe_float(value):
    """Convert ``value`` to ``float``, returning ``None`` when that is not possible.

    Both unparsable text (``ValueError``) and unsupported types such as
    ``None`` (``TypeError``) yield ``None``.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return None
def calculate_task_metrics(task_info):
    """Fill in average accuracy, best prompt and CPS for one task, in place.

    ``task_info['prompts']`` is a list of dicts with at least the keys
    ``'prompt'`` and ``'value'``. Prompts whose value is ``None`` are ignored.
    When no prompt has a value, ``task_info`` is left untouched.

    On success the following keys are written:
      * ``average_accuracy`` - mean of the available prompt values
      * ``best_prompt``      - highest prompt value
      * ``prompt_id``        - name of the prompt that achieved it
      * ``CPS``              - ``(1 - (best - average) / 100) * best``

    Always returns ``None``.
    """
    # Only prompts with a score take part; comparing None with a float in
    # max() would raise a TypeError.
    scored_prompts = [prompt for prompt in task_info['prompts'] if prompt['value'] is not None]
    if not scored_prompts:
        return None
    accuracies = [prompt['value'] for prompt in scored_prompts]
    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
    best_prompt_data = max(scored_prompts, key=lambda prompt: prompt['value'])
    task_info['best_prompt'] = best_prompt_data['value']
    task_info['prompt_id'] = best_prompt_data['prompt']
    # Calculate CPS: the best score, discounted by its gap to the average
    avg_acc = task_info['average_accuracy']
    best_acc = task_info['best_prompt']
    task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
    return None
def extract_data_from_file(file_path):
    """Parse one lm-eval-harness style ``.out`` file into tasks, prompts and scores.

    The file is expected to contain a header line starting with
    ``hf (pretrained=`` followed by a pipe-delimited table. Rows whose first
    cell starts with ``" - "`` open a task; rows starting with ``"  - "``
    are prompts of the most recent task. ``acc_norm`` rows are skipped and
    prompt values are scaled by 100.

    Args:
        file_path: Path of the ``.out`` file to read.

    Returns:
        A dict with the keys ``'average_CPS'`` (mean CPS over the tasks that
        have one, 0 if none), ``'config'`` (``model_name``, ``num_fewshot``
        as text, ``batch_size`` as int or None) and ``'tasks'`` (task name ->
        dict with ``prompts``, ``average_accuracy``, ``best_prompt``,
        ``prompt_id`` and ``CPS``).

    Raises:
        ValueError: If the file has no ``hf (pretrained=...`` header line.
    """
    # The tables contain non-ASCII symbols, so do not rely on the platform
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    tasks_data = {}
    current_task = None
    pretrained_model = None
    num_fewshot = None
    batch_size = None
    for line in lines:
        line = line.strip()
        # Skips empty lines
        if not line:
            continue
        # Skips header lines
        if line.startswith("|         Tasks"):
            continue
        # Extracts model configuration details
        if line.startswith("hf (pretrained="):
            # The model id ends at the first comma, closing parenthesis or
            # whitespace, so both "pretrained=org/name,trust_remote_code=True)"
            # and "pretrained=org/name ), ..." yield "org/name".
            model_match = re.search(r"pretrained=([^,)\s]+)", line)
            pretrained_model = model_match.group(1) if model_match else None
            num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
            num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
            batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
            batch_size = int(batch_size_match.group(1)) if batch_size_match else None
            continue
        columns = line.split('|')
        # A table row has 9 cells, i.e. 11 pieces once the outer pipes are split
        if len(columns) != 11:
            continue
        task_name = columns[1]
        metric = columns[5].strip()
        value = safe_float(columns[7])
        stderr = safe_float(columns[9])
        # Skips normalized accuracy metrics
        if metric == "acc_norm":
            continue
        # Identifies task and prompt sections in the file
        if task_name.startswith(" - "):
            task_name = task_name[3:].strip()
            current_task = task_name
            tasks_data.setdefault(current_task,
                                  {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
                                   'CPS': None})
        elif task_name.startswith("  - ") and current_task:
            # A prompt row without a numeric value cannot be scored
            if value is None:
                continue
            prompt_name = task_name[4:].strip()
            prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100,
                           'stderr': stderr}
            tasks_data[current_task]['prompts'].append(prompt_data)
    if pretrained_model is None:
        raise ValueError(f"No 'hf (pretrained=...' header line found in {file_path}")
    # Special handling for evalita NER task to calculate weighted prompt averages
    if "evalita NER" in tasks_data:
        task_info = tasks_data["evalita NER"]
        weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
                      "WN prompt-1": 2088, "WN prompt-2": 2088}
        weighted_sums = {"prompt-1": 0, "prompt-2": 0}
        # Each variant is normalised by the weights of ITS OWN rows. Dividing
        # by the sum of all six weights would halve every weighted mean.
        weight_totals = {"prompt-1": 0, "prompt-2": 0}
        for prompt in task_info['prompts']:
            weight = weight_map.get(prompt['prompt'])
            if weight is None:
                continue
            variant = "prompt-1" if "prompt-1" in prompt['prompt'] else "prompt-2"
            weighted_sums[variant] += weight * prompt['value']
            weight_totals[variant] += weight
        # Variants with no matching rows are left out instead of scoring 0
        task_info['prompts'] = [
            {"prompt": variant, "metric": "acc", "value": weighted_sums[variant] / weight_totals[variant],
             'stderr': None}
            for variant in ("prompt-1", "prompt-2") if weight_totals[variant]]
    # Calculates task metrics for each task
    for task_info in tasks_data.values():
        calculate_task_metrics(task_info)
    # Calculates the average CPS across all tasks
    tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
    average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
    config = {
        "model_name": pretrained_model,
        "num_fewshot": num_fewshot,
        "batch_size": batch_size
    }
    return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
+"""
+MAIN PROCESSING PIPELINE
+This script executes the complete evaluation data processing workflow:
+1. Input Sources:
+   - Raw evaluation results (.out files) from: ../evalita_llm_models_output/
+   - Model metadata JSON files from: ../evalita_llm_requests/
+2. Processing Steps:
+   - Parses evaluation metrics from .out files
+   - Combines with model metadata
+   - Calculates aggregated performance statistics
+3. Output:
+   - Structured JSON results saved to: ../evalita_llm_results/
+   - Organized by model organization/name
+   - Contains complete evaluation results with metadata
+"""
# Raw lm-eval-harness style .out files to process
directory_in_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/task_result/'
# Per-model metadata JSON files (<org>/<model>.json), merged into the config when present.
# NOTE(review): the metadata extractor in this commit writes to ".../e3c_llm_requests/";
# confirm that this directory is the intended one, otherwise no metadata is ever merged.
directory_in_requests_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/evalita_llm_requests/'
# Destination of the combined result files (<org>/<model>_<num_fewshot>.json)
directory_out_results_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/evalita_llm_results/'
for filename in os.listdir(directory_in_path):
    if not filename.endswith('.out'):
        continue
    file_path = os.path.join(directory_in_path, filename)
    json_output = extract_data_from_file(file_path)
    # Split "org/name" once; a name without "/" is used for both parts
    full_model_name = json_output['config']['model_name']
    if '/' in full_model_name:
        model_org_name, model_name = full_model_name.split('/', 1)
    else:
        model_org_name, model_name = full_model_name, full_model_name
    config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
    # Metadata is optional: results are still written when it is missing
    if os.path.exists(config_file_path):
        with open(config_file_path, 'r', encoding='utf-8') as config_file:
            additional_config = json.load(config_file)
        json_output['config'].update(additional_config)
    org_folder_path = os.path.join(directory_out_results_path, model_org_name)
    os.makedirs(org_folder_path, exist_ok=True)
    # One output file per few-shot setting of a model
    file_suffix = f"{json_output['config']['num_fewshot']}"
    output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
    with open(output_file_path, 'w', encoding='utf-8', newline="\n") as outfile:
        json.dump(json_output, outfile, indent=4)
    print(f"File {filename} processed and saved to {output_file_path}")

Gen_llm_eval_output.py ADDED Viewed

	@@ -0,0 +1,117 @@

+#!/usr/bin/env python3
+#python Gen_llm_eval_output.py   --p1 csv_files/llm_scores_p1.xlsx   --p2 csv_files/llm_scores_p2.xlsx   --p3 csv_files/llm_scores_p3.xlsx   --output-dir csv_files/outputs
+import argparse
+import os
+import re
+import math
+import pandas as pd
+import numpy as np
# Columns every score sheet must provide (after header normalisation)
REQUIRED_COLS = ["model", "task", "language", "configuration", "prompts", "f1"]
def read_scores(path: str) -> pd.DataFrame:
    """Load one score workbook and return its required columns with numeric f1.

    Header names are stripped and lower-cased; a ``prompt`` column is accepted
    as an alias for ``prompts``. Rows whose ``f1`` is not a number are dropped.

    Args:
        path: Path of the Excel file to read.

    Returns:
        A DataFrame holding exactly ``REQUIRED_COLS``.

    Raises:
        ValueError: If any required column is missing.
    """
    df = pd.read_excel(path)
    # normalize columns; header cells are not guaranteed to be strings
    df.columns = [str(c).strip().lower() for c in df.columns]
    if "prompts" not in df.columns and "prompt" in df.columns:
        df["prompts"] = df["prompt"]
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")
    # keep only required, coerce f1 to numeric
    df = df[REQUIRED_COLS].copy()
    df["f1"] = pd.to_numeric(df["f1"], errors="coerce")
    df = df.dropna(subset=["f1"])
    return df
def sanitize_filename(s: str) -> str:
    """Return ``s`` as text that is safe to use inside a file name.

    The value is converted to ``str`` and stripped; every run of characters
    other than ASCII letters, digits, ``.``, ``_``, ``-`` and ``+`` is then
    replaced by a single underscore.
    """
    text = str(s).strip()
    return re.sub(r"[^0-9A-Za-z._\-+]+", "_", text)
def format_float(x):
    """Format a number with four decimals; ``None``, NaN and infinities give ``"nan"``.

    The finiteness check goes through ``math.isfinite`` so that it covers any
    numeric type convertible to float, not only ``float`` instances.
    """
    if x is None:
        return "nan"
    try:
        if not math.isfinite(x):
            return "nan"
    except TypeError:
        # Not a number at all: fall through and let the format spec raise
        pass
    return f"{x:.4f}"
def prompt_order_key(label: str):
    """Build a sort key that orders prompt labels by the first number they contain.

    Labels with a number give ``(0, number)``; labels without one give
    ``(1, label_text)`` and therefore sort after all numbered labels.
    """
    text = str(label)
    number = re.search(r"(\d+)", text)
    if number is None:
        return (1, text)
    return (0, int(number.group(1)))
def render_group_table(g: pd.DataFrame, model: str, language: str, configuration: str) -> str:
    """Render one (model, language, configuration) group as an lm-eval-harness style table.

    Args:
        g: Rows of the group; the columns ``task``, ``prompts`` and ``f1`` are read.
        model: Model identifier with ``"__"`` between organisation and name;
            the first ``"__"`` becomes ``"/"`` in the header. A name without
            ``"__"`` is used unchanged.
        language: Language of the group. Not used in the rendered text; kept
            so the signature matches the grouping key.
        configuration: Few-shot label. ``"<n>shot"`` (e.g. ``"0shot"``,
            ``"10shot"``) is written as ``"<n>"``; anything else is written as is.

    Returns:
        The header line, the table header, and for every task (in order of
        first appearance) one task row with the mean f1 followed by one row
        per prompt ordered by prompt number. The text ends with a newline.
    """
    # "0shot" -> "0", "10shot" -> "10", ...; other labels pass through untouched
    shot_match = re.fullmatch(r"(\d+)shot", str(configuration))
    if shot_match:
        configuration = shot_match.group(1)
    # "org__name" -> "org/name". Only the first separator is replaced so the
    # rest of the name is never dropped.
    model = str(model).replace("__", "/", 1)
    # The header text (including the space before ")") is kept exactly as it
    # was - presumably a downstream parser reads this line; verify before
    # changing it.
    header = f"hf (pretrained={model} ), num_fewshot: {configuration}, batch_size: 1"
    lines = [
        "|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|",
        "|-------|-------|------|------|------|----|------|---|------|",
    ]
    # For each task, add task row (mean over prompts) then prompt rows
    for task, df_task in g.groupby("task", sort=False):
        f1s = df_task["f1"].to_numpy(dtype=float)
        task_mean = float(np.mean(f1s)) if f1s.size else float("nan")
        lines.append(f"| - {task}        |       |      |      |f1    |   | {format_float(task_mean)} |   |0 |")
        # Prompt-level rows, sorted by prompt number if available
        df_task = df_task.copy()
        df_task["_order"] = df_task["prompts"].map(prompt_order_key)
        df_task = df_task.sort_values("_order")
        for _, r in df_task.iterrows():
            prompt_label = str(r["prompts"])
            lines.append(f"|   - {prompt_label}  |       |      |      |f1    |   | {format_float(r['f1'])} |   | 0 |")
    return header + "\n" + "\n".join(lines) + "\n"
def main():
    """Command-line entry point.

    Reads the three score workbooks given by ``--p1``, ``--p2`` and ``--p3``,
    concatenates them, and writes one text table per
    (model, language, configuration) group into ``--output-dir``. Files are
    named ``<model>__<language>__<configuration>.txt`` after sanitising each part.
    """
    parser = argparse.ArgumentParser(description="Build per-(model,language,configuration) summaries from three prompt Excel files.")
    for flag, workbook in (("--p1", "llm_scores_p1.xlsx"), ("--p2", "llm_scores_p2.xlsx"), ("--p3", "llm_scores_p3.xlsx")):
        parser.add_argument(flag, required=True, help=f"Path to {workbook}")
    parser.add_argument("--output-dir", required=True, help="Directory to write output files")
    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    # Load the sheets in p1, p2, p3 order and stack them into a single frame
    frames = [read_scores(sheet_path) for sheet_path in (args.p1, args.p2, args.p3)]
    df = pd.concat(frames, ignore_index=True)
    # One file per (model, language, configuration)
    grouped = df.groupby(["model", "language", "configuration"], sort=False)
    for (model, language, config), group in grouped:
        content = render_group_table(group, model, language, config)
        name_parts = [sanitize_filename(part) for part in (model, language, config)]
        fname = "__".join(name_parts) + ".txt"
        out_path = os.path.join(args.output_dir, fname)
        with open(out_path, "w", encoding="utf-8") as handle:
            handle.write(content)
if __name__ == "__main__":
    main()

csv_files/llm_scores_p1.xlsx ADDED Viewed

Binary file (28.9 kB). View file

csv_files/llm_scores_p2.xlsx ADDED Viewed

Binary file (26.3 kB). View file

csv_files/llm_scores_p3.xlsx ADDED Viewed

Binary file (23.1 kB). View file

csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0918 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0629 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1041 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1083 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.2604 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1287 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3394 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3131 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.2142 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2189 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2243 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1994 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.1429 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1189 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1668 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0611 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0620 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0592 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0620 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0863 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1017 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0506 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1065 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1474 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1667 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1089 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1667 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0937 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0821 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1053 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0416 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0435 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0429 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0384 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.1413 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0672 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2266 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1300 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.3753 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3299 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4023 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3938 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.1102 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0977 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1226 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0379 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0379 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0378 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0379 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0891 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0602 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1293 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0778 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.3966 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3992 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3916 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3992 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.1026 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0998 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1055 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0385 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0387 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0380 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0387 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0174 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0121 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0280 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0121 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.3507 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3444 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3632 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3444 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0889 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0734 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1045 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0438 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0429 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0456 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0429 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.1278 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0967 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1900 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0967 |   | 0 |

csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.3720 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3558 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4045 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3558 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0784 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0787 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0781 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0578 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0940 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0331 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0464 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1317 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1215 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1415 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1322 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0022 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0028 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0016 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0769 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0859 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0591 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0859 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1448 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1455 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1434 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1455 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0015 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0024 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0007 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0812 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0770 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0920 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0747 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1694 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1616 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1774 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1690 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0050 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0035 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0064 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0308 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0244 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0436 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0244 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__pl__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1516 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1500 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1548 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1500 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0031 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0040 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0023 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__sk__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0712 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0880 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0375 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0880 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__sk__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1444 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1485 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1360 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1485 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0031 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0038 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0024 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__sl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0711 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0777 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0579 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0777 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0000 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0000 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0000 |   | 0 |

csv_files/outputs/HiTZ__Medical-mT5-large__sl__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1422 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1470 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1325 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1470 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.0073 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0073 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0074 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.2500 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3425 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1181 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2893 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4075 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4135 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3917 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4172 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.5993 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6091 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5646 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6243 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.6179 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6332 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6025 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.1290 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1339 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1191 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1339 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.3957 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3796 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4266 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3810 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.6028 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6119 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5847 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6119 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.5993 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5962 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6024 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.2137 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2467 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1709 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2234 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4016 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4173 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3770 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4106 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.6569 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6719 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6327 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6661 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.5882 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5767 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5998 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0586 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.0697 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0364 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.0697 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4022 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3803 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4464 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3800 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.6092 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6226 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5824 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6226 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.5729 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5991 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5466 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.0955 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.1220 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.0426 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.1220 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4116 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4027 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4294 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4027 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.6419 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6386 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6486 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6386 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.5869 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5894 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5845 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.3398 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3910 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2375 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3910 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.3777 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3775 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3783 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3775 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.6371 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6467 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6178 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6467 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.5865 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5949 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5782 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.3279 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3804 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.3068 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2964 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4658 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4734 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4649 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4591 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.5895 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5970 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5602 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6113 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.6475 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6482 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6469 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.4506 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5976 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1568 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.5976 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4104 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4393 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4083 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3834 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.6175 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6196 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.6131 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6196 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.5905 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5913 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5896 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.2734 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3758 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.1647 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2796 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4370 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.4505 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4159 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.4447 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.7005 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.6934 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.7152 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.6930 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.5698 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.5801 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.5595 |   | 0 |

csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1
+|Tasks  |Version|Filter|n-shot|Metric|    |Value |   |Stderr|
+|-------|-------|------|------|------|----|------|---|------|
+| - ner        |       |      |      |f1    |   | 0.2428 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.2486 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.2311 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.2486 |   | 0 |
+| - re        |       |      |      |f1    |   | 0.4074 |   |0 |
+|   - p1  |       |      |      |f1    |   | 0.3865 |   | 0 |
+|   - p2  |       |      |      |f1    |   | 0.4569 |   | 0 |
+|   - p3  |       |      |      |f1    |   | 0.3788 |   | 0 |