Add evaluation results: HLE, GPQA Diamond, SWE-bench Verified, Terminal-Bench 2.0

Files changed (4)

.eval_results/gpqa.yaml ADDED

+- dataset:
+    id: Idavidrein/gpqa
+    task_id: diamond
+  value: 87.2
+  date: '2026-04-14'
+  source:
+    url: https://huggingface.co/tencent/Hy3-preview
+    name: Model Card

.eval_results/hle.yaml ADDED

+- dataset:
+    id: cais/hle
+    task_id: hle
+  value: 30.0
+  date: '2026-04-14'
+  source:
+    url: https://huggingface.co/tencent/Hy3-preview
+    name: Model Card
+  notes: "Text-only"

.eval_results/swe_bench_verified.yaml ADDED

+- dataset:
+    id: SWE-bench/SWE-bench_Verified
+    task_id: swe_bench_%_resolved
+  value: 74.4
+  date: '2026-04-14'
+  source:
+    url: https://huggingface.co/tencent/Hy3-preview
+    name: Model Card

.eval_results/terminal_bench.yaml ADDED

+- dataset:
+    id: harborframework/terminal-bench-2.0
+    task_id: terminalbench_2
+  value: 54.4
+  date: '2026-04-14'
+  source:
+    url: https://huggingface.co/tencent/Hy3-preview
+    name: Model Card