import os
import argparse
from typing import List, Dict, Any

import pandas as pd

from evaluate import QwenEvaluator


def run_benchmark(model_id: str, dataset_path: str, num_samples: int = 10) -> None:
    """Benchmark a model on a local JSONL dataset and judge the responses.

    Loads up to ``num_samples`` rows from ``dataset_path`` (JSON Lines with
    ``instruction``/``output`` fields), generates a response per row (or a
    canned simulation when CUDA is unavailable), writes the raw results to
    ``benchmark_report_<model_id>.jsonl``, then judges them with the
    evaluator and overwrites the report with the judged results.

    Args:
        model_id: Hugging Face model identifier passed to ``QwenEvaluator``.
        dataset_path: Path to the JSONL dataset file.
        num_samples: Maximum number of rows to evaluate (default 10).

    All failures are caught and reported to stdout; nothing is raised.
    """
    # Imported lazily so this module stays importable in environments
    # without torch installed (only calling run_benchmark requires it).
    import torch

    print(f"Benchmarking model: {model_id} on {dataset_path}")
    # We can't actually run 7B here without GPU, but we provide the logic
    try:
        evaluator = QwenEvaluator(model_id=model_id)
        evaluator.setup_model()

        # Load local dataset (JSON Lines); cap at num_samples rows.
        df = pd.read_json(dataset_path, orient="records", lines=True).head(num_samples)
        # The file may contain fewer rows than requested — report the real total.
        total = len(df)

        results = []
        for i, row in df.iterrows():
            print(f"Evaluating sample {i + 1}/{total}")
            instruction = row.get("instruction", "")

            # Simple simulation for local runs without GPU
            if not torch.cuda.is_available():
                print("CUDA not available. Simulating response...")
                response_clean = "\nSimulation of complex reasoning process...\n\n\nSimulation answer.\n"
            else:
                inputs = evaluator.tokenizer(
                    [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                    return_tensors="pt",
                ).to("cuda")
                outputs = evaluator.model.generate(**inputs, max_new_tokens=1024, use_cache=True)
                response = evaluator.tokenizer.batch_decode(outputs)[0]
                # Keep only the assistant turn and strip the end-of-turn marker.
                response_clean = (
                    response.split("<|im_start|>assistant\n")[-1]
                    .replace("<|im_end|>", "")
                    .strip()
                )

            results.append({
                "instruction": instruction,
                "ground_truth": row.get("output", ""),
                "model_response": response_clean,
            })

        results_df = pd.DataFrame(results)

        # Save raw results first so they survive a judging failure below.
        report_path = f"benchmark_report_{model_id.replace('/', '_')}.jsonl"
        results_df.to_json(report_path, orient="records", lines=True)
        print(f"Raw benchmark results saved to {report_path}")

        try:
            # Judge the results; on success the judged rows replace the raw report.
            judged_df = evaluator.judge_responses(
                results_df, "Complex reasoning and multi-step math/logic"
            )
            judged_df.to_json(report_path, orient="records", lines=True)
            print(f"Judged benchmark report saved to {report_path}")

            avg_score = (
                judged_df["judge_score"].mean()
                if "judge_score" in judged_df.columns
                else 0
            )
            print(f"Average Judge Score: {avg_score:.2f}")
        except Exception as judge_e:
            # Best-effort: judging is optional, the raw report is already on disk.
            print(f"Judging failed: {judge_e}")
            print("Proceeding with raw results.")
    except Exception as e:
        # Top-level boundary: report and return rather than crash the CLI.
        print(f"Benchmark failed: {e}")
        print("Note: 7B models require significant GPU memory. Ensure you are running this on a T4 x2 or A100 instance.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark a Qwen model on Reasoning Assistant")
    parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-7B", help="Model ID")
    parser.add_argument("--dataset", type=str, default="reasoning_assistant_v2_10.jsonl", help="Dataset path")
    parser.add_argument("--num", type=int, default=10, help="Number of samples")
    args = parser.parse_args()

    run_benchmark(args.model, args.dataset, args.num)