{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Welcome to the Second Lab - Week 1, Day 3\n", "\n", "Changes I've made with this lab.\n", "1) Modified the original question to instead generate a range of questions, 12 of them. These questions will be used to evaluate each LLM's reasoning, knowledge, creativity, and ability to handle nuanced scenarios.\n", "2) I've changed this lab to run the queries in parallel. Thanks GPT for helping with the code to do that. :)\n", "3) Instead of having one LLM rate all the responses, I have all of the LLM's rate each others work and then use a Borda Count to asign points to determine the winner." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Start with imports - ask ChatGPT to explain any package that you don't know\n", "\n", "import os\n", "import json\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "from anthropic import Anthropic\n", "from IPython.display import Markdown, display" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Always remember to do this!\n", "load_dotenv(override=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Print the key prefixes to help with any debugging\n", "\n", "openai_api_key = os.getenv('OPENAI_API_KEY')\n", "anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')\n", "gemini_api_key = os.getenv('GEMINI_API_KEY')\n", "deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')\n", "groq_api_key = os.getenv('GROQ_API_KEY')\n", "\n", "if openai_api_key:\n", " print(f\"OpenAI API Key exists and begins {openai_api_key[:8]}\")\n", "else:\n", " print(\"OpenAI API Key not set\")\n", " \n", "if anthropic_api_key:\n", " print(f\"Anthropic API Key exists and begins {anthropic_api_key[:7]}\")\n", "else:\n", " print(\"Anthropic API Key not set (and this is optional)\")\n", "\n", "if gemini_api_key:\n", " print(f\"Gemini API Key exists and begins {gemini_api_key[:2]}\")\n", "else:\n", " print(\"Gemini API Key not set (and this is optional)\")\n", "\n", "if deepseek_api_key:\n", " print(f\"DeepSeek API Key exists and begins {deepseek_api_key[:3]}\")\n", "else:\n", " print(\"DeepSeek API Key not set (and this is optional)\")\n", "\n", "if groq_api_key:\n", " print(f\"Groq API Key exists and begins {groq_api_key[:4]}\")\n", "else:\n", " print(\"Groq API Key not set (and this is optional)\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "request = \"\"\"You are being evaluated for your reasoning, knowledge, creativity, and ability to handle nuanced scenarios. \n", "Generate 12 questions that cover the following categories:\n", "- Logical reasoning and problem solving\n", "- Creative writing and storytelling\n", "- Factual accuracy and knowledge recall\n", "- Following instructions with strict constraints\n", "- Multi-step planning and organization\n", "- Ethical dilemmas and debatable issues\n", "- Philosophical or abstract reasoning\n", "- Summarization and explanation at different levels\n", "- Translation and multilingual ability\n", "- Roleplay or adaptive communication style\n", "\n", "Number each question from 1 to 12. \n", "The result should be a balanced benchmark question set that fully tests an LLM’s capabilities.\n", "\n", "Important: Output only clean plain text. \n", "Do not use any markup, formatting symbols, quotation marks, brackets, lists, or special characters \n", "that could cause misinterpretation. Only provide plain text questions, one per line, numbered 1 to 20.\n", "\"\"\"\n", "request += \"Answer only with the question, no explanation.\"\n", "messages = [{\"role\": \"user\", \"content\": request}]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Generate the questions.\n", "openai = OpenAI()\n", "response = openai.chat.completions.create(\n", " model=\"gpt-4o-mini\",\n", " messages=messages,\n", ")\n", "question = response.choices[0].message.content\n", "\n", "display(Markdown(question))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "competitors = []\n", "answers = []\n", "messages = [{\"role\": \"user\", \"content\": question}]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Ask the LLM's in Parallel\n", "\n", "import asyncio\n", "\n", "clients = {\n", " \"openai\": OpenAI(),\n", " \"claude\": Anthropic(),\n", " \"gemini\": OpenAI(api_key=gemini_api_key, base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\"),\n", " \"deepseek\": OpenAI(api_key=deepseek_api_key, base_url=\"https://api.deepseek.com/v1\"),\n", " \"groq\": OpenAI(api_key=groq_api_key, base_url=\"https://api.groq.com/openai/v1\"),\n", "}\n", "\n", "# Get the answers from the LLM\n", "async def call_llm(model_name, messages):\n", " try:\n", " if \"claude\" in model_name:\n", " response = await asyncio.to_thread(\n", " clients[\"claude\"].messages.create,\n", " model=model_name,\n", " messages=messages,\n", " max_tokens=3000,\n", " )\n", " answer = \"\".join([c.text for c in response.content if c.type == \"text\"])\n", " \n", " elif \"gpt-4o-mini\" in model_name:\n", " response = await asyncio.to_thread(\n", " clients[\"openai\"].chat.completions.create,\n", " model=model_name,\n", " messages=messages,\n", " )\n", " answer = response.choices[0].message.content\n", "\n", " elif \"gemini\" in model_name:\n", " response = await asyncio.to_thread(\n", " clients[\"gemini\"].chat.completions.create,\n", " model=model_name,\n", " messages=messages,\n", " )\n", " answer = response.choices[0].message.content\n", "\n", " elif \"deepseek\" in model_name:\n", " response = await asyncio.to_thread(\n", " clients[\"deepseek\"].chat.completions.create,\n", " model=model_name,\n", " messages=messages,\n", " )\n", " answer = response.choices[0].message.content\n", "\n", " elif \"llama\" in model_name:\n", " response = await asyncio.to_thread(\n", " clients[\"groq\"].chat.completions.create,\n", " model=model_name,\n", " messages=messages,\n", " )\n", " answer = response.choices[0].message.content\n", "\n", " return model_name, answer \n", "\n", " except Exception as e:\n", " print (f\"❌ Error: {str(e)}\")\n", " return model_name, \"I was not able to generate answers for any of the questions.\"\n", "\n", "\n", "# send out the calls to the LLM to ask teh questions.\n", "async def ask_questions_in_parallel(messages):\n", " competitor_models = [\n", " \"gpt-4o-mini\",\n", " \"claude-3-7-sonnet-latest\",\n", " \"gemini-2.0-flash\",\n", " \"deepseek-chat\",\n", " \"llama-3.3-70b-versatile\"\n", " ]\n", "\n", " # create tasks to call the LLM's in parallel\n", " tasks = [call_llm(model, messages) for model in competitor_models]\n", "\n", " answers = []\n", " competitors = []\n", "\n", " # When we have an answer, we can process it. No waiting.\n", " for task in asyncio.as_completed(tasks):\n", " model_name, answer = await task\n", " competitors.append(model_name)\n", " answers.append(answer)\n", " print(f\"\\n✅ Got response from {model_name}\")\n", "\n", " return competitors, answers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Fire off the ask to all the LLM's at once. Parallelization...\n", "competitors, answers = await ask_questions_in_parallel(messages)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Look at the results\n", "print (len(answers))\n", "print (len(competitors))\n", "print (competitors)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Let's bring this together - note the use of \"enumerate\"\n", "\n", "together = \"\"\n", "for index, answer in enumerate(answers):\n", " together += f\"# Response from competitor {index+1}\\n\\n\"\n", " together += answer + \"\\n\\n\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(together)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "judge = f\"\"\"You are judging a competition between {len(competitors)} competitors.\n", "Each model has been given the folowing questions:\n", "\n", "{question}\n", "\n", "Your task is to evaluate the overall strength of the arguments presented by each competitor. \n", "Consider the following factors:\n", "- Clarity: how clearly the ideas are communicated\n", "- Relevance: how directly the response addresses the question\n", "- Depth: the level of reasoning, insight, or supporting evidence provided\n", "- Persuasiveness: how compelling or convincing the response is overall\n", "Respond with JSON, and only JSON.\n", "The output must be a single JSON array of competitor names, ordered from best to worst.\n", "Do not include any keys, labels, or extra text.\n", "\n", "Example format:\n", "[\"1\", \"3\", \"5\", \"2\", \"4\"]\n", "\n", "Here are the responses from each competitor:\n", "\n", "{together}\n", "\n", "Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks.\n", "Do not deviate from the json format as described above. Do not include the term ranking in the final json\"\"\"\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(judge)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "judge_messages = [{\"role\": \"user\", \"content\": judge}]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Have each LLM rate all of the results.\n", "results = dict()\n", "LLM_result = ''\n", "\n", "competitors, answers = await ask_questions_in_parallel(judge_messages)\n", "\n", "results = dict()\n", "for index, each_competitor in enumerate(competitors):\n", " results[each_competitor] = answers[index].strip()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# See the results\n", "print (len(answers))\n", "results = dict()\n", "for index, each_competitor in enumerate(competitors):\n", " results[each_competitor] = answers[index]\n", "\n", "print (results)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Lets convert these rankings into scores. Borda Count - (1st gets 4, 2nd gets 3, etc.).\n", "number_of_competitors = len(competitors)\n", "scores = {}\n", "\n", "for rankings in results.values():\n", " print(rankings)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# # Borda count points (1st gets n-1, 2nd gets n-2, etc.)\n", "num_competitors = len(competitors)\n", "\n", "competitor_dict = dict()\n", "for index, each_competitor in enumerate(competitors):\n", " competitor_dict[each_competitor] = index + 1\n", "\n", "borda_scores_dict = dict()\n", "for each_competitor in competitors:\n", " if each_competitor not in borda_scores_dict:\n", " borda_scores_dict[each_competitor] = 0\n", "\n", "for voter_llm, ranking_str in results.items():\n", " ranking_indices = json.loads(ranking_str)\n", " ranking_indices = [int(x) for x in ranking_indices]\n", "\n", " # For each position in the ranking, award points\n", " for position, competitor_index in enumerate(ranking_indices):\n", " competitor_name = competitors[competitor_index - 1]\n", "\n", " # Borda count points (1st gets n-1, 2nd gets n-2, etc.)\n", " points = num_competitors - 1 - position \n", " borda_scores_dict[competitor_name] += points\n", " \n", "sorted_results = sorted(borda_scores_dict.items(), key=lambda x: x[1], reverse=True)\n", "\n", "print(f\"{'Rank':<4} {'LLM':<30} {'Points':<3}\")\n", "print(\"-\" * 50)\n", "\n", "for rank, (llm, points) in enumerate(sorted_results, 1):\n", " print(f\"{rank:<4} {llm:<30} {points:<8}\")\n", "\n", "print(\"\\nQuestions asked:\")\n", "print(question)" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }