## Welcome to the Second Lab - Week 1, Day 3

Changes I've made with this lab.
1) Modified the original question to instead generate a range of questions, 12 of them. These questions will be used to evaluate each LLM's reasoning, knowledge, creativity, and ability to handle nuanced scenarios.
2) I've changed this lab to run the queries in parallel. Thanks GPT for helping with the code to do that. :)
3) Instead of having one LLM rate all the responses, I have all of the LLMs rate each other's work and then use a Borda count to assign points to determine the winner.

In [None]:
# Start with imports - ask ChatGPT to explain any package that you don't know

import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display

In [None]:
# Always remember to do this!
# Load the variables from the .env file into the environment before any key is
# read below; override=True lets the .env values replace variables that are
# already set (per the python-dotenv documentation).
load_dotenv(override=True)

In [None]:
# Read every provider key from the environment, then print a short prefix of
# each one so a missing or wrong key is easy to spot without exposing it.

openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
gemini_api_key = os.getenv('GEMINI_API_KEY')
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')


def _report_key(provider, key, shown_chars, missing_note=""):
    """Print whether *key* is set, showing only its first *shown_chars* characters."""
    if not key:
        print(f"{provider} API Key not set{missing_note}")
        return
    print(f"{provider} API Key exists and begins {key[:shown_chars]}")


# Only the OpenAI key is reported without the "optional" note.
_OPTIONAL_NOTE = " (and this is optional)"

_report_key("OpenAI", openai_api_key, 8)
_report_key("Anthropic", anthropic_api_key, 7, _OPTIONAL_NOTE)
_report_key("Gemini", gemini_api_key, 2, _OPTIONAL_NOTE)
_report_key("DeepSeek", deepseek_api_key, 3, _OPTIONAL_NOTE)
_report_key("Groq", groq_api_key, 4, _OPTIONAL_NOTE)

In [None]:
# Prompt that asks one model to write the benchmark: 12 numbered questions
# spread across the listed categories, returned as plain text only.
# The closing instruction now says "numbered 1 to 12" so it agrees with the
# "Generate 12 questions" / "Number each question from 1 to 12" lines above it
# (it previously said "1 to 20", giving the model contradictory counts).
request = """You are being evaluated for your reasoning, knowledge, creativity, and ability to handle nuanced scenarios. 
Generate 12 questions that cover the following categories:
- Logical reasoning and problem solving
- Creative writing and storytelling
- Factual accuracy and knowledge recall
- Following instructions with strict constraints
- Multi-step planning and organization
- Ethical dilemmas and debatable issues
- Philosophical or abstract reasoning
- Summarization and explanation at different levels
- Translation and multilingual ability
- Roleplay or adaptive communication style

Number each question from 1 to 12. 
The result should be a balanced benchmark question set that fully tests an LLM’s capabilities.

Important: Output only clean plain text. 
Do not use any markup, formatting symbols, quotation marks, brackets, lists, or special characters 
that could cause misinterpretation. Only provide plain text questions, one per line, numbered 1 to 12.
"""
request += "Answer only with the question, no explanation."

# Single user turn carrying the whole prompt.
messages = [{"role": "user", "content": request}]

In [None]:
# Generate the questions.
# Send the question-writing prompt in `messages` to gpt-4o-mini and keep the
# text of the first choice as `question`.
openai = OpenAI()
response = openai.chat.completions.create(
 model="gpt-4o-mini",
 messages=messages,
)
question = response.choices[0].message.content

# Render the generated question set in the cell output.
display(Markdown(question))

In [None]:
# Start with empty result lists and wrap the generated question set as the
# single user message that will be sent to the models.
competitors, answers = [], []
messages = [{"role": "user", "content": question}]

In [None]:
# Ask the LLM's in Parallel

import asyncio

# One SDK client per provider, keyed by the short provider name.
# OpenAI and Anthropic use their own client classes with default arguments;
# Gemini, DeepSeek and Groq are reached through the OpenAI client class pointed
# at each provider's base URL with that provider's API key.
clients = {"openai": OpenAI(), "claude": Anthropic()}
for _provider, _api_key, _base_url in (
    ("gemini", gemini_api_key, "https://generativelanguage.googleapis.com/v1beta/openai/"),
    ("deepseek", deepseek_api_key, "https://api.deepseek.com/v1"),
    ("groq", groq_api_key, "https://api.groq.com/openai/v1"),
):
    clients[_provider] = OpenAI(api_key=_api_key, base_url=_base_url)

# Get the answers from the LLM
async def call_llm(model_name, messages):
    """Send *messages* to the provider that serves *model_name* and return its reply.

    The provider is picked by substring match on the model name. The blocking
    SDK call is run through ``asyncio.to_thread`` so that several models can be
    queried concurrently from one event loop.

    Args:
        model_name: Model identifier, e.g. "gpt-4o-mini" or "deepseek-chat".
        messages: Chat messages as a list of {"role": ..., "content": ...} dicts.

    Returns:
        A ``(model_name, answer)`` tuple. Ordinary failures are not raised: if
        the model name matches no provider, the call fails, or the reply has
        no text, the error is printed (with the model name) and ``answer`` is
        a fixed fallback sentence.
    """
    # (substring looked for in the model name, key into `clients`), in the
    # order they are checked.
    routes = (
        ("claude", "claude"),
        ("gpt-4o-mini", "openai"),
        ("gemini", "gemini"),
        ("deepseek", "deepseek"),
        ("llama", "groq"),
    )

    try:
        client_key = next((key for marker, key in routes if marker in model_name), None)
        if client_key is None:
            # Fail explicitly instead of relying on an unbound `answer` below.
            raise ValueError(f"no client is configured for model {model_name!r}")

        if client_key == "claude":
            # Anthropic's messages API requires max_tokens and returns a list
            # of content blocks; only the text blocks are kept.
            response = await asyncio.to_thread(
                clients["claude"].messages.create,
                model=model_name,
                messages=messages,
                max_tokens=3000,
            )
            answer = "".join(part.text for part in response.content if part.type == "text")
        else:
            # Every other provider is called through the chat-completions interface.
            response = await asyncio.to_thread(
                clients[client_key].chat.completions.create,
                model=model_name,
                messages=messages,
            )
            answer = response.choices[0].message.content

        if answer is None:
            # Callers concatenate the answer into a prompt, so None must not leak out.
            raise ValueError("the model returned no text content")

        return model_name, answer

    except Exception as e:
        # Deliberate best-effort: one failing provider must not abort the
        # other parallel calls, so report it and return the fallback text.
        print(f"❌ Error from {model_name}: {e}")
        return model_name, "I was not able to generate answers for any of the questions."


# Send out the calls to the LLMs to ask the questions.
async def ask_questions_in_parallel(messages):
    """Send *messages* to every competitor model concurrently.

    Results are returned in the fixed order of ``competitor_models``, not in
    the order the replies happen to arrive. A stable order matters because the
    judging prompt refers to competitors by their position in the returned
    list; if two rounds returned different orders, a judge's "competitor 2"
    could be scored against a different model.

    Args:
        messages: Chat messages as a list of {"role": ..., "content": ...} dicts.

    Returns:
        A ``(competitors, answers)`` tuple of two lists where ``answers[i]``
        is the reply produced by ``competitors[i]``.
    """
    competitor_models = [
        "gpt-4o-mini",
        "claude-3-7-sonnet-latest",
        "gemini-2.0-flash",
        "deepseek-chat",
        "llama-3.3-70b-versatile",
    ]

    async def _ask(model):
        # Report progress as soon as this particular model has replied.
        model_name, answer = await call_llm(model, messages)
        print(f"\n✅ Got response from {model_name}")
        return model_name, answer

    # gather() runs the calls concurrently and returns their results in the
    # order the awaitables were passed in, whatever order they finish in.
    outcomes = await asyncio.gather(*(_ask(model) for model in competitor_models))

    competitors = [model_name for model_name, _ in outcomes]
    answers = [answer for _, answer in outcomes]
    return competitors, answers

In [None]:
# Fire off the ask to all the LLMs at once. Parallelization...
# The two lists are paired by position: answers[i] is the reply from competitors[i].
# NOTE: this is a top-level `await`, so it relies on being run in a notebook/IPython cell.
competitors, answers = await ask_questions_in_parallel(messages)

In [None]:
# Look at the results: number of answers, number of competitors, and the
# competitor names, one item per line.
print(len(answers), len(competitors), competitors, sep="\n")

In [None]:
# Let's bring this together: every answer goes under a heading that names the
# competitor only by its 1-based position, never by model name.

together = "".join(
    f"# Response from competitor {number}\n\n" + answer + "\n\n"
    for number, answer in enumerate(answers, start=1)
)

In [None]:
# Show the combined responses; competitors appear only as numbered headings.
print(together)

In [None]:

# Judging prompt: shows the questions and every numbered response, and asks
# for a JSON array ranking the competitors from best to worst.
# The instructions ask for competitor *numbers* (not names) because the
# responses are labelled only by number and the scoring code converts every
# array entry with int(); the previous wording ("competitor names")
# contradicted the example format. Also fixes the "folowing" typo.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given the following questions:

{question}

Your task is to evaluate the overall strength of the arguments presented by each competitor. 
Consider the following factors:
- Clarity: how clearly the ideas are communicated
- Relevance: how directly the response addresses the question
- Depth: the level of reasoning, insight, or supporting evidence provided
- Persuasiveness: how compelling or convincing the response is overall
Respond with JSON, and only JSON.
The output must be a single JSON array of competitor numbers, ordered from best to worst.
Each competitor number must appear exactly once.
Do not include any keys, labels, or extra text.

Example format:
["1", "3", "5", "2", "4"]

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks.
Do not deviate from the json format as described above. Do not include the term ranking in the final json"""


In [None]:
# Inspect the complete judging prompt before it is sent.
print(judge)

In [None]:
# Wrap the judging prompt as a single user message.
judge_messages = [{"role": "user", "content": judge}]

In [None]:
# Have each LLM rate all of the results.
results = dict()
LLM_result = ''

competitors, answers = await ask_questions_in_parallel(judge_messages)

results = dict()
for index, each_competitor in enumerate(competitors):
 results[each_competitor] = answers[index].strip()

In [None]:
# See the results: how many rankings came back, then each model's raw ranking
# text keyed by model name.
print(len(answers))
results = {name: answers[position] for position, name in enumerate(competitors)}

print(results)

In [None]:
# Let's convert these rankings into scores with a Borda count
# (with five competitors: 1st gets 4 points, 2nd gets 3, etc.).
# This cell only prints each judge's raw ranking text; `number_of_competitors`
# and `scores` are not read again in the code visible here.
number_of_competitors = len(competitors)
scores = {}

for rankings in results.values():
 print(rankings)

In [None]:
# Borda count: with n competitors, a judge's 1st choice earns n-1 points, the
# 2nd earns n-2, and so on; points from all judges are summed per competitor.
num_competitors = len(competitors)


def _parse_ballot(raw_ballot, competitor_count):
    """Turn one judge's raw reply into a list of 1-based competitor numbers.

    Returns None when the reply is not text, contains no JSON array, has an
    entry that is not a number, repeats a competitor, or names a number
    outside 1..competitor_count.
    """
    if not isinstance(raw_ballot, str):
        return None

    # Parse only the outermost [...] span so that extra text around the array
    # (for example a Markdown code fence) does not break parsing.
    start, end = raw_ballot.find("["), raw_ballot.rfind("]")
    if start == -1 or end < start:
        return None

    try:
        ballot = [int(entry) for entry in json.loads(raw_ballot[start:end + 1])]
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; int() raises either type.
        return None

    # Reject 0 and negatives explicitly: as list indices they would silently
    # wrap around and credit a different competitor.
    in_range = all(1 <= number <= competitor_count for number in ballot)
    if not in_range or len(set(ballot)) != len(ballot):
        return None
    return ballot


# Every competitor starts at zero, so one that is never ranked still appears.
borda_scores_dict = dict.fromkeys(competitors, 0)

# NOTE(review): ballot numbers are positions in the list the judges were shown,
# so `competitors` must be in that same order here — confirm against the cell
# that runs the judging round.
for voter_llm, ranking_str in results.items():
    ranking_indices = _parse_ballot(ranking_str, num_competitors)
    if ranking_indices is None:
        # One bad ballot (e.g. a judge that failed to answer) must not abort the tally.
        print(f"⚠️ Skipping unusable ranking from {voter_llm}: {ranking_str!r}")
        continue

    # For each position in the ranking, award points
    for position, competitor_index in enumerate(ranking_indices):
        competitor_name = competitors[competitor_index - 1]
        borda_scores_dict[competitor_name] += num_competitors - 1 - position

# Highest total first.
sorted_results = sorted(borda_scores_dict.items(), key=lambda x: x[1], reverse=True)

print(f"{'Rank':<4} {'LLM':<30} {'Points':<3}")
print("-" * 50)

for rank, (llm, points) in enumerate(sorted_results, 1):
    print(f"{rank:<4} {llm:<30} {points:<8}")

print("\nQuestions asked:")
print(question)