# datasetgen.py
"""Generate a synthetic QA dataset (JSONL) for MiniGPT-style training.

Each line of the output file is a JSON object of the form
{"text": "^User: <question>\nMiniGPT: <fake answer> "} where the answer is
Faker lorem text standing in for real completions.
"""
import json
import os
import random

from faker import Faker
from tqdm import tqdm

fake = Faker()
OUTPUT_PATH = "data/filtered_data.jsonl"


def generate_example():
    """Generate a single synthetic User/MiniGPT QA pair.

    Returns:
        dict: ``{"text": ...}`` — the formatted prompt/answer string, ready
        to be serialized as one JSONL record.
    """
    q_templates = [
        "What is {}?",
        "How do you {}?",
        "Why is {} important?",
        "Give me an example of {}.",
        "Explain {} in simple terms.",
        "Compare {} and {}.",
        "What happens if {}?",
        "Can you summarize {}?",
    ]
    concepts = [
        "machine learning", "quantum physics", "natural selection",
        "photosynthesis", "neural networks", "global warming",
        "black holes", "economic inflation", "probability", "blockchain",
    ]
    actions = [
        "train a neural network", "reduce carbon emissions", "make bread",
        "calculate probability", "grow tomatoes", "optimize code",
        "write a resume", "design a logo", "encrypt data", "learn Python",
    ]

    concept = random.choice(concepts)
    action = random.choice(actions)
    template = random.choice(q_templates)

    if template.count('{}') == 1:
        # Single-slot template: a concept or an action fits equally well.
        question = template.format(random.choice([concept, action]))
    else:
        # Two-slot template ("Compare {} and {}."): the second pick must
        # differ from the first, otherwise we'd emit "Compare X and X."
        # (the original code allowed that degenerate case).
        other = random.choice([c for c in concepts if c != concept])
        question = template.format(concept, other)

    # Simulated answer — in real GPT training you'd use real completions.
    answer = fake.paragraph(nb_sentences=4)

    return {
        "text": "^User: " + question + "\nMiniGPT: " + answer + " ",
    }


def generate_dataset(n=5000):
    """Write ``n`` generated examples to OUTPUT_PATH, one JSON object per line.

    Args:
        n (int): number of examples to generate (default 5000).
    """
    # BUG FIX: the output directory was never created — the original had a
    # commented-out makedirs that also pointed at the wrong directory
    # ("datasets" vs "data"), so open() raised FileNotFoundError on a
    # fresh checkout. Derive the directory from OUTPUT_PATH so the two
    # can never drift apart again.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        for _ in tqdm(range(n), desc="Generating Examples"):
            example = generate_example()
            f.write(json.dumps(example, ensure_ascii=False) + "\n")

    print(f"\n✅ Dataset saved to: {OUTPUT_PATH}")


if __name__ == "__main__":
    generate_dataset(5000)