"""Build input/output Q&A pairs from the OpenAssistant oasst1 dataset.

Pairs each assistant reply with its parent user prompt, writes the raw
pairs to a JSONL file, then hands them to the project-local filter step.
"""
from datasets import load_dataset
import json
import re
from tqdm import tqdm
from filter import filterdata  # Custom filtering logic


def _build_pairs(rows):
    """Return a list of {"input": user_text, "output": assistant_text} dicts.

    `rows` is an iterable of oasst1 message dicts (must be re-iterable,
    e.g. a HF Dataset). A one-time message_id index replaces the original
    per-message linear scan, turning an O(n^2) pass into O(n).
    """
    # Index every message once so each parent lookup is O(1).
    # NOTE: message_ids are unique in oasst1, so dict lookup matches the
    # original first-match semantics.
    by_id = {row["message_id"]: row for row in rows}

    pairs = []
    for entry in tqdm(rows, unit='samples'):
        # Keep only assistant replies that have text and a known parent.
        if entry.get("role") == "assistant" and entry.get("text") and entry.get("parent_id"):
            parent = by_id.get(entry["parent_id"])
            if parent and parent.get("role") == "user":
                pairs.append({
                    "input": parent["text"],
                    "output": entry["text"]
                })
    return pairs


def main():
    """Download oasst1, extract Q&A pairs, save them unfiltered, run filtering."""
    print("📦 Loading dataset (110k samples)...")
    ds = load_dataset("OpenAssistant/oasst1", split="train")

    print("⚙️ Processing dataset into Q&A pairs...")
    convo = _build_pairs(ds)
    print(f"✅ Got {len(convo)} usable Q&A pairs.")

    # Save unfiltered data (one JSON object per line).
    unfiltered_path = "./data/unfiltered_data.jsonl"
    with open(unfiltered_path, "w", encoding="utf-8") as f:
        for line in convo:
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
    print(f"📝 Saved unfiltered data to {unfiltered_path}")

    # Run filtering
    print("🚿 Starting filtering...")
    filterdata(convo)


if __name__ == "__main__":
    main()