import json
import re

from tqdm import tqdm

from dataset import SimpleTokenizr

tokenizer = SimpleTokenizr()


def filterdata(data):
    filtered = []
    unused = []
    low_quality = []        # placeholder for a future low-quality filter
    too_short = []
    filtered_lines = 0
    unused_lines = 0
    low_quality_lines = 0   # placeholder counter, currently unused
    too_short_lines = 0

    for line in tqdm(data, unit="line"):
        text = line.get("text", "")
        encoded = tokenizer.tokenize(text)

        # Drop any record whose text contains a digit.
        if re.search(r"\d", text):
            unused_lines += 1
            unused.append(line)
        # Keep records with at least 64 tokens; anything shorter is discarded.
        elif len(encoded) >= 64:
            filtered_lines += 1
            filtered.append(line)
        else:
            too_short_lines += 1
            too_short.append(text)

    print(f"Filtered {filtered_lines} successfully!")
    print(f"Removed {unused_lines} from data.")
    print(f"Removed {too_short_lines} from data (too short).")
    # print(f"Removed {low_quality_lines} from data (low quality).")

    # Write the kept records back out as newline-delimited JSON.
    with open("./data/filtered_data.jsonl", "w", encoding="utf-8") as f:
        for record in filtered:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
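

# Example entry point (a minimal sketch): load newline-delimited JSON records
# as dicts and run them through filterdata. The input path
# "./data/raw_data.jsonl" is an assumed placeholder, not a path defined
# anywhere else in this script.
if __name__ == "__main__":
    with open("./data/raw_data.jsonl", "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]
    filterdata(records)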