Upload folder using huggingface_hub
data/trained-tiny-gpt/checkpoint-tiny-gpt.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7dc6d3e7756554064ba1cb7785ae75395d3fb6b74362e212b0029da91c79c2f2
size 66253943
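The checkpoint itself is stored through Git LFS, so only the pointer (oid and size, roughly 66 MB) appears above. A minimal sketch of fetching and opening the real file, where repo_id is a placeholder and the checkpoint layout is only assumed to be a torch-serialized dict like the one minigpt.py unpacks:

import torch
from huggingface_hub import hf_hub_download

# repo_id is a placeholder for this repository's id on the Hub.
ckpt_path = hf_hub_download(
    repo_id="<user>/<repo>",
    filename="data/trained-tiny-gpt/checkpoint-tiny-gpt.pth",
)
checkpoint = torch.load(ckpt_path, map_location="cpu")
print(type(checkpoint), list(checkpoint) if isinstance(checkpoint, dict) else "")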
data/trained-tiny-gpt/tokenizer.json
ADDED
@@ -0,0 +1 @@
{"stoi": {"'": 0, "+": 1, ",": 2, "-": 3, ".": 4, "/": 5, "1": 6, "10": 7, "100": 8, "101": 9, "102": 10, "103": 11, "104": 12, "105": 13, "106": 14, "107": 15, "108": 16, "109": 17, "11": 18, "110": 19, "111": 20, "112": 21, "113": 22, "114": 23, "115": 24, "116": 25, "117": 26, "118": 27, "119": 28, "12": 29, "120": 30, "121": 31, "122": 32, "123": 33, "124": 34, "125": 35, "126": 36, "127": 37, "128": 38, "129": 39, "13": 40, "130": 41, "131": 42, "132": 43, "133": 44, "134": 45, "135": 46, "136": 47, "137": 48, "138": 49, "139": 50, "14": 51, "140": 52, "141": 53, "142": 54, "143": 55, "144": 56, "145": 57, "146": 58, "147": 59, "148": 60, "149": 61, "15": 62, "150": 63, "151": 64, "152": 65, "153": 66, "154": 67, "155": 68, "156": 69, "157": 70, "158": 71, "159": 72, "16": 73, "160": 74, "161": 75, "162": 76, "163": 77, "165": 78, "166": 79, "167": 80, "168": 81, "169": 82, "17": 83, "170": 84, "171": 85, "172": 86, "173": 87, "177": 88, "178": 89, "179": 90, "18": 91, "180": 92, "181": 93, "182": 94, "183": 95, "184": 96, "185": 97, "186": 98, "187": 99, "188": 100, "19": 101, "190": 102, "193": 103, "194": 104, "2": 105, "20": 106, "21": 107, "22": 108, "23": 109, "24": 110, "25": 111, "26": 112, "27": 113, "28": 114, "29": 115, "3": 116, "30": 117, "31": 118, "32": 119, "33": 120, "34": 121, "35": 122, "36": 123, "37": 124, "38": 125, "39": 126, "4": 127, "40": 128, "41": 129, "42": 130, "43": 131, "44": 132, "45": 133, "46": 134, "47": 135, "48": 136, "49": 137, "5": 138, "50": 139, "51": 140, "52": 141, "53": 142, "54": 143, "55": 144, "56": 145, "57": 146, "58": 147, "59": 148, "6": 149, "60": 150, "61": 151, "62": 152, "63": 153, "64": 154, "65": 155, "66": 156, "67": 157, "68": 158, "69": 159, "7": 160, "70": 161, "71": 162, "72": 163, "73": 164, "74": 165, "75": 166, "76": 167, "77": 168, "78": 169, "79": 170, "8": 171, "80": 172, "81": 173, "82": 174, "83": 175, "84": 176, "85": 177, "86": 178, "87": 179, "88": 180, "89": 181, "9": 182, "90": 183, "91": 184, "92": 185, "93": 186, "94": 187, "95": 188, "96": 189, "97": 190, "98": 191, "99": 192, ":": 193, "<": 194, "<END>": 195, "<PAD>": 196, "<UNK>": 197, ">": 198, "?": 199, "^": 200, "^user:": 201, "a": 202, "about": 203, "actions": 204, "add": 205, "ai": 206, "algorithm": 207, "allows": 208, "an": 209, "and": 210, "are": 211, "array": 212, "artificial": 213, "as": 214, "based": 215, "block": 216, "blueprint": 217, "book": 218, "boolean": 219, "bras": 220, "brazil": 221, "by": 222, "calculate": 223, "can": 224, "canada": 225, "capital": 226, "change": 227, "city": 228, "class": 229, "code": 230, "conclusions": 231, "conditions": 232, "convert": 233, "correct": 234, "creating": 235, "decision": 236, "deduction": 237, "define": 238, "delhi": 239, "democracy": 240, "derive": 241, "different": 242, "do": 243, "does": 244, "doesn": 245, "don": 246, "during": 247, "each": 248, "else": 249, "end": 250, "energy": 251, "error": 252, "execution": 253, "explain": 254, "false": 255, "fix": 256, "for": 257, "force": 258, "france": 259, "from": 260, "function": 261, "general": 262, "give": 263, "go": 264, "government": 265, "gravity": 266, "handles": 267, "has": 268, "have": 269, "he": 270, "help": 271, "how": 272, "human": 273, "i": 274, "if": 275, "in": 276, "india": 277, "instructions": 278, "intelligence": 279, "into": 280, "is": 281, "it": 282, "japan": 283, "know": 284, "late": 285, "lia": 286, "like": 287, "logic": 288, "loop": 289, "machines": 290, "making": 291, "me": 292, "mean": 293, "meaning": 294, "minigpt": 295, "minigpt:": 
296, "multiple": 297, "new": 298, "objects": 299, "of": 300, "on": 301, "one": 302, "organized": 303, "other": 304, "ottawa": 305, "paris": 306, "perform": 307, "photosynthesis": 308, "plants": 309, "playing": 310, "please": 311, "plus": 312, "population": 313, "problem": 314, "programming": 315, "pulls": 316, "purpose": 317, "python": 318, "repeating": 319, "reusable": 320, "s": 321, "school": 322, "sentence": 323, "serves": 324, "set": 325, "she": 326, "should": 327, "simulation": 328, "solve": 329, "specific": 330, "statements": 331, "stores": 332, "sum": 333, "sunlight": 334, "system": 335, "t": 336, "tell": 337, "term": 338, "that": 339, "the": 340, "there": 341, "they": 342, "this": 343, "to": 344, "tokyo": 345, "toward": 346, "true": 347, "use": 348, "used": 349, "useful": 350, "user": 351, "value": 352, "values": 353, "variable": 354, "version": 355, "want": 356, "was": 357, "we": 358, "went": 359, "were": 360, "what": 361, "when": 362, "which": 363, "whole": 364, "why": 365, "yesterday": 366, "you": 367, "\u2014": 368}, "itos": {"0": "'", "1": "+", "2": ",", "3": "-", "4": ".", "5": "/", "6": "1", "7": "10", "8": "100", "9": "101", "10": "102", "11": "103", "12": "104", "13": "105", "14": "106", "15": "107", "16": "108", "17": "109", "18": "11", "19": "110", "20": "111", "21": "112", "22": "113", "23": "114", "24": "115", "25": "116", "26": "117", "27": "118", "28": "119", "29": "12", "30": "120", "31": "121", "32": "122", "33": "123", "34": "124", "35": "125", "36": "126", "37": "127", "38": "128", "39": "129", "40": "13", "41": "130", "42": "131", "43": "132", "44": "133", "45": "134", "46": "135", "47": "136", "48": "137", "49": "138", "50": "139", "51": "14", "52": "140", "53": "141", "54": "142", "55": "143", "56": "144", "57": "145", "58": "146", "59": "147", "60": "148", "61": "149", "62": "15", "63": "150", "64": "151", "65": "152", "66": "153", "67": "154", "68": "155", "69": "156", "70": "157", "71": "158", "72": "159", "73": "16", "74": "160", "75": "161", "76": "162", "77": "163", "78": "165", "79": "166", "80": "167", "81": "168", "82": "169", "83": "17", "84": "170", "85": "171", "86": "172", "87": "173", "88": "177", "89": "178", "90": "179", "91": "18", "92": "180", "93": "181", "94": "182", "95": "183", "96": "184", "97": "185", "98": "186", "99": "187", "100": "188", "101": "19", "102": "190", "103": "193", "104": "194", "105": "2", "106": "20", "107": "21", "108": "22", "109": "23", "110": "24", "111": "25", "112": "26", "113": "27", "114": "28", "115": "29", "116": "3", "117": "30", "118": "31", "119": "32", "120": "33", "121": "34", "122": "35", "123": "36", "124": "37", "125": "38", "126": "39", "127": "4", "128": "40", "129": "41", "130": "42", "131": "43", "132": "44", "133": "45", "134": "46", "135": "47", "136": "48", "137": "49", "138": "5", "139": "50", "140": "51", "141": "52", "142": "53", "143": "54", "144": "55", "145": "56", "146": "57", "147": "58", "148": "59", "149": "6", "150": "60", "151": "61", "152": "62", "153": "63", "154": "64", "155": "65", "156": "66", "157": "67", "158": "68", "159": "69", "160": "7", "161": "70", "162": "71", "163": "72", "164": "73", "165": "74", "166": "75", "167": "76", "168": "77", "169": "78", "170": "79", "171": "8", "172": "80", "173": "81", "174": "82", "175": "83", "176": "84", "177": "85", "178": "86", "179": "87", "180": "88", "181": "89", "182": "9", "183": "90", "184": "91", "185": "92", "186": "93", "187": "94", "188": "95", "189": "96", "190": "97", "191": "98", "192": "99", "193": ":", "194": "<", 
"195": "<END>", "196": "<PAD>", "197": "<UNK>", "198": ">", "199": "?", "200": "^", "201": "^user:", "202": "a", "203": "about", "204": "actions", "205": "add", "206": "ai", "207": "algorithm", "208": "allows", "209": "an", "210": "and", "211": "are", "212": "array", "213": "artificial", "214": "as", "215": "based", "216": "block", "217": "blueprint", "218": "book", "219": "boolean", "220": "bras", "221": "brazil", "222": "by", "223": "calculate", "224": "can", "225": "canada", "226": "capital", "227": "change", "228": "city", "229": "class", "230": "code", "231": "conclusions", "232": "conditions", "233": "convert", "234": "correct", "235": "creating", "236": "decision", "237": "deduction", "238": "define", "239": "delhi", "240": "democracy", "241": "derive", "242": "different", "243": "do", "244": "does", "245": "doesn", "246": "don", "247": "during", "248": "each", "249": "else", "250": "end", "251": "energy", "252": "error", "253": "execution", "254": "explain", "255": "false", "256": "fix", "257": "for", "258": "force", "259": "france", "260": "from", "261": "function", "262": "general", "263": "give", "264": "go", "265": "government", "266": "gravity", "267": "handles", "268": "has", "269": "have", "270": "he", "271": "help", "272": "how", "273": "human", "274": "i", "275": "if", "276": "in", "277": "india", "278": "instructions", "279": "intelligence", "280": "into", "281": "is", "282": "it", "283": "japan", "284": "know", "285": "late", "286": "lia", "287": "like", "288": "logic", "289": "loop", "290": "machines", "291": "making", "292": "me", "293": "mean", "294": "meaning", "295": "minigpt", "296": "minigpt:", "297": "multiple", "298": "new", "299": "objects", "300": "of", "301": "on", "302": "one", "303": "organized", "304": "other", "305": "ottawa", "306": "paris", "307": "perform", "308": "photosynthesis", "309": "plants", "310": "playing", "311": "please", "312": "plus", "313": "population", "314": "problem", "315": "programming", "316": "pulls", "317": "purpose", "318": "python", "319": "repeating", "320": "reusable", "321": "s", "322": "school", "323": "sentence", "324": "serves", "325": "set", "326": "she", "327": "should", "328": "simulation", "329": "solve", "330": "specific", "331": "statements", "332": "stores", "333": "sum", "334": "sunlight", "335": "system", "336": "t", "337": "tell", "338": "term", "339": "that", "340": "the", "341": "there", "342": "they", "343": "this", "344": "to", "345": "tokyo", "346": "toward", "347": "true", "348": "use", "349": "used", "350": "useful", "351": "user", "352": "value", "353": "values", "354": "variable", "355": "version", "356": "want", "357": "was", "358": "we", "359": "went", "360": "were", "361": "what", "362": "when", "363": "which", "364": "whole", "365": "why", "366": "yesterday", "367": "you", "368": "\u2014"}}
localscripts/mergelines.py
ADDED
@@ -0,0 +1,20 @@
import json


def merge_short_lines(file_path, min_length=32):
    """Merge consecutive JSONL "text" entries until each merged entry is at
    least min_length characters long."""
    merged = []
    buffer = ""

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines so json.loads does not fail
            data = json.loads(line)
            text = data["text"]
            buffer += " " + text.strip()
            if len(buffer) >= min_length:
                merged.append({"text": buffer.strip()})
                buffer = ""

    # Flush whatever is left, even if it is shorter than min_length.
    if buffer.strip():
        merged.append({"text": buffer.strip()})

    print(f"Merged {len(merged)} lines")
    return merged
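For context, a small round-trip example of what merge_short_lines does, using made-up lines in the {"text": ...} JSONL format it expects (the real data.jsonl is not part of this commit):

import json
from mergelines import merge_short_lines

# Two short entries; with the default min_length=32 they end up merged into one.
rows = [{"text": "what is gravity?"},
        {"text": "it is the force that pulls objects."}]
with open("sample.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

print(merge_short_lines("sample.jsonl"))
# [{'text': 'what is gravity? it is the force that pulls objects.'}]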
localscripts/mergelines2.py
ADDED
@@ -0,0 +1,12 @@
import json
from mergelines import merge_short_lines

merged_data = merge_short_lines("./customchatbot-v1/data/data.jsonl")
with open("./customchatbot-v1/data/merged_data.jsonl", "w", encoding="utf-8") as out:
    for item in merged_data:
        out.write(json.dumps(item) + "\n")

# with open("./customchatbot-v1/data/data.jsonl", "r", encoding="utf-8") as out:
#     for item in out:
#         with open("./customchatbot-v1/data/backup_data.jsonl", "w", encoding="utf-8") as out2:
#             out2.write(json.dumps(item) + "\n")
localscripts/train_custommade.py
ADDED
@@ -0,0 +1,35 @@
import json

import torch
import torch.nn as nn
from tqdm import tqdm

from model import MiniGPT
from dataset import DataLoader, ChatDataset, SimpleTokenizr

# Build the training corpus from the merged JSONL produced by mergelines2.py.
with open("./customchatbot-v1/data/merged_data.jsonl", "r", encoding="utf-8") as f:
    texts = [json.loads(line)["text"] for line in f if line.strip()]

tokenizer = SimpleTokenizr()
tokenizer.train(texts)

# Size the vocabulary from the trained tokenizer (as minigpt.py does when loading).
model = MiniGPT(vocab_size=len(tokenizer))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

dataset = ChatDataset("./customchatbot-v1/data/merged_data.jsonl", tokenizer)
dataloader = DataLoader(dataset, batch_size=100, shuffle=True)


def Train(epochs):
    for epoch in range(epochs):
        model.train()
        loop = tqdm(dataloader, total=len(dataloader), desc="Training")
        tloss = 0
        # Assumes each ChatDataset batch is an (input_ids, target_ids) pair.
        for inputs, targets in loop:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # CrossEntropyLoss expects (N, vocab) logits against (N,) class ids.
            loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()
            tloss += loss.item()
            loop.set_postfix(loss=loss.item())
        print(f"Epoch {epoch + 1}: mean loss {tloss / len(dataloader):.4f}")


Train(epochs=1)
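As committed, the training script never persists anything, while minigpt.py expects either a plain state dict at trained-mini-gpt/mini-gpt.pth or a checkpoint dict carrying a "model_state_dict" key. A minimal sketch of a save step appended to the script above that would match that loader; the tokenizer.save() call is an assumption, mirroring the tokenizer.load() used in minigpt.py:

import os
import torch

out_dir = "./customchatbot-v1/trained-mini-gpt"
os.makedirs(out_dir, exist_ok=True)

# Layout matches what minigpt.py unpacks via ["model_state_dict"].
torch.save(
    {"model_state_dict": model.state_dict(),
     "optimizer_state_dict": optimizer.state_dict()},
    os.path.join(out_dir, "checkpoint-mini-gpt.pth"),
)

# Assumed counterpart of SimpleTokenizr.load(); adjust if the class names it differently.
tokenizer.save(os.path.join(out_dir, "tokenizer.json"))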
localscripts/trainer_data_maker.py
ADDED
File without changes
minigpt.py
ADDED
@@ -0,0 +1,66 @@
import torch
import torch.nn.functional as F
from model import MiniGPT
from dataset import MiniBPETokenizr, SimpleTokenizr
import json
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = SimpleTokenizr()
tokenizer.load("./customchatbot-v1/trained-mini-gpt/tokenizer.json")

# Load model: prefer the plain state dict, otherwise fall back to the training checkpoint.
model = MiniGPT(vocab_size=len(tokenizer))
weights_path = "./customchatbot-v1/trained-mini-gpt/mini-gpt.pth"
checkpoint_path = "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth"
if os.path.exists(weights_path):
    state_dict = torch.load(weights_path, map_location=device)
else:
    state_dict = torch.load(checkpoint_path, map_location=device)["model_state_dict"]
model.load_state_dict(state_dict)
model.eval().to(device)
totalparams = sum(p.numel() for p in model.parameters())
print(f"Model total params: {totalparams:,}")


def sample_token(logits, temperature=1.0):
    logits = logits / temperature
    logits = torch.nan_to_num(logits, nan=-1e9)
    probs = F.softmax(logits, dim=-1)

    if torch.any(torch.isnan(probs)) or torch.any(probs < 0):
        print("⚠️ Invalid probs detected. Using uniform fallback.")
        probs = torch.ones_like(probs) / probs.size(-1)

    return torch.multinomial(probs, num_samples=1).item()


def generate_reply(prompt, max_tokens=100):
    tokens = tokenizer.encode(prompt)
    if not tokens:
        print("⚠️ Empty prompt after encoding.")
        return
    input_ids = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
    generated = []

    with torch.no_grad():
        for _ in range(max_tokens):
            logits = model(input_ids)
            logits = logits[:, -1, :]  # keep only the next-token logits
            next_token = sample_token(logits)
            generated.append(next_token)

            next_str = tokenizer.itos.get(next_token, "")
            encoded_text = tokenizer.encode(next_str)
            decoded_text = tokenizer.decode(encoded_text)
            print(decoded_text, end=" ", flush=True)

            if next_str == "<END>":
                break

            input_ids = torch.cat([input_ids, torch.tensor([[next_token]]).to(device)], dim=1)
    print()


# Chat loop
print("🧠 MiniGPT Chat (type 'exit' to quit)")
while True:
    user_input = input("User: ")
    if user_input.lower() == "exit":
        break
    prompt = f"^User: {user_input}\nMiniGPT:"
    print("MiniGPT: ", end="", flush=True)
    generate_reply(prompt)
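As written, the chat loop starts as soon as the module is imported. If sample_token and generate_reply are ever reused from another script, the tail of the file could be wrapped in a main guard; a small structural sketch, not something present in the commit:

def chat():
    print("🧠 MiniGPT Chat (type 'exit' to quit)")
    while True:
        user_input = input("User: ")
        if user_input.lower() == "exit":
            break
        print("MiniGPT: ", end="", flush=True)
        generate_reply(f"^User: {user_input}\nMiniGPT:")


if __name__ == "__main__":
    chat()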