import os
import glob
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import faiss
from sentence_transformers import SentenceTransformer

MODEL_BASE = "Qwen/Qwen2.5-1.5B-Instruct"
DOCS_DIR = "docs"


# ---------- Split text into chunks (RAG chunking) ----------
def split_text(text: str, chunk_size: int = 800, overlap: int = 100):
    chunks = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + chunk_size, length)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= length:
            # Last window reached the end of the text; stop here, otherwise
            # start would keep resetting to the same position and loop forever.
            break
        start = end - overlap
        if start < 0:
            start = 0
    return chunks
# ------------------------------------------------------------

print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)

print("Loading model…")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_BASE,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.eval()


# --------- RAG: load documents and build the FAISS index ---------
def load_docs():
    texts = []
    paths = sorted(glob.glob(os.path.join(DOCS_DIR, "*.txt")))
    for p in paths:
        try:
            with open(p, "r", encoding="utf-8") as f:
                texts.append(f.read())
        except Exception:
            pass  # skip unreadable files
    return texts


print("Loading RAG docs…")
raw_docs = load_docs()  # full documents
docs = []               # chunks collected here
for d in raw_docs:
    docs.extend(split_text(d))  # split each document into chunks

if docs:
    embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    doc_embeddings = embed_model.encode(docs, convert_to_numpy=True)
    index = faiss.IndexFlatL2(doc_embeddings.shape[1])
    index.add(doc_embeddings)
    print(f"RAG: {len(docs)} chunks indexed.")
else:
    embed_model = None
    index = None
    print("RAG: no docs found, context will be empty.")


def retrieve_context(query: str, k: int = 3) -> str:
    """Return the k chunks closest to the query, joined by '---' separators."""
    if index is None or embed_model is None or not docs:
        return ""
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    k = min(k, len(docs))
    D, I = index.search(q_emb, k)
    parts = []
    for i in I[0]:
        if 0 <= i < len(docs):
            parts.append(docs[i])
    return "\n---\n".join(parts)
# ------------------------------------------------------------


@spaces.GPU(duration=120)
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # First turn: return a fixed greeting without calling the model
    if len(history) == 0:
        return "Merhaba, 2025 Doçentlik Koşulları için bana herhangi bir şey sor?"

    # RAG context: prepend the retrieved chunks (capped at 2000 chars) to the system prompt
    context = retrieve_context(message, k=3)
    if context:
        model_system = system_message + "\n\n[RETRIEVED CONTEXT]\n" + context[:2000]
    else:
        model_system = system_message

    messages_for_model = [{"role": "system", "content": model_system}]
    messages_for_model.extend(history)
    messages_for_model.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(
        messages_for_model,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )

    # Keep only the newly generated tokens, dropping the echoed prompt
    input_len = inputs["input_ids"].shape[1]
    gen_tokens = outputs[0, input_len:]
    final = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()
    return final


chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox("You are a scientific assistant.", label="System"),
        gr.Slider(1, 4096, 1024, step=1, label="Max tokens"),
        gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p"),
    ],
)

with gr.Blocks() as demo:
    chatbot.render()

if __name__ == "__main__":
    demo.launch()