mfirat007 committed on
Commit
0bd97eb
·
verified ·
1 Parent(s): 8c9c02c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -18
app.py CHANGED
@@ -11,6 +11,25 @@ from sentence_transformers import SentenceTransformer
11
  MODEL_BASE = "Qwen/Qwen2.5-1.5B-Instruct"
12
  DOCS_DIR = "docs"
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  print("Loading tokenizer…")
15
  tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)
16
 
@@ -34,24 +53,11 @@ def load_docs():
34
  pass
35
  return texts
36
 
37
- def split_text(text: str, chunk_size: int = 800, overlap: int = 100):
38
- chunks = []
39
- start = 0
40
- n = len(text)
41
- while start < n:
42
- end = start + chunk_size
43
- chunk = text[start:end]
44
- chunks.append(chunk.strip())
45
- start = end - overlap
46
- return chunks
47
-
48
  print("Loading RAG docs…")
49
- raw_docs = load_docs()
50
-
51
- # chunk'lanmış dokümanlar
52
- docs: list[str] = []
53
  for d in raw_docs:
54
- docs.extend(split_text(d))
55
 
56
  if docs:
57
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
@@ -65,9 +71,10 @@ else:
65
  print("RAG: no docs found, context will be empty.")
66
 
67
  def retrieve_context(query: str, k: int = 3) -> str:
68
- if index is None or embed_model is None:
69
  return ""
70
  q_emb = embed_model.encode([query], convert_to_numpy=True)
 
71
  D, I = index.search(q_emb, k)
72
  parts = []
73
  for i in I[0]:
@@ -122,7 +129,7 @@ chatbot = gr.ChatInterface(
122
  respond,
123
  type="messages",
124
  additional_inputs=[
125
- gr.Textbox("Merhaba", label="System"),
126
  gr.Slider(1, 4096, 1024, step=1, label="Max tokens"),
127
  gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature"),
128
  gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p"),
 
11
  MODEL_BASE = "Qwen/Qwen2.5-1.5B-Instruct"
12
  DOCS_DIR = "docs"
13
 
14
+ # ---------- Metni parçalara bölme (RAG chunking) ----------
15
def split_text(text: str, chunk_size: int = 800, overlap: int = 100):
    """Split *text* into overlapping character chunks.

    Each chunk is at most ``chunk_size`` characters long and begins
    ``overlap`` characters before the end of the previous chunk. Chunks are
    whitespace-stripped, and chunks that are empty after stripping are
    dropped.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks.
            Values outside ``[0, chunk_size - 1]`` are clamped into that
            range so that every iteration makes forward progress.

    Returns:
        A list of non-empty, stripped chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    # An overlap >= chunk_size would move the window backwards (or not at
    # all) and loop forever; a negative overlap would silently skip text.
    overlap = max(0, min(overlap, chunk_size - 1))

    chunks = []
    length = len(text)
    start = 0

    while start < length:
        end = min(start + chunk_size, length)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Once a chunk reaches the end of the text we are done. Without this
        # check, `start = end - overlap` stays below `length` forever and the
        # same tail chunk is appended endlessly.
        if end == length:
            break
        start = end - overlap

    return chunks
30
+ # ---------------------------------------------------------
31
+
32
+
33
  print("Loading tokenizer…")
34
  tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)
35
 
 
53
  pass
54
  return texts
55
 
 
 
 
 
 
 
 
 
 
 
 
56
  print("Loading RAG docs…")
57
+ raw_docs = load_docs() # tam dokümanlar
58
+ docs = [] # chunk'lar buraya
 
 
59
  for d in raw_docs:
60
+ docs.extend(split_text(d)) # her dokümanı parçalara böl
61
 
62
  if docs:
63
  embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
71
  print("RAG: no docs found, context will be empty.")
72
 
73
  def retrieve_context(query: str, k: int = 3) -> str:
74
+ if index is None or embed_model is None or not docs:
75
  return ""
76
  q_emb = embed_model.encode([query], convert_to_numpy=True)
77
+ k = min(k, len(docs))
78
  D, I = index.search(q_emb, k)
79
  parts = []
80
  for i in I[0]:
 
129
  respond,
130
  type="messages",
131
  additional_inputs=[
132
+ gr.Textbox("You are a scientific assistant.", label="System"),
133
  gr.Slider(1, 4096, 1024, step=1, label="Max tokens"),
134
  gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature"),
135
  gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p"),