tomas.helmfridsson committed on
Commit
1e5b35e
·
1 Parent(s): f9a8906

update 43 42 worked but short answer
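The "short answer" in the commit title points at the generation cap rather than at retrieval: the model stops after MAX_NEW_TOKENS tokens no matter how much context was packed in. A minimal budget sketch, assuming the 1,024-token model window mentioned in the old code's comments; the 256-token variant at the end is a hypothetical illustration, not part of this commit:

# Prompt tokens + generated tokens must fit in falcon-rw-1b's 1,024-token window.
CTX_TOK_MAX = 750       # context budget chosen in this commit
MAX_NEW_TOKENS = 128    # answer-length cap chosen in this commit

overhead = 1024 - CTX_TOK_MAX - MAX_NEW_TOKENS
print(overhead)         # 146 tokens left for the template text and the question

# Hypothetical: doubling the answer budget means shrinking the context budget.
MAX_NEW_TOKENS = 256
CTX_TOK_MAX = 1024 - MAX_NEW_TOKENS - overhead   # -> 622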

Files changed (1):
app.py: +82 -98
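The substantive change in app.py: context trimming moves from a flat characters-per-token heuristic (the old truncate_tokens, 1 token ≈ 4 characters) to exact counts from the model's own tokenizer (the new build_prompt). A small sketch of why that matters; the sample string is hypothetical, everything else mirrors the diff:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")
text = "Det hÀr Àr ett utdrag ur ett PDF-dokument."  # hypothetical chunk

approx = len(text) // 4        # old heuristic from truncate_tokens
exact = len(tok.encode(text))  # new approach in build_prompt
print(approx, exact)

For Swedish text run through an English-trained BPE tokenizer, the real token count is typically well above len(text) // 4, which is how the old version could still overflow the 1,024-token window.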
app.py CHANGED
@@ -1,148 +1,132 @@
-# ── app.py ─────────────────────────────────────────────────────────
-import os, logging, math, textwrap
 import gradio as gr
-from transformers import pipeline
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

-# ── CONFIGURATION ─────────────────────────────────────────
 DOCS_DIR = "document"
 INDEX_DIR = "faiss_index"
 EMB_MODEL = "KBLab/sentence-bert-swedish-cased"
 LLM_MODEL = "tiiuae/falcon-rw-1b"

-CHUNK_SIZE = 500
-CHUNK_OVERLAP = 50
-MAX_NEW_TOKENS = 128  # answer length
-CTX_TOKEN_MAX = 900   # to stay under the model limit of 1,024 tokens
-K = 10                # how many chunks we retrieve
 DEFAULT_TEMP = 0.3

-# ── LOGGING ──────────────────────────────────────────────
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
-logger = logging.getLogger(__name__)

-# ── 1) Index (build or load) ─────────────────────────
 emb = HuggingFaceEmbeddings(model_name=EMB_MODEL)
-
 if os.path.isdir(INDEX_DIR):
-    logger.info(f"πŸ”„ Loading FAISS index from `{INDEX_DIR}`")
     vs = FAISS.load_local(INDEX_DIR, emb)
 else:
-    logger.info("βš™οΈ Building FAISS index from PDF files …")
-    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
-                                              chunk_overlap=CHUNK_OVERLAP)
-    docs, files = [], []
     for fn in os.listdir(DOCS_DIR):
         if fn.lower().endswith(".pdf"):
-            pages = PyPDFLoader(os.path.join(DOCS_DIR, fn)).load()
-            chunks = splitter.split_documents(pages)
             for c in chunks:
                 c.metadata["source"] = fn
-            docs.extend(chunks)
-            files.append(fn)
-    vs = FAISS.from_documents(docs, emb)
-    vs.save_local(INDEX_DIR)
-    logger.info(f"βœ… Saved index ({len(files)} PDFs, {len(docs)} chunks)")

 retriever = vs.as_retriever(search_kwargs={"k": K})

-# ── 2) LLM pipeline ─────────────────────────────────────
-logger.info("πŸš€ Initializing text-generation pipeline …")
-gen_pipe = pipeline("text-generation",
-                    model=LLM_MODEL,
-                    device=-1,
-                    max_new_tokens=MAX_NEW_TOKENS)
-
-logger.info("βœ… LLM ready")
-
-# ── 3) Helper functions ──────────────────────────────────
-def truncate_tokens(text: str, max_tokens: int = CTX_TOKEN_MAX) -> str:
-    """Very rough token approximation (1 token ≈ 4 characters)."""
-    approx_tokens = len(text) // 4
-    if approx_tokens <= max_tokens:
-        return text
-    slice_len = max_tokens * 4
-    return text[:slice_len]
-
-def test_retrieval(query: str) -> str:
-    docs = retriever.get_relevant_documents(query)
-    if not docs:
-        return "🚫 Inga trÀffar"
-    out = []
-    for i, d in enumerate(docs, 1):
-        src = d.metadata.get("source", "okänd")
-        snippet = d.page_content.replace("\n", " ")[:160]
-        out.append(f"{i}. ({src}) …{snippet}…")
-    return "\n\n".join(out)
-
-# ── 4) Chat function (exactly 3 params, 2 return values) ─────────
-def chat_fn(query: str, temperature: float, history: list[dict]):
-    history = history or []
-    history.append({"role": "user", "content": query})
-
-    # fetch K chunks
-    docs = retriever.get_relevant_documents(query)
-    if not docs:
-        ans = "🚫 Hittade inget relevant innehåll i dokumenten."
-        history.append({"role": "assistant", "content": ans})
-        return history, history
-
-    # build the context and trim it
-    context = "\n\n---\n\n".join(d.page_content for d in docs)
-    context = truncate_tokens(context, CTX_TOKEN_MAX)
-
-    prompt = textwrap.dedent(f"""
         Du är en hjälpsam assistent som svarar på svenska.
-        Kontext (ur PDF-dokument):

         {context}

         Fråga: {query}
         Svar (svenska):""").strip()

-    logger.info(f"πŸ“ Prompt length ≈ {len(prompt)} chars, temp={temperature}")

     try:
-        resp = gen_pipe(prompt,
-                        temperature=float(temperature),
-                        max_new_tokens=MAX_NEW_TOKENS,
-                        pad_token_id=2,
-                        eos_token_id=2,
-                        do_sample=True,
-                        return_full_text=False)[0]["generated_text"]
     except Exception as e:
-        logger.exception("Error during generation")
-        resp = f"❌ Fel: {e}"

-    src_hint = docs[0].metadata.get("source", "okänd")
-    history.append({"role": "assistant",
-                    "content": f"**(Källa: {src_hint})**\n\n{resp}"})
     return history, history

-# ── 5) Build the Gradio UI ──────────────────────────────────
 with gr.Blocks() as demo:
-    gr.Markdown("# πŸ“š svensk RAG-chat\nStäll frågor till dina PDF-filer")
-    gr.Markdown(f"**PDF-filer i index:** {', '.join(os.listdir(DOCS_DIR)) or 'inga'}")

     with gr.Row():
-        test_in = gr.Textbox(label="Snabb-retrieval (ingen AI)", lines=1)
-        test_btn = gr.Button("πŸ”Ž Testa")
-        test_out = gr.Textbox(label="Chunkar")

     with gr.Row():
-        q_in = gr.Textbox(placeholder="Ex: Vad står det om krav?", label="Fråga")
         temp = gr.Slider(0, 1, value=DEFAULT_TEMP, step=0.05, label="Temperatur")
-        send = gr.Button("πŸ“¨ Skicka")

     chat = gr.Chatbot(type="messages", label="Chat")
     chat_hist = gr.State([])

-    # wiring
-    test_btn.click(test_retrieval, inputs=[test_in], outputs=[test_out])
-    send.click(chat_fn, inputs=[q_in, temp, chat_hist], outputs=[chat, chat_hist])

 if __name__ == "__main__":
-    demo.launch(share=True)
 
+# ── app.py ───────────────────────────────────────────────────────────
+import os, logging, textwrap
 import gradio as gr
+from transformers import pipeline, AutoTokenizer
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

+# ── CONFIG ───────────────────────────────────────────────────────────
 DOCS_DIR = "document"
 INDEX_DIR = "faiss_index"
 EMB_MODEL = "KBLab/sentence-bert-swedish-cased"
 LLM_MODEL = "tiiuae/falcon-rw-1b"

+CHUNK_SIZE = 400
+CHUNK_OVERLAP = 40
+CTX_TOK_MAX = 750  # leaves margin for the question + answer
+MAX_NEW_TOKENS = 128
+K = 10
 DEFAULT_TEMP = 0.3

+# ── LOGGING ──────────────────────────────────────────────────────────
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)

+# ── 1) Index (build or load) ─────────────────────────────────────
 emb = HuggingFaceEmbeddings(model_name=EMB_MODEL)
 if os.path.isdir(INDEX_DIR):
+    log.info(f"πŸ”„ Loading index from {INDEX_DIR}")
     vs = FAISS.load_local(INDEX_DIR, emb)
 else:
+    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+    docs, pdfs = [], []
     for fn in os.listdir(DOCS_DIR):
         if fn.lower().endswith(".pdf"):
+            chunks = splitter.split_documents(PyPDFLoader(os.path.join(DOCS_DIR, fn)).load())
             for c in chunks:
                 c.metadata["source"] = fn
+            docs.extend(chunks); pdfs.append(fn)
+    vs = FAISS.from_documents(docs, emb); vs.save_local(INDEX_DIR)
+    log.info(f"βœ… Built index – {len(pdfs)} PDFs / {len(docs)} chunks")

 retriever = vs.as_retriever(search_kwargs={"k": K})

+# ── 2) LLM pipeline & tokenizer ─────────────────────────────────────
+log.info("πŸš€ Initializing LLM …")
+gen_pipe = pipeline("text-generation", model=LLM_MODEL, device=-1, max_new_tokens=MAX_NEW_TOKENS)
+tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
+log.info("βœ… LLM ready")
+
+# ── 3) Helper functions ──────────────────────────────────────────────
+def build_prompt(query: str, docs):
+    """
+    Packs as many chunks as will fit within CTX_TOK_MAX tokens.
+    """
+    context_parts = []
+    total_ctx_tok = 0
+    for d in docs:
+        tok_len = len(tokenizer.encode(d.page_content))
+        if total_ctx_tok + tok_len > CTX_TOK_MAX:
+            break
+        context_parts.append(d.page_content)
+        total_ctx_tok += tok_len
+
+    context = "\n\n---\n\n".join(context_parts)
+    return textwrap.dedent(f"""\
         Du är en hjälpsam assistent som svarar på svenska.
+        Kontext (hämtat ur PDF-dokument):

         {context}

         Fråga: {query}
         Svar (svenska):""").strip()

+def test_retrieval(q):  # quick retrieval test, no LLM
+    docs = retriever.invoke(q)
+    return "\n\n".join([f"{i+1}. ({d.metadata['source']}) {d.page_content[:160]}…" for i, d in enumerate(docs)]) or "🚫 Inga trÀffar"
+
+def chat_fn(q, temp, history):
+    history = history or []
+    history.append({"role": "user", "content": q})
+
+    docs = retriever.invoke(q)
+    if not docs:
+        history.append({"role": "assistant", "content": "🚫 Hittade inget relevant."})
+        return history, history
+
+    prompt = build_prompt(q, docs)
+    log.info(f"Prompt tokens={len(tokenizer.encode(prompt))} temp={temp}")

     try:
+        ans = gen_pipe(
+            prompt,
+            temperature=float(temp),
+            max_new_tokens=MAX_NEW_TOKENS,
+            pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            do_sample=True,
+            return_full_text=False
+        )[0]["generated_text"]
     except Exception as e:
+        log.exception("Generation error")
+        ans = f"❌ Fel: {e}"

+    src_hint = docs[0].metadata["source"]
+    history.append({"role": "assistant", "content": f"**(Källa: {src_hint})**\n\n{ans}"})
     return history, history

+# ── 4) Gradio UI ────────────────────────────────────────────────────
 with gr.Blocks() as demo:
+    gr.Markdown("# πŸ“š Svensk RAG-chat")
+    gr.Markdown(f"**PDF-filer:** {', '.join(os.listdir(DOCS_DIR)) or '–'}")

     with gr.Row():
+        q_test = gr.Textbox(label="πŸ”Ž Test Retrieval")
+        b_test = gr.Button("Testa")
+        o_test = gr.Textbox(label="Chunkar")

     with gr.Row():
+        q_in = gr.Textbox(label="Fråga", placeholder="Ex: Vad handlar dokumenten om?")
         temp = gr.Slider(0, 1, value=DEFAULT_TEMP, step=0.05, label="Temperatur")
+        b_send = gr.Button("Skicka")

     chat = gr.Chatbot(type="messages", label="Chat")
     chat_hist = gr.State([])

+    b_test.click(test_retrieval, inputs=[q_test], outputs=[o_test])
+    b_send.click(chat_fn, inputs=[q_in, temp, chat_hist], outputs=[chat, chat_hist])

 if __name__ == "__main__":
+    demo.launch(share=True)  # remove share=True if you want to keep it private
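One caveat that sits outside this diff: recent langchain-community releases make FAISS.load_local raise unless pickle deserialization is explicitly allowed, so the load branch can fail on restart even though the build branch works. A hedged sketch of the adjusted call in the load branch; the flag itself exists in current versions, but whether the Space's pinned version requires it is an assumption, since requirements.txt is not part of this commit:

vs = FAISS.load_local(
    INDEX_DIR,
    emb,
    allow_dangerous_deserialization=True,  # only safe for an index you built yourself
)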