cwinkler committed on
Commit
28db3d5
·
verified ·
1 Parent(s): 01a64ac

Upload 7 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chroma_langchain/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — Hugging Face Space (Gradio) using a prebuilt Chroma index
2
+ # Embeddings: nomic-ai/nomic-embed-text-v1.5 (HF), trust_remote_code=True, normalize_embeddings=True
3
+
4
+ import os
5
+ import gradio as gr
6
+
7
+ # Silence Chroma telemetry noise
8
+ os.environ["CHROMA_TELEMETRY_DISABLED"] = "1"
9
+
10
+ from chromadb.config import Settings
11
+ from langchain_chroma import Chroma
12
+ from langchain_community.embeddings import HuggingFaceEmbeddings
13
+
14
# -------- Config (can be overridden via Space "Variables") --------
PERSIST_DIR = os.getenv("PERSIST_DIR", "./chroma_langchain")  # path to your committed Chroma index
EMB_MODEL = os.getenv("EMB_MODEL", "nomic-ai/nomic-embed-text-v1.5")  # must match the index-build model
TOPK_DEF = int(os.getenv("TOPK", "5"))  # default number of results shown in the UI

# Embedding function for query text — must match the model used to build the index.
# trust_remote_code: the nomic embedding model ships custom modeling code on the Hub.
# normalize_embeddings=True: query vectors are unit-normalized; presumably the index
# was built the same way — mismatched normalization would skew scores (verify).
EMBEDDINGS = HuggingFaceEmbeddings(
    model_name=EMB_MODEL,
    model_kwargs={"trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True},
)
25
+
26
def load_vector_store():
    """Open the persisted Chroma collection for query-time search.

    Returns:
        A pair ``(vs, error)``: ``(store, None)`` on success, or
        ``(None, message)`` with a human-readable diagnostic on failure.
    """
    try:
        store = Chroma(
            persist_directory=PERSIST_DIR,
            embedding_function=EMBEDDINGS,
            client_settings=Settings(anonymized_telemetry=False),
        )
        # Sanity check: forces the collection open so a missing/broken index
        # fails here rather than on the first user query. (Private attribute —
        # there is no cheap public probe for this.)
        _ = store._collection.count()
    except Exception as err:
        # Helpful diagnostics: list whichever collections actually exist on disk.
        try:
            import chromadb

            diag_client = chromadb.PersistentClient(
                path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False)
            )
            existing = [c.name for c in diag_client.list_collections()]
        except Exception:
            existing = []
        msg = (
            f"Failed to load Chroma store at '{PERSIST_DIR}'. "
            f"Existing collections: {existing or '—'}. "
            "Check that the index folder is present in the Space and the collection name matches."
        )
        return None, f"{msg}\n\nDetails: {err}"
    return store, None
56
+
57
# Load the store once at import time; LOAD_ERR carries a user-facing message on failure.
VS, LOAD_ERR = load_vector_store()
58
+
59
def search(query: str, k: int = TOPK_DEF):
    """Run a semantic similarity search and render the hits as Markdown.

    Args:
        query: Free-text query; blank/whitespace-only input is rejected.
        k: Number of results to return (coerced to int; slider may pass float).

    Returns:
        A Markdown string — either the ranked results or an error/notice message.
    """
    if LOAD_ERR:
        return f"⚠️ {LOAD_ERR}"
    q = (query or "").strip()
    if not q:
        return "Please enter a query."
    try:
        results = VS.similarity_search_with_score(q, k=int(k))
    except Exception as e:
        return f"Search failed: {e}"
    if not results:
        return "No results."

    lines = [f"### Top {len(results)} results"]
    for i, (doc, score) in enumerate(results, 1):
        meta = doc.metadata or {}
        # Fix: source was extracted but never shown; include it so users can
        # trace each hit back to its document.
        src = meta.get("source") or meta.get("file_path") or "(no source)"
        snippet = (doc.page_content[:800] + "…") if len(doc.page_content) > 800 else doc.page_content
        # NOTE(review): Chroma returns a distance here (lower = closer), not a
        # cosine similarity — the "Similarity" label may be misleading; confirm.
        lines.append(f"**[{i}]** {src} \nSimilarity: `{score:.4f}`\n\n> {snippet}")
    lines.append("\n> **Disclaimer:** Models can produce incorrect or misleading statements. Verify with sources.")
    return "\n\n".join(lines)
80
+
81
# ---- Gradio UI: query box + Top-K slider feeding search(); results as Markdown ----
with gr.Blocks(title="Semantische Suchmaschine für BGH Leitsätzen v0.1") as demo:
    gr.Markdown(
        """
        ## Semantische Suchmaschine für BGH Leitsätzen v0.1
        Datensatz: Leitsätze von 87 Entscheidungen des 2. BGH Senats

        **Wie es funktioniert:** Ermöglicht die semantische Suche im Datensatz und gibt die Entscheidungen geordnet nach Ähnlichkeitswerten zurück.

        **Versuche beispielsweise:**
        - `Ist eine Partnerschaft zwischen einem Tierarzt und einem Betriebswirt zulässig?` → erwartetes Aktenzeichen **II ZB 6/21**

        *Disclaimer:* Models may produce incorrect or misleading statements. Verify with sources.
        """
    )
    with gr.Row():
        # Placeholder now matches the dataset (BGH case law) instead of the
        # unrelated EPC patent-law example.
        q = gr.Textbox(
            label="Query",
            placeholder="Ist eine Partnerschaft zwischen einem Tierarzt und einem Betriebswirt zulässig?",
        )
        k = gr.Slider(1, 20, value=TOPK_DEF, step=1, label="Top-K")
    out = gr.Markdown()
    gr.Button("Search").click(fn=search, inputs=[q, k], outputs=[out])

# Guard the launch so the module stays importable (e.g. for tests); Spaces
# execute app.py as __main__, so behavior there is unchanged.
if __name__ == "__main__":
    demo.launch()
chroma_langchain/4b6fff2a-e7df-4306-8325-d78eab7960b5/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23add52afbe7588391f32d3deffb581b2663d2e2ad8851aba7de25e6b3f66761
3
+ size 32120000
chroma_langchain/4b6fff2a-e7df-4306-8325-d78eab7960b5/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8c7f00b4415698ee6cb94332eff91aedc06ba8e066b1f200e78ca5df51abb57
3
+ size 100
chroma_langchain/4b6fff2a-e7df-4306-8325-d78eab7960b5/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b56dc2a9efbf97230f629eef146503c7fe46a2bb0a57dfc90e6ff3d9231af35a
3
+ size 40000
chroma_langchain/4b6fff2a-e7df-4306-8325-d78eab7960b5/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
+ size 0
chroma_langchain/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52db109ad2e66225fb70611dd4671c6edf7ca0e331cac91d79e495ae67f36f4d
3
+ size 917504
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ chromadb
3
+ langchain-chroma
4
+ langchain-community
5
+ sentence-transformers
6
+ huggingface-hub
7
+ einops