Spaces:

cwinkler
/

bgh

Running

App Files Files Community

cwinkler commited on 5 days ago

Commit

3cbd6c7

verified ·

1 Parent(s): d50ca3b

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -101

app.py CHANGED Viewed

@@ -1,101 +1,103 @@
-# app.py — Hugging Face Space (Gradio) using a prebuilt Chroma index
-# Embeddings: nomic-ai/nomic-embed-text-v1.5 (HF), trust_remote_code=True, normalize_embeddings=True
-import os
-import gradio as gr
-# Silence Chroma telemetry noise
-os.environ["CHROMA_TELEMETRY_DISABLED"] = "1"
-from chromadb.config import Settings
-from langchain_chroma import Chroma
-from langchain_community.embeddings import HuggingFaceEmbeddings
-# -------- Config (can be overridden via Space "Variables") --------
-PERSIST_DIR = os.getenv("PERSIST_DIR", "./chroma_langchain")     # path to your committed Chroma index
-EMB_MODEL   = os.getenv("EMB_MODEL", "nomic-ai/nomic-embed-text-v1.5")
-TOPK_DEF    = int(os.getenv("TOPK", "5"))
-# Embedding function for query text — must match the model used to build the index
-EMBEDDINGS = HuggingFaceEmbeddings(
-    model_name=EMB_MODEL,
-    model_kwargs={"trust_remote_code": True},
-    encode_kwargs={"normalize_embeddings": True},
-)
-def load_vector_store():
-    """
-    Load the persisted Chroma collection with the embedding function for query-time encoding.
-    Returns (vs, error_message_or_None)
-    """
-    try:
-        vs = Chroma(
-            persist_directory=PERSIST_DIR,
-            embedding_function=EMBEDDINGS,
-            client_settings=Settings(anonymized_telemetry=False),
-        )
-        # sanity check (forces collection open)
-        _ = vs._collection.count()
-        return vs, None
-    except Exception as e:
-        # Helpful diagnostics: list available collections
-        try:
-            import chromadb
-            client = chromadb.PersistentClient(
-                path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False)
-            )
-            existing = [c.name for c in client.list_collections()]
-        except Exception:
-            existing = []
-        msg = (
-            f"Failed to load Chroma store at '{PERSIST_DIR}'. "
-            f"Existing collections: {existing or '—'}. "
-            "Check that the index folder is present in the Space and the collection name matches."
-        )
-        return None, f"{msg}\n\nDetails: {e}"
-VS, LOAD_ERR = load_vector_store()
-def search(query: str, k: int = TOPK_DEF):
-    if LOAD_ERR:
-        return f"⚠️ {LOAD_ERR}"
-    q = (query or "").strip()
-    if not q:
-        return "Please enter a query."
-    try:
-        results = VS.similarity_search_with_score(q, k=int(k))
-    except Exception as e:
-        return f"Search failed: {e}"
-    if not results:
-        return "No results."
-    lines = [f"### Top {len(results)} results"]
-    for i, (doc, score) in enumerate(results, 1):
-        meta = doc.metadata or {}
-        src = meta.get("source") or meta.get("file_path") or "(no source)"
-        snippet = (doc.page_content[:800] + "…") if len(doc.page_content) > 800 else doc.page_content
-        lines.append(f"**[{i}]**  \nSimilarity: `{score:.4f}`\n\n> {snippet}")
-    lines.append("\n> **Disclaimer:** Models can produce incorrect or misleading statements. Verify with sources.")
-    return "\n\n".join(lines)
-with gr.Blocks(title="Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1") as demo:
-    gr.Markdown(
-        """
-        ## Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1
-        Datensatz: **21.603 Leitsatzentscheidungen des BGH (ab dem Jahr 2000) extrahiert aus https://zenodo.org/records/15153244**
-        **Wie es funktioniert:** Ermöglicht die semantische Suche im Datensatz und gibt die Entscheidungen geordnet nach Ähnlichkeitswerten zurück.
-        **Versuche bespielsweise:**
-        - `Kann KI Erfinder sein?` → erwartetes Aktenzeichen **X ZB 5/22**
-        *Disclaimer:* Models may produce incorrect or misleading statements. Verify with sources.
-        """
-    )
-    with gr.Row():
-        q = gr.Textbox(label="Query", placeholder="Kann KI Erfinder sein?")
-        k = gr.Slider(1, 20, value=TOPK_DEF, step=1, label="Top-K")
-    out = gr.Markdown()
-    gr.Button("Search").click(fn=search, inputs=[q, k], outputs=[out])
-demo.launch()

+# app.py — Hugging Face Space (Gradio) using a prebuilt Chroma index
+# Embeddings: nomic-ai/nomic-embed-text-v1.5 (HF), trust_remote_code=True, normalize_embeddings=True
+import os
+import gradio as gr
+# Silence Chroma telemetry noise
+os.environ["CHROMA_TELEMETRY_DISABLED"] = "1"
+from chromadb.config import Settings
+from langchain_chroma import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+# -------- Config (can be overridden via Space "Variables") --------
+PERSIST_DIR = os.getenv("PERSIST_DIR", "./chroma_langchain")     # path to your committed Chroma index
+EMB_MODEL   = os.getenv("EMB_MODEL", "nomic-ai/nomic-embed-text-v1.5")
+TOPK_DEF    = int(os.getenv("TOPK", "5"))
+# Embedding function for query text — must match the model used to build the index
+EMBEDDINGS = HuggingFaceEmbeddings(
+    model_name=EMB_MODEL,
+    model_kwargs={"trust_remote_code": True},
+    encode_kwargs={"normalize_embeddings": True},
+)
+def load_vector_store():
+    """
+    Load the persisted Chroma collection with the embedding function for query-time encoding.
+    Returns (vs, error_message_or_None)
+    """
+    try:
+        vs = Chroma(
+            persist_directory=PERSIST_DIR,
+            embedding_function=EMBEDDINGS,
+            client_settings=Settings(anonymized_telemetry=False),
+        )
+        # sanity check (forces collection open)
+        _ = vs._collection.count()
+        return vs, None
+    except Exception as e:
+        # Helpful diagnostics: list available collections
+        try:
+            import chromadb
+            client = chromadb.PersistentClient(
+                path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False)
+            )
+            existing = [c.name for c in client.list_collections()]
+        except Exception:
+            existing = []
+        msg = (
+            f"Failed to load Chroma store at '{PERSIST_DIR}'. "
+            f"Existing collections: {existing or '—'}. "
+            "Check that the index folder is present in the Space and the collection name matches."
+        )
+        return None, f"{msg}\n\nDetails: {e}"
+VS, LOAD_ERR = load_vector_store()
+def search(query: str, k: int = TOPK_DEF):
+    if LOAD_ERR:
+        return f"⚠️ {LOAD_ERR}"
+    q = (query or "").strip()
+    if not q:
+        return "Please enter a query."
+    try:
+        results = VS.similarity_search_with_score(q, k=int(k))
+    except Exception as e:
+        return f"Search failed: {e}"
+    if not results:
+        return "No results."
+    lines = [f"### Top {len(results)} results"]
+    for i, (doc, score) in enumerate(results, 1):
+        meta = doc.metadata or {}
+        src = meta.get("source") or meta.get("file_path") or "(no source)"
+        snippet = (doc.page_content[:800] + "…") if len(doc.page_content) > 800 else doc.page_content
+        lines.append(f"**[{i}]**  \nSimilarity: `{score:.4f}`\n\n> {snippet}")
+    lines.append("\n> **Disclaimer:** Models can produce incorrect or misleading statements. Verify with sources.")
+    return "\n\n".join(lines)
+with gr.Blocks(title="Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1") as demo:
+    gr.Markdown(
+        """
+        ## Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1
+        **Datensatz: 21.603 Leitsatzentscheidungen des BGH (ab dem Jahr 2000) extrahiert aus https://zenodo.org/records/15153244**
+        **Modell:** nomic-ai/nomic-embed-text-v1.5
+        **Wie es funktioniert:** Ermöglicht die semantische Suche im Datensatz und gibt die Entscheidungen geordnet nach Ähnlichkeitswerten zurück.
+        **Versuche bespielsweise:**
+        - `Kann KI Erfinder sein?` → erwartetes Aktenzeichen **X ZB 5/22**
+        *Disclaimer:* Models may produce incorrect or misleading statements. Verify with sources.
+        """
+    )
+    with gr.Row():
+        q = gr.Textbox(label="Query", placeholder="Kann KI Erfinder sein?")
+        k = gr.Slider(1, 20, value=TOPK_DEF, step=1, label="Top-K")
+    out = gr.Markdown()
+    gr.Button("Search").click(fn=search, inputs=[q, k], outputs=[out])
+demo.launch()