# app.py — Hugging Face Space (Gradio) using a prebuilt Chroma index
# Embeddings: nomic-ai/nomic-embed-text-v1.5 (HF), trust_remote_code=True, normalize_embeddings=True

import os

import gradio as gr

# Silence Chroma telemetry noise. Chroma reads the ANONYMIZED_TELEMETRY
# env var; it must be set before chromadb is imported.
os.environ["ANONYMIZED_TELEMETRY"] = "False"

from chromadb.config import Settings
from langchain_chroma import Chroma

# HuggingFaceEmbeddings now lives in the langchain-huggingface package;
# fall back to the deprecated langchain_community location if it is absent.
try:
    from langchain_huggingface import HuggingFaceEmbeddings
except ImportError:
    from langchain_community.embeddings import HuggingFaceEmbeddings

# -------- Config (can be overridden via Space "Variables") --------
PERSIST_DIR = os.getenv("PERSIST_DIR", "./chroma_langchain")  # path to your committed Chroma index
COLLECTION = os.getenv("COLLECTION_NAME", "langchain")        # "langchain" is langchain-chroma's default
EMB_MODEL = os.getenv("EMB_MODEL", "nomic-ai/nomic-embed-text-v1.5")
TOPK_DEF = int(os.getenv("TOPK", "5"))

# Embedding function for query text — must match the model used to build the index
EMBEDDINGS = HuggingFaceEmbeddings(
    model_name=EMB_MODEL,
    model_kwargs={"trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True},
)


def load_vector_store():
    """
    Load the persisted Chroma collection with the embedding function for
    query-time encoding. Returns (vs, error_message_or_None).
    """
    try:
        vs = Chroma(
            collection_name=COLLECTION,
            persist_directory=PERSIST_DIR,
            embedding_function=EMBEDDINGS,
            client_settings=Settings(anonymized_telemetry=False),
        )
        # Sanity check: touching the collection forces it to open, so a
        # missing or corrupt index fails here rather than on the first query.
        _ = vs._collection.count()
        return vs, None
    except Exception as e:
        # Helpful diagnostics: list the collections that actually exist
        try:
            import chromadb

            client = chromadb.PersistentClient(
                path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False)
            )
            existing = [c.name for c in client.list_collections()]
        except Exception:
            existing = []
        msg = (
            f"Failed to load Chroma store at '{PERSIST_DIR}'. "
            f"Existing collections: {existing or '—'}. "
            "Check that the index folder is present in the Space and the "
            "collection name matches."
        )
        return None, f"{msg}\n\nDetails: {e}"


VS, LOAD_ERR = load_vector_store()


def search(query: str, k: int = TOPK_DEF):
    if LOAD_ERR:
        return f"⚠️ {LOAD_ERR}"
    q = (query or "").strip()
    if not q:
        return "Please enter a query."
    try:
        results = VS.similarity_search_with_score(q, k=int(k))
    except Exception as e:
        return f"Search failed: {e}"
    if not results:
        return "No results."
    lines = [f"### Top {len(results)} results"]
    for i, (doc, score) in enumerate(results, 1):
        meta = doc.metadata or {}
        src = meta.get("source") or meta.get("file_path") or "(no source)"
        snippet = (doc.page_content[:800] + "…") if len(doc.page_content) > 800 else doc.page_content
        # Chroma returns a distance here, not a similarity: lower is better.
        lines.append(f"**[{i}]** `{src}` \nDistance (lower is better): `{score:.4f}`\n\n> {snippet}")
    lines.append("\n> **Disclaimer:** Models can produce incorrect or misleading statements. Verify with sources.")
    return "\n\n".join(lines)


with gr.Blocks(title="Semantic Search Engine for BGH Headnotes (Leitsätze) v0.1") as demo:
    gr.Markdown(
        """
        ## Semantic Search Engine for BGH Headnotes (Leitsätze) v0.1

        Dataset: headnotes (Leitsätze) from 87 decisions of the BGH's Second Senate

        **How it works:** runs a semantic search over the dataset and returns
        the decisions ranked by similarity score.

        **Try, for example:**
        - `Ist eine Partnerschaft zwischen einem Tierarzt und einem Betriebswirt zulässig?`
          ("Is a partnership between a veterinarian and a business economist permissible?")
          → expected case number **II ZB 6/21**

        *Disclaimer:* Models may produce incorrect or misleading statements. Verify with sources.
        """
    )
    with gr.Row():
        q = gr.Textbox(
            label="Query",
            placeholder="Ist eine Partnerschaft zwischen einem Tierarzt und einem Betriebswirt zulässig?",
        )
        k = gr.Slider(1, 20, value=TOPK_DEF, step=1, label="Top-K")
    out = gr.Markdown()
    gr.Button("Search").click(fn=search, inputs=[q, k], outputs=[out])

demo.launch()
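
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of how an index compatible with this app
# could be built offline. This is not the original build script: the input
# folder, loader, and chunking parameters are assumptions; only the embedding
# model and normalize_embeddings=True have to match the settings above.
# Commented out so the Space only runs the app itself.
#
# import glob
# from langchain_community.document_loaders import TextLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
#
# docs = []
# for path in sorted(glob.glob("leitsaetze/*.txt")):  # hypothetical input folder
#     docs.extend(TextLoader(path, encoding="utf-8").load())
# chunks = RecursiveCharacterTextSplitter(
#     chunk_size=1000, chunk_overlap=150
# ).split_documents(docs)
# Chroma.from_documents(
#     chunks,
#     embedding=EMBEDDINGS,       # same model + normalization as at query time
#     collection_name=COLLECTION,
#     persist_directory=PERSIST_DIR,
# )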