import gradio as gr
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Globals
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
vector_index = None
indexed_chunks = []
url_cache = {}  # url -> (chunks, faiss index)

FIRECRAWL_API_KEY = "YOUR_FIRECRAWL_API_KEY"  # Replace with your Firecrawl API key
OLLAMA_URL = "http://localhost:11434/api/generate"  # Default local Ollama endpoint


# Ollama call helper
def ollama_generate(system_prompt: str, user_prompt: str, model: str = "llama2") -> str:
    """
    Calls the local Ollama server via its HTTP API and returns the generated text.
    """
    try:
        response = requests.post(
            OLLAMA_URL,
            json={
                "model": model,
                "system": system_prompt,
                "prompt": user_prompt,
                "stream": False,  # return one JSON object instead of a token stream
            },
            timeout=300,
        )
        response.raise_for_status()
        # Non-streaming responses look like {"response": "generated text...", ...}
        return response.json().get("response", "").strip()
    except Exception as e:
        return f"Error calling Ollama: {str(e)}"


# Scrape URL and embed content
def scrape_and_embed(url: str):
    global vector_index, indexed_chunks

    if url in url_cache:
        indexed_chunks, vector_index = url_cache[url]
        return f"✅ Loaded cached content for {url}"

    # Firecrawl scrape (v1 API): request markdown output for the page
    response = requests.post(
        "https://api.firecrawl.dev/v1/scrape",
        headers={"Authorization": f"Bearer {FIRECRAWL_API_KEY}"},
        json={"url": url, "formats": ["markdown"]},
    )
    if response.status_code != 200:
        return f"❌ Failed to scrape URL: {response.status_code}"

    # The v1 API nests the scraped content under "data"
    content = response.json().get("data", {}).get("markdown", "")
    chunks = [line.strip() for line in content.split("\n") if len(line.strip()) > 50]
    indexed_chunks = chunks[:100]

    if not indexed_chunks:
        return f"❌ No usable text content found at {url}"

    # Embeddings + FAISS index
    embeddings = embedding_model.encode(indexed_chunks)
    vector_index = faiss.IndexFlatL2(embeddings.shape[1])
    vector_index.add(np.array(embeddings, dtype="float32"))

    # Cache it
    url_cache[url] = (indexed_chunks, vector_index)

    return f"✅ Scraped and indexed {len(indexed_chunks)} chunks from {url}"


# Main RAG + Ollama Q&A function
def web_rag_ollama(combined_input: str) -> str:
    """
    Expects input: "<url> || <question>"
    Scrapes the URL (cached), embeds it, retrieves context, then asks Ollama to answer.
    """
    global vector_index, indexed_chunks

    if "||" not in combined_input:
        return "❌ Input format must be: <url> || <question>"

    url, question = [part.strip() for part in combined_input.split("||", 1)]

    # Scrape and embed
    scrape_status = scrape_and_embed(url)
    if scrape_status.startswith("❌"):
        return scrape_status

    # Retrieval
    if not indexed_chunks or vector_index is None:
        return "⚠️ No indexed content available."

    query_emb = embedding_model.encode([question])
    k = min(3, len(indexed_chunks))
    D, I = vector_index.search(np.array(query_emb, dtype="float32"), k)
    context = "\n\n".join(indexed_chunks[i] for i in I[0] if i >= 0)

    # Ollama prompt engineering
    system_prompt = (
        "You are a helpful assistant. Use the provided context to answer the question. "
        "If the answer is not contained in the context, say you don't know."
    )
    user_prompt = f"Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"

    # Call Ollama
    answer = ollama_generate(system_prompt, user_prompt)

    return f"**Scrape status:** {scrape_status}\n\n**Answer:**\n{answer}"


# Gradio interface with MCP support
demo = gr.Interface(
    fn=web_rag_ollama,
    inputs=gr.Textbox(
        label="Input",
        placeholder="Enter input in format:\nhttps://example.com || What is this page about?",
    ),
    outputs=gr.Textbox(label="Answer"),
    title="🌐 Web RAG Q&A with Ollama (MCP-ready)",
    description="Scrape a URL, embed its content, and answer questions using a local Ollama LLM.",
)

if __name__ == "__main__":
    # mcp_server=True also exposes the function as an MCP tool (requires the gradio[mcp] extra)
    demo.launch(mcp_server=True)
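
# Example client usage (a minimal sketch, not part of the app itself): once the app is
# running, it can also be called programmatically with the separately installed
# gradio_client package. The URL below assumes Gradio's default local port; gr.Interface
# exposes its function under the "/predict" endpoint.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict(
#       "https://example.com || What is this page about?",
#       api_name="/predict",
#   )
#   print(result)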