# File: main/app.py
# Purpose: Fetch only the "relevant" page content (title, key metadata, clean body text, and hyperlinks)
# instead of returning full HTML. Output is compact and configurable to reduce verbosity.

import gradio as gr  # UI library
import requests  # HTTP client
from bs4 import BeautifulSoup  # HTML parsing
from readability import Document  # Readability algorithm to isolate main content
from urllib.parse import urljoin, urldefrag, urlparse  # URL helpers
import re  # For whitespace cleanup and simple formatting

# -------------------------------
# HTTP fetching with sane defaults
# -------------------------------

def _http_get(url: str) -> requests.Response:
    """
    Make an HTTP GET request with headers and a timeout.
    Layman's terms: downloads the webpage safely and politely.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    # Short timeouts so the app isn't stuck forever
    return requests.get(url, headers=headers, timeout=15)

# ----------------------------------------
# Helpers: text cleanup & friendly trimming
# ----------------------------------------

def _normalize_whitespace(text: str) -> str:
    """
    Layman's terms: squash weird spacing and too many blank lines.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)  # collapse runs of spaces
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())  # max 1 blank line at a time
    return text.strip()

def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
    """
    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True

def _domain_of(url: str) -> str:
    """
    Layman's terms: show a friendly domain like example.com.
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""

# -----------------------------------
# Metadata extraction (title, etc.)
# -----------------------------------

def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
    """
    Layman's terms: grab useful fields like title, description, site name, and canonical link.
""" meta = {} # Title preference: > og:title > twitter:title title_candidates = [ (soup.title.string if soup.title and soup.title.string else None), _og(soup, "og:title"), _meta(soup, "twitter:title"), ] meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "") # Description preference: meta[name=description] > og:description > twitter:description desc_candidates = [ _meta(soup, "description"), _og(soup, "og:description"), _meta(soup, "twitter:description"), ] meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "") # Canonical URL if provided (helps dedupe / standardize) link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v) meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else "" # Site name (nice for context) meta["site_name"] = (_og(soup, "og:site_name") or "").strip() # Language (if present) html_tag = soup.find("html") meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else "" # Final resolved URL and domain meta["fetched_url"] = final_url meta["domain"] = _domain_of(final_url) return meta def _meta(soup: BeautifulSoup, name: str) -> str | None: tag = soup.find("meta", attrs={"name": name}) return tag.get("content") if tag and tag.has_attr("content") else None def _og(soup: BeautifulSoup, prop: str) -> str | None: tag = soup.find("meta", attrs={"property": prop}) return tag.get("content") if tag and tag.has_attr("content") else None # --------------------------------------------------------- # Main content extraction with Readability + gentle cleanup # --------------------------------------------------------- def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]: """ Layman's terms: use Readability to find the article body, then clean it to plain text. Returns (clean_text, soup_of_readable_html) for link scraping. """ # Readability gives us a simplified article HTML doc = Document(html) readable_html = doc.summary(html_partial=True) # Parse the simplified HTML so we can clean it up further s = BeautifulSoup(readable_html, "lxml") # Remove obviously noisy elements if present for sel in ["script", "style", "noscript", "iframe", "svg"]: for tag in s.select(sel): tag.decompose() # Extract text with paragraphs preserved, then normalize whitespace text_parts = [] for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]): # Keep list items and headers to retain structure without being too verbose chunk = p.get_text(" ", strip=True) if chunk: text_parts.append(chunk) clean_text = _normalize_whitespace("\n\n".join(text_parts)) return clean_text, s # ------------------------------------------ # Link extraction from the simplified content # ------------------------------------------ def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]: """ Layman's terms: pull out clickable links from the article content only, turn them into absolute URLs, drop junk, dedupe, and cap the list. 
""" seen = set() links: list[tuple[str, str]] = [] for a in readable_soup.find_all("a", href=True): href = a.get("href").strip() # Ignore anchors, mailto, javascript, and empty if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"): continue # Resolve relative URLs and strip URL fragments (#section) absolute = urljoin(base_url, href) absolute, _ = urldefrag(absolute) if absolute in seen: continue seen.add(absolute) text = a.get_text(" ", strip=True) # Keep link text concise if len(text) > 120: text = text[:117] + "…" links.append((text or absolute, absolute)) if len(links) >= max_links > 0: break return links # ------------------------- # Formatter: compact output # ------------------------- def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]], include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str: """ Layman's terms: turn the pieces into a neat, compact Markdown string. """ lines = [] # Title header title = meta.get("title") or meta.get("domain") or "Untitled" lines.append(f"# {title}") # Metadata (compact) if include_metadata: md = [] # Only show fields that exist to keep things tight if meta.get("description"): md.append(f"- **Description:** {meta['description']}") if meta.get("site_name"): md.append(f"- **Site:** {meta['site_name']}") if meta.get("canonical"): md.append(f"- **Canonical:** {meta['canonical']}") if meta.get("lang"): md.append(f"- **Language:** {meta['lang']}") if meta.get("fetched_url"): md.append(f"- **Fetched From:** {meta['fetched_url']}") if md: lines.append("## Metadata") lines.extend(md) # Body text if include_text and body: # For "Brief", show a very small excerpt even after truncation if verbosity == "Brief": brief, was_more = _truncate(body, 800) lines.append("## Text") lines.append(brief) if was_more or body_truncated: lines.append("\n> (Trimmed for brevity)") else: lines.append("## Text") lines.append(body) if body_truncated: lines.append("\n> (Trimmed for brevity)") # Links if include_links and links: lines.append(f"## Links ({len(links)})") for text, url in links: lines.append(f"- [{text}]({url})") return "\n\n".join(lines).strip() # -------------------------------- # Gradio-facing function (the app) # -------------------------------- def extract_relevant( url: str, verbosity: str = "Standard", include_metadata: bool = True, include_text: bool = True, include_links: bool = True, max_chars: int = 3000, max_links: int = 20 ) -> str: """ Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary. """ if not url or not url.strip(): return "Please enter a valid URL." 
    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Respect the final resolved URL (after redirects)
    final_url = str(resp.url)

    # Only process HTML-ish responses
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Decode as text (requests usually sets encoding; otherwise guess)
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Full page soup (to extract metadata accurately)
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    # Extract main body text using Readability
    body_text, readable_soup = _extract_main_text(html)

    # If the body is suspiciously empty, fall back to a simpler text strategy
    if not body_text:
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Enforce verbosity presets unless user overrides via slider
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
    target_cap = preset_caps.get(verbosity, 3000)

    # Use the *smaller* of user cap and preset to keep things tidy
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    # Extract links from the readable portion only (cleaner than whole DOM)
    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    # Build compact Markdown
    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity,
    )

    return md or "No content could be extracted."

# ------------------
# Gradio UI (Blocks)
# ------------------

with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Title & subtitle for clarity
    gr.Markdown("# Fetch MCP — Clean Extract")
    gr.Markdown(
        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
        "Use Verbosity and caps to keep it tight."
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
        fetch_btn = gr.Button("Fetch Clean Content")

    with gr.Accordion("Options", open=False):
        with gr.Row():
            verbosity = gr.Dropdown(
                label="Verbosity",
                choices=["Brief", "Standard", "Full"],
                value="Standard",
                info="Controls how much text you get back.",
            )
            max_chars = gr.Slider(
                400, 12000, value=3000, step=100,
                label="Max Characters (body text)",
                info="Hard cap for body text. Lower = less verbose.",
            )
            max_links = gr.Slider(
                0, 100, value=20, step=1,
                label="Max Links",
                info="Limit how many hyperlinks we include.",
            )
        with gr.Row():
            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
            include_text = gr.Checkbox(value=True, label="Include Main Text")
            include_links = gr.Checkbox(value=True, label="Include Links")

    # Output as Markdown (compact and readable)
    out = gr.Markdown(label="Result")

    # Wire up the click
    fetch_btn.click(
        fn=extract_relevant,
        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
        outputs=out,
    )

# Keep MCP server enabled
if __name__ == "__main__":
    demo.launch(mcp_server=True)
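
# ---------------------------------------------------------------------
# Illustrative usage (a sketch, not part of the app): the extractor can
# be called directly from a REPL or script, bypassing the Gradio UI.
# Assumes this module is importable as "app"; the URL is a placeholder.
# ---------------------------------------------------------------------
#   from app import extract_relevant
#   print(extract_relevant(
#       "https://example.com/article",
#       verbosity="Brief",
#       max_chars=1200,
#       max_links=5,
#   ))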