Added caching of feed content and URI.
- functions/feed_extraction.py +27 -5
- functions/tools.py +36 -13
- rss_server.py +3 -2
functions/feed_extraction.py CHANGED
@@ -1,6 +1,8 @@
 '''Helper functions for MCP tools.'''

+import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -10,11 +12,15 @@ from boilerpy3 import extractors
 from boilerpy3.exceptions import HTMLExtractionError
 from findfeed import search as feed_search
 from googlesearch import search as google_search
+from upstash_redis import Redis

 FEED_URIS = {}
 RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
-
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_KEY']
+)

 def find_feed_uri(website: str) -> str:
     '''Attempts to find URI for RSS feed. First checks if string provided in
@@ -42,14 +48,26 @@ def find_feed_uri(website: str) -> str:
         feed_uri = website
         logger.info('%s looks like a feed URI already - using it directly', website)

-    # Next, check the cache to see if we already have this feed's URI
+    # Next, check the cache to see if we already have this feed's URI locally
     elif website in FEED_URIS:
         feed_uri = FEED_URIS[website]
-        logger.info('%s feed URI in cache: %s', website, feed_uri)
+        logger.info('%s feed URI in local cache: %s', website, feed_uri)
+
+    # Then, check to see if the URI is in the Redis cache
+    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    cache_hit = False
+
+    if feed_uri is None:
+        cached_uri = REDIS.get(cache_key)

-
+        if cached_uri:
+            cache_hit = True
+            feed_uri = cached_uri
+            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
+
+    # If none of those get it - try feedparse if it looks like a url
     # or else just google it
-
+    if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
             website_url = website
             logger.info('%s looks like a website URL', website)
@@ -63,6 +81,10 @@ def find_feed_uri(website: str) -> str:

     FEED_URIS[website] = feed_uri

+    # Add the feed URI to the redis cache if it wasn't already there
+    if cache_hit is False:
+        REDIS.set(cache_key, feed_uri)
+
     return feed_uri


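With this change, find_feed_uri() resolves a feed URI in three steps: the in-process FEED_URIS dict first, then the shared Upstash Redis cache, and only then full feed discovery, writing any newly discovered URI back to Redis so other replicas can reuse it. A minimal, self-contained sketch of that lookup order follows; the Redis URL and the discover_feed() stub are illustrative stand-ins, not the Space's real values:

import os
from upstash_redis import Redis

LOCAL_URIS = {}    # process-local cache: fastest, but lost on every restart

# Placeholder endpoint - the real module points at its own Upstash instance
REDIS = Redis(url='https://example.upstash.io', token=os.environ['UPSTASH_KEY'])

def discover_feed(website: str) -> str:
    '''Stand-in for the real discovery logic (findfeed plus Google search).'''
    return f'https://{website}/rss'

def find_uri(website: str) -> str:
    '''Resolves a feed URI, trying the cheapest source first.'''

    # 1. In-process dict: free lookup, but private to this replica
    if website in LOCAL_URIS:
        return LOCAL_URIS[website]

    # 2. Redis: shared across replicas and survives restarts
    key = f"{website.lower().replace(' ', '_')}-feed-uri"
    uri = REDIS.get(key)

    # 3. Full discovery: the slow path, so write the result back to Redis
    if uri is None:
        uri = discover_feed(website)
        REDIS.set(key, uri)

    LOCAL_URIS[website] = uri
    return uri

The cache_hit flag in the diff serves the same purpose as the write-back branch here: it skips the REDIS.set() call when the value was just read from Redis.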
functions/tools.py CHANGED
@@ -1,12 +1,16 @@
 '''Tool functions for MCP server'''

+import time
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs

+LOCAL_CACHE = {
+    'get_feed': {}
+}

-def get_feed(website: str) -> list:
+def get_feed(website: str, use_cache: bool = True) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
@@ -14,30 +18,49 @@ def get_feed(website: str) -> list:

     Args:
         website: URL or name of website to extract RSS feed content from
+        use_cache: check local cache for content from RSS feed first before
+            downloading data from the website's RSS feed

     Returns:
         JSON string containing the feed content or 'No feed found' if a RSS
         feed for the requested website could not be found
     '''

-    logger = logging.getLogger(__name__ + '.get_feed()')
+    start_time = time.time()
+
+    logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)

-    feed_uri = extraction_funcs.find_feed_uri(website)
-    logger.info('find_feed_uri() returned %s', feed_uri)
+    # Check to see if we have this feed cached, if desired
+    if use_cache is True and website in LOCAL_CACHE['get_feed']:
+        content = LOCAL_CACHE['get_feed'][website]
+        logger.info('Got feed content from local cache')
+
+    else:
+
+        # Find the feed's URI from the website name/URL
+        feed_uri = extraction_funcs.find_feed_uri(website)
+        logger.info('find_feed_uri() returned %s', feed_uri)

-    if 'No feed found' in feed_uri:
-        return 'No feed found'
+        if 'No feed found' in feed_uri:
+            logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+            return 'No feed found'
+
+        # Parse and extract content from the feed
+        content = extraction_funcs.parse_feed(feed_uri)
+        logger.info('parse_feed() returned %s entries', len(list(content.keys())))

-    content = extraction_funcs.parse_feed(feed_uri)
-    logger.info('parse_feed() returned %s entries', len(list(content.keys())))
+        # Summarize each post in the feed
+        for i, item in content.items():

-    for i, item in content.items():
+            if item['content'] is not None:
+                summary = summarization_funcs.summarize_content(item['content'])
+                content[i]['summary'] = summary

-        if item['content'] is not None:
-            summary = summarization_funcs.summarize_content(item['content'])
-            content[i]['summary'] = summary
+            content[i].pop('content', None)

-            content[i].pop('content', None)
+        LOCAL_CACHE['get_feed'][website] = content

+        logger.info('Completed in %s seconds', round(time.time()-start_time, 2))

     return json.dumps(content)
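Because LOCAL_CACHE is a module-level dict, repeat calls for the same website now skip feed discovery, parsing and summarization entirely, and cache hits return the already summarized entries, since content is stored after the 'content' key has been popped. Assuming the package is importable, the effect of the new use_cache flag can be observed with a quick timing loop ('hackernews.com' is just an example argument):

import time
from functions.tools import get_feed

for label, kwargs in [('cold', {}), ('warm', {}), ('bypass', {'use_cache': False})]:
    start = time.time()
    get_feed('hackernews.com', **kwargs)    # warm call should be near-instant
    print(f'{label}: {round(time.time() - start, 2)} seconds')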
rss_server.py CHANGED
@@ -1,6 +1,7 @@
 '''Main script to run gradio interface and MCP server.'''

 import logging
+from functools import partial
 from pathlib import Path
 from logging.handlers import RotatingFileHandler

@@ -17,7 +18,7 @@ Path('logs').mkdir(parents=True, exist_ok=True)
 # Clear old logs if present
 gradio_funcs.delete_old_logs('logs', 'rss_server')

-# Set up the root logger so we catch logs from
+# Set up the root logger so we catch logs from everything
 logging.basicConfig(
     handlers=[RotatingFileHandler(
         'logs/rss_server.log',
@@ -29,9 +30,9 @@ logging.basicConfig(
     format='%(levelname)s - %(name)s - %(message)s'
 )

+# Get a logger
 logger = logging.getLogger(__name__)

-
 with gr.Blocks() as demo:

     # Page text
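The diff adds from functools import partial without a visible call site in these hunks. A plausible use, given the new use_cache parameter on get_feed(), is pre-binding keyword arguments when wiring the tool into a Gradio event handler; the component names and module alias below are assumptions for illustration:

from functools import partial
import gradio as gr
import functions.tools as tool_funcs

with gr.Blocks() as demo:
    website = gr.Textbox(label='Website')
    feed = gr.Textbox(label='Feed content')

    # partial() fixes use_cache ahead of time, so the handler keeps a
    # single-input signature that maps directly onto the textbox
    website.submit(
        partial(tool_funcs.get_feed, use_cache=False),
        inputs=website,
        outputs=feed
    )

demo.launch()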