Completed vector context search tool.
Files changed:
- functions/rag.py +20 -10
- functions/tools.py +5 -3
- rss_server.py +15 -3
functions/rag.py
CHANGED

@@ -5,14 +5,14 @@ import logging
 import queue
 from semantic_text_splitter import TextSplitter
 from tokenizers import Tokenizer
-from upstash_vector import Index
+from upstash_vector import Index


 def ingest(rag_ingest_queue: queue.Queue) -> None:
     '''Semantically chunks article and upsert to Upstash vector db
     using article title as namespace.'''

-    logger = logging.
+    logger = logging.getLogger(__name__ + '.ingest()')

     index = Index(
         url='https://living-whale-89944-us1-vector.upstash.io',

@@ -24,27 +24,37 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
     namespaces = index.list_namespaces()

     item = rag_ingest_queue.get()
+    logger.info(item)
     title = item['title']
-    text = item['content']
-    logger.info('Got %s from RAG ingest queue', title)

     if title not in namespaces:
+        text = item['content']
+        logger.info('Got "%s" from RAG ingest queue', title)

         tokenizer=Tokenizer.from_pretrained('bert-base-uncased')
         splitter=TextSplitter.from_huggingface_tokenizer(tokenizer, 256)
         chunks=splitter.chunks(text)

         for i, chunk in enumerate(chunks):
+            # index.upsert(
+            #     vectors=[
+            #         Vector(
+            #             id=hash(f'{title}-{i}'),
+            #             data=chunk,
+            #         )
+            #     ],
+            #     namespace=title
+            # )
+
             index.upsert(
-                vectors=[
-                    Vector(
-                        id=hash(f'{title}-{i}'),
-                        data=chunk,
+                [
+                    (
+                        hash(f'{title}-{i}'),
+                        chunk,
+                        {'namespace': title}
                     )
                 ],
-                namespace=title
             )
-
         logger.info('Ingested %s chunks into vector DB', i + 1)

     else:
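For context on the ingest change above, here is a minimal, self-contained sketch of the chunking step, assuming only the packages already imported in functions/rag.py (semantic_text_splitter and tokenizers). The sample text and the print loop are illustrative stand-ins, not part of the Space.

from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer

# Token-aware splitter: each chunk is capped at 256 BERT tokens,
# the same capacity ingest() uses above.
tokenizer = Tokenizer.from_pretrained('bert-base-uncased')
splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 256)

sample_text = 'Example article body. ' * 500  # stand-in for item['content']
chunks = splitter.chunks(sample_text)

for i, chunk in enumerate(chunks):
    # In ingest(), each (title, i) pair becomes the vector id; here we
    # just inspect the chunk boundaries the splitter produced.
    print(i, len(chunk), repr(chunk[:40]))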
functions/tools.py
CHANGED

@@ -59,7 +59,7 @@ def get_feed(website: str) -> list:
         if item['content'] is not None:

             RAG_INGEST_QUEUE.put(item)
-            logger.info('%s sent to RAG ingest', item['title'])
+            logger.info('"%s" sent to RAG ingest', item['title'])

             summary = summarization_funcs.summarize_content(
                 item['title'],

@@ -67,7 +67,7 @@ def get_feed(website: str) -> list:
             )

             content[i]['summary'] = summary
-            logger.info('Summary of %s generated', item['title'])
+            logger.info('Summary of "%s" generated', item['title'])

         content[i].pop('content', None)

@@ -96,8 +96,10 @@ def context_search(query: str, article_title: str = None) -> str:
     results = None

     results = index.query(
-
+        data=query,
         top_k=3,
+        include_metadata=True,
+        include_data=True,
         namespace=article_title
     )

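As a rough companion to the context_search change, here is a sketch of the query call in isolation. The data, top_k, include_metadata, include_data and namespace arguments come straight from the diff; the environment-variable credentials and the result attributes (score, metadata, data on upstash-vector query results) are assumptions about the SDK rather than code from this repo.

import os

from upstash_vector import Index

# Hypothetical credentials; the Space hard-codes its own URL and supplies its token separately.
index = Index(
    url=os.environ['UPSTASH_VECTOR_REST_URL'],
    token=os.environ['UPSTASH_VECTOR_REST_TOKEN']
)

results = index.query(
    data='Does apple offer parental controls?',  # raw text, embedded server-side
    top_k=3,
    include_metadata=True,
    include_data=True,
    namespace='Example article title'
)

for result in results:
    # Each hit should carry the stored chunk text (data) plus any metadata.
    print(result.score, result.metadata, result.data)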
rss_server.py
CHANGED

@@ -40,7 +40,7 @@ with gr.Blocks() as demo:
     gr.HTML(html.DESCRIPTION)

     # Log output
-    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=
+    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=10)
     timer = gr.Timer(0.5, active=True)

     timer.tick( # pylint: disable=no-member

@@ -51,16 +51,28 @@ with gr.Blocks() as demo:

     # Get feed tool
     website_url = gr.Textbox('hackernews.com', label='Website')
-
+    feed_output = gr.Textbox(label='RSS entries', lines=10, max_lines=10)
     submit_button = gr.Button('Submit')

     submit_button.click( # pylint: disable=no-member
         fn=tool_funcs.get_feed,
         inputs=website_url,
-        outputs=
+        outputs=feed_output,
         api_name='Get RSS feed content'
     )

+    # Vector search tool
+    search_query = gr.Textbox('Does apple offer parental controls?', label='Vector search query')
+    search_output = gr.Textbox(label='Vector search results', lines=10, max_lines=10)
+    submit_button = gr.Button('Submit')
+
+    submit_button.click( # pylint: disable=no-member
+        fn=tool_funcs.context_search,
+        inputs=search_query,
+        outputs=search_output,
+        api_name='Context vector search'
+    )
+

 if __name__ == '__main__':

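Finally, the new vector search widget can be tried as a stand-alone sketch without the rest of the Space. tool_funcs.context_search is replaced by a stub and the button gets its own variable name; the component arguments mirror the diff above.

import gradio as gr


def context_search(query: str) -> str:
    # Stub for tool_funcs.context_search, which queries the Upstash index.
    return f'Results for: {query}'


with gr.Blocks() as demo:

    # Query textbox -> button -> function -> results textbox,
    # the same wiring pattern used for the vector search tool above.
    search_query = gr.Textbox('Does apple offer parental controls?', label='Vector search query')
    search_output = gr.Textbox(label='Vector search results', lines=10, max_lines=10)
    search_button = gr.Button('Submit')

    search_button.click(  # pylint: disable=no-member
        fn=context_search,
        inputs=search_query,
        outputs=search_output,
        api_name='Context vector search'
    )


if __name__ == '__main__':

    demo.launch()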