import streamlit as st from datasets import load_dataset from haystack import Pipeline from haystack.components.readers import ExtractiveReader from haystack.components.retrievers.in_memory import InMemoryBM25Retriever from haystack.document_stores.in_memory import InMemoryDocumentStore from utils import get_unique_docs # Load the dataset @st.cache_data(show_spinner=False) def load_documents(): """ Load the documents from the dataset considering only unique documents. Returns: - documents: list of dictionaries with the documents. """ unique_docs = set() dataset_name = "PedroCJardim/QASports" dataset_split = "basketball" st.caption(f'Fetching "{dataset_name}" dataset') # build the dataset dataset = load_dataset(dataset_name, name=dataset_split) docs_validation = get_unique_docs(dataset["validation"], unique_docs) docs_train = get_unique_docs(dataset["train"], unique_docs) docs_test = get_unique_docs(dataset["test"], unique_docs) documents = docs_validation + docs_train + docs_test return documents @st.cache_resource(show_spinner=False) def get_document_store(documents): """ Index the files in the document store. Args: - files: list of dictionaries with the documents. """ # Create in memory database st.caption(f"Building the Document Store") document_store = InMemoryDocumentStore() document_store.write_documents(documents=documents) return document_store @st.cache_resource(show_spinner=False) def get_question_pipeline(_doc_store): """ Create the pipeline with the retriever and reader components. Args: - doc_store: instance of the document store. Returns: - pipe: instance of the pipeline. """ st.caption(f"Building the Question Answering pipeline") # Create the retriever and reader retriever = InMemoryBM25Retriever(document_store=_doc_store) reader = ExtractiveReader(model="deepset/roberta-base-squad2") reader.warm_up() # Create the pipeline pipe = Pipeline() pipe.add_component(instance=retriever, name="retriever") pipe.add_component(instance=reader, name="reader") pipe.connect("retriever.documents", "reader.documents") return pipe def search(pipeline, question: str): """ Search for the answer to a question in the documents. Args: - pipeline: instance of the pipeline. - question: string with the question. Returns: - answer: dictionary with the answer. """ # Get the answers top_k = 3 answer = pipeline.run( data={ "retriever": {"query": question, "top_k": 10}, "reader": {"query": question, "top_k": top_k}, } ) max_k = min(top_k, len(answer["reader"]["answers"])) return answer["reader"]["answers"][0:max_k] # Loading status with st.status( "Downloading dataset...", expanded=st.session_state.get("expanded", True) ) as status: documents = load_documents() status.update(label="Indexing documents...") doc_store = get_document_store(documents) status.update(label="Creating pipeline...") pipe = get_question_pipeline(doc_store) status.update( label="Download and indexing complete!", state="complete", expanded=False ) st.session_state["expanded"] = False st.subheader("🏀 HoopMind Basketball Wiki", divider="rainbow") st.caption( """Welcome to **HoopMind**! This AI answers basketball questions using the QASports dataset — the first large sports question answering dataset. It includes real info on players, teams, and matches from basketball, soccer, and football, with over **1.5M Q&A pairs** across **54k+ documents**.""" ) if user_query := st.text_input( label="Ask HoopMind anything about Basketball! 🧠", placeholder="Who is Kobe Bryant?", ): # Get the answers with st.spinner("Thinking... 🏀"): try: answer = search(pipe, user_query) for idx, ans in enumerate(answer): st.info( f""" **Answer {idx+1}:** "{ans.data}" 🔥 Score: {ans.score:0.4f} 📖 Document: "{ans.document.meta["title"]}" 🔗 URL: {ans.document.meta["url"]} """ ) with st.expander("See details", expanded=False): st.write(ans) st.divider() except Exception: st.error("❌ HoopMind couldn’t find an answer for that one...")