Hasnan Ramadhan committed on
Commit c7b8084 · 1 Parent(s): ce3a388

Update space

Files changed (2):
  1. app.py +361 -60
  2. requirements.txt +10 -1
app.py CHANGED
@@ -1,64 +1,365 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
- 
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
- 
- 
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
- 
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
- 
-     messages.append({"role": "user", "content": message})
- 
-     response = ""
- 
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
- 
-         response += token
-         yield response
- 
- 
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
 
  if __name__ == "__main__":
-     demo.launch()
 
  import gradio as gr
+ from langgraph.graph import StateGraph
+ from typing import TypedDict
+ from langchain_community.document_loaders import PyMuPDFLoader
+ import requests
+ from groq import Groq
+ import os
+ from dotenv import load_dotenv
+ import tempfile
+ from googlesearch import search
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import re
 
+ load_dotenv()
+ 
+ # Shared state passed between the LangGraph agent nodes below; defined
+ # first so the agent annotations can reference it.
+ class DocumentState(TypedDict):
+     documents: list[dict]
+     summaries: list[str]
+     search_results: list[dict]
+     search_query: str
+     needs_search: bool
+ 
+ def get_llm_response(prompt):
+     url = "http://192.168.181.215:8081/llms"
+     headers = {"Content-Type": "application/json"}
+     payload = {
+         "messages": [{"role": "user", "content": prompt}],
+         "max_new_tokens": 2000,
+         "do_sample": True,
+         "temperature": 0.2,
+         "top_k": 10,
+         "top_p": 0.90
+     }
+     try:
+         response = requests.post(url, json=payload, headers=headers)
+         response.raise_for_status()
+         data = response.json()
+         return {
+             "response": data['choices'][0]['content'],
+             "usage": data.get('usage', {}),
+             "generation_time": data.get('generation_time', None)
+         }
+     except requests.exceptions.RequestException as e:
+         return {
+             "response": f"Error occurred: {str(e)}",
+             "usage": {},
+             "generation_time": None
+         }
+ 
+ def get_groq_response(prompt):
+     client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+     completion = client.chat.completions.create(
+         model="llama-3.1-8b-instant",
+         messages=[
+             {
+                 "role": "user",
+                 "content": prompt
+             }
+         ]
+     )
+     return completion.choices[0].message.content
+ 
+ def google_search_agent(state: DocumentState) -> DocumentState:
+     """Performs Google search and extracts content from results."""
+     if not state.get('search_query'):
+         return state
+ 
+     try:
+         search_results = []
+         # Get top 3 search results
+         for url in search(state['search_query'], num_results=3):
+             try:
+                 response = requests.get(url, timeout=10)
+                 response.raise_for_status()
+ 
+                 soup = BeautifulSoup(response.content, 'html.parser')
+ 
+                 # Remove script and style elements
+                 for script in soup(["script", "style"]):
+                     script.decompose()
+ 
+                 # Get text content
+                 text = soup.get_text()
+ 
+                 # Clean up text
+                 lines = (line.strip() for line in text.splitlines())
+                 chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+                 text = ' '.join(chunk for chunk in chunks if chunk)
+ 
+                 # Limit text length
+                 if len(text) > 1000:
+                     text = text[:1000] + "..."
+ 
+                 search_results.append({
+                     'url': url,
+                     'content': text,
+                     'title': soup.title.string if soup.title else "No title"
+                 })
+             except Exception as e:
+                 print(f"Error scraping {url}: {e}")
+                 continue
+ 
+         state['search_results'] = search_results
+     except Exception as e:
+         print(f"Error during search: {e}")
+         state['search_results'] = []
+ 
+     return state
+ 
+ def search_analyzer_agent(state: DocumentState) -> DocumentState:
+     """Analyzes the user query to determine if a web search is needed."""
+     if not state.get('search_query'):
+         return state
+ 
+     # Keywords that typically indicate a need for current information
+     search_indicators = [
+         'latest', 'recent', 'current', 'news', 'update', 'today', 'now',
+         'what is', 'who is', 'when did', 'where is', 'how to', 'definition',
+         'explain', 'information about', 'tell me about', 'research'
+     ]
+ 
+     query_lower = state['search_query'].lower()
+     state['needs_search'] = any(indicator in query_lower for indicator in search_indicators)
+ 
+     return state
+ 
+ def search_response_agent(state: DocumentState) -> DocumentState:
+     """Generates a response based on search results."""
+     if not state.get('search_results'):
+         # Fallback to regular LLM response
+         llm_response = get_llm_response(state['search_query'])
+         state['summaries'] = [llm_response['response']]
+         return state
+ 
+     # Prepare search results for the LLM
+     search_context = "\n\n".join([
+         f"Source: {result['title']} ({result['url']})\nContent: {result['content']}"
+         for result in state['search_results']
+     ])
+ 
+     prompt = f"""Based on the following search results, provide a comprehensive and accurate answer to the user's question: "{state['search_query']}"
+ 
+ Search Results:
+ {search_context}
+ 
+ Please provide a well-structured response that:
+ 1. Answers the user's question directly
+ 2. Cites the sources when relevant
+ 3. Is accurate and informative
+ 4. Is concise but comprehensive
+ 
+ Response:"""
+ 
+     llm_response = get_llm_response(prompt)
+     state['summaries'] = [llm_response['response']]
+     return state
+ 
+ def document_extractor_agent(state: DocumentState, pdf_path: str) -> DocumentState:
+     """Extracts documents from a PDF file."""
+     try:
+         loader = PyMuPDFLoader(pdf_path)
+         documents = loader.load()
+         state['documents'] = [
+             {
+                 'content': doc.page_content,
+                 'page': doc.metadata.get('page', 0) + 1,
+                 'source': doc.metadata.get('source', 'Unknown')
+             } for doc in documents
+         ]
+     except Exception as e:
+         print(f"Error loading PDF: {e}")
+         state['documents'] = []
+     return state
+ 
+ def document_summarizer_agent(state: DocumentState) -> DocumentState:
+     """Summarizes the extracted documents."""
+     truncated_docs = []
+     for doc in state['documents']:
+         content = doc['content'][:500]
+         truncated_docs.append(f"Page {doc['page']}: {content}")
+ 
+     prompt = f"""Summarize these documents in exactly 3 sentences. Include page citations (p. X).
+ 
+ Documents:
+ {chr(10).join(truncated_docs)}
+ 
+ Write exactly 3 sentences with page citations, using only information from the documents; do not add outside information or jump to conclusions."""
+ 
+     llm_response = get_llm_response(prompt)
+     summary = llm_response["response"]
+     state['summaries'] = [summary]
+     return state
+ 
+ def create_document_graph():
+     talking_documents = StateGraph(DocumentState)
+     talking_documents.add_node('document_extractor', document_extractor_agent)
+     talking_documents.add_node('document_summarizer', document_summarizer_agent)
+     talking_documents.set_entry_point('document_extractor')
+     talking_documents.add_edge('document_extractor', 'document_summarizer')
+     return talking_documents.compile()
+ 
+ def create_search_graph():
+     search_workflow = StateGraph(DocumentState)
+     search_workflow.add_node('search_analyzer', search_analyzer_agent)
+     search_workflow.add_node('google_search', google_search_agent)
+     search_workflow.add_node('search_response', search_response_agent)
+     search_workflow.set_entry_point('search_analyzer')
+ 
+     # Conditional edge based on search needs
+     def should_search(state):
+         return "search" if state.get('needs_search', False) else "response"
+ 
+     search_workflow.add_conditional_edges(
+         'search_analyzer',
+         should_search,
+         {
+             "search": "google_search",
+             "response": "search_response"
+         }
+     )
+     search_workflow.add_edge('google_search', 'search_response')
+     return search_workflow.compile()
+ 
+ def process_pdf_and_chat(pdf_file, message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
+     if pdf_file is None:
+         return history + [(message, "Please upload a PDF file first.")]
+ 
+     try:
+         # Create a temporary file path for the uploaded PDF
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+             tmp_file.write(pdf_file.read())
+             tmp_pdf_path = tmp_file.name
+ 
+         # Check if the user wants to search for additional information
+         search_keywords = ['search', 'find more', 'additional info', 'more information', 'research']
+         if enable_search and any(keyword in message.lower() for keyword in search_keywords):
+             # Use the search workflow for additional information
+             search_graph = create_search_graph()
+             search_state = {
+                 'documents': [],
+                 'summaries': [],
+                 'search_results': [],
+                 'search_query': message,
+                 'needs_search': True
+             }
+ 
+             search_result = search_graph.invoke(search_state)
+ 
+             # Also process the PDF
+             def document_extractor_with_path(state: DocumentState) -> DocumentState:
+                 return document_extractor_agent(state, tmp_pdf_path)
+ 
+             talking_documents = StateGraph(DocumentState)
+             talking_documents.add_node('document_extractor', document_extractor_with_path)
+             talking_documents.add_node('document_summarizer', document_summarizer_agent)
+             talking_documents.set_entry_point('document_extractor')
+             talking_documents.add_edge('document_extractor', 'document_summarizer')
+             pdf_graph = talking_documents.compile()
+ 
+             pdf_state = {'documents': [], 'summaries': []}
+             pdf_result = pdf_graph.invoke(pdf_state)
+ 
+             # Combine PDF and search results
+             combined_response = f"**PDF Summary:**\n{pdf_result['summaries'][0] if pdf_result['summaries'] else 'No summary available'}\n\n**Additional Information from Web:**\n{search_result['summaries'][0] if search_result['summaries'] else 'No additional information found'}"
+ 
+             response = combined_response
+         else:
+             # Regular PDF processing
+             def document_extractor_with_path(state: DocumentState) -> DocumentState:
+                 return document_extractor_agent(state, tmp_pdf_path)
+ 
+             talking_documents = StateGraph(DocumentState)
+             talking_documents.add_node('document_extractor', document_extractor_with_path)
+             talking_documents.add_node('document_summarizer', document_summarizer_agent)
+             talking_documents.set_entry_point('document_extractor')
+             talking_documents.add_edge('document_extractor', 'document_summarizer')
+             graph = talking_documents.compile()
+ 
+             state = {'documents': [], 'summaries': []}
+             final_state = graph.invoke(state)
+ 
+             if final_state['summaries']:
+                 response = final_state['summaries'][0]
+             else:
+                 response = "Unable to process the PDF. Please check the file format."
+ 
+         # Clean up the temporary file
+         os.unlink(tmp_pdf_path)
+ 
+         return history + [(message, response)]
+ 
+     except Exception as e:
+         return history + [(message, f"Error processing PDF: {str(e)}")]
+ 
+ def respond(message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
+     """Enhanced chat function with optional Google search."""
+     if enable_search:
+         # Use the search workflow
+         search_graph = create_search_graph()
+         state = {
+             'documents': [],
+             'summaries': [],
+             'search_results': [],
+             'search_query': message,
+             'needs_search': False
+         }
+ 
+         final_state = search_graph.invoke(state)
+ 
+         if final_state['summaries']:
+             response = final_state['summaries'][0]
+         else:
+             # Fallback to regular LLM response
+             prompt = f"{system_message}\n\nUser: {message}"
+             llm_response = get_llm_response(prompt)
+             response = llm_response["response"]
+     else:
+         # Regular chat without search
+         prompt = f"{system_message}\n\nUser: {message}"
+         llm_response = get_llm_response(prompt)
+         response = llm_response["response"]
+ 
+     return history + [(message, response)]
+ 
+ # Create the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Document Summarizer with Web Search")
+     gr.Markdown("Upload a PDF document and ask questions about it, or chat normally. Enable search for additional web information.")
+ 
+     with gr.Row():
+         with gr.Column(scale=1):
+             pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
+             enable_search = gr.Checkbox(label="Enable Google Search", value=False)
+             system_message = gr.Textbox(
+                 value="You are a helpful assistant for summarizing documents and finding related information when needed.",
+                 label="System message"
+             )
+             max_tokens = gr.Slider(minimum=1, maximum=2000, value=512, step=1, label="Max new tokens")
+             temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+             top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
+ 
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot()
+             msg = gr.Textbox(label="Message")
+             clear = gr.Button("Clear")
+ 
+     def user_input(message, history):
+         return "", history + [(message, None)]
+ 
+     def bot_response(history, pdf_file, enable_search, system_message, max_tokens, temperature, top_p):
+         message = history[-1][0]
+         if pdf_file is not None:
+             new_history = process_pdf_and_chat(pdf_file, message, history[:-1], system_message, max_tokens, temperature, top_p, enable_search)
+         else:
+             new_history = respond(message, history[:-1], system_message, max_tokens, temperature, top_p, enable_search)
+         return new_history
+ 
+     msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot_response, [chatbot, pdf_upload, enable_search, system_message, max_tokens, temperature, top_p], chatbot
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)
+ 

  if __name__ == "__main__":
+     demo.launch()
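
Note for reviewers unfamiliar with LangGraph: the conditional edge wired up in create_search_graph() follows the routing pattern sketched below. This is a minimal, self-contained illustration; the state fields, node names, and routing rule are made up for the example and are not code from this commit:

from typing import TypedDict
from langgraph.graph import StateGraph, END

class ToyState(TypedDict):
    query: str
    needs_search: bool
    answer: str

def analyze(state: ToyState) -> ToyState:
    # Toy routing rule: only queries mentioning "latest" trigger a search.
    state["needs_search"] = "latest" in state["query"].lower()
    return state

def do_search(state: ToyState) -> ToyState:
    state["answer"] = f"(web results for: {state['query']})"
    return state

def answer_node(state: ToyState) -> ToyState:
    if not state["answer"]:
        state["answer"] = "(answered directly, no search)"
    return state

workflow = StateGraph(ToyState)
workflow.add_node("analyze", analyze)
workflow.add_node("search", do_search)
workflow.add_node("respond", answer_node)
workflow.set_entry_point("analyze")
workflow.add_conditional_edges(
    "analyze",
    lambda s: "search" if s["needs_search"] else "respond",
    {"search": "search", "respond": "respond"},
)
workflow.add_edge("search", "respond")
workflow.add_edge("respond", END)

graph = workflow.compile()
print(graph.invoke({"query": "latest AI news", "needs_search": False, "answer": ""}))

In the commit itself, search_analyzer_agent plays the router's source role, google_search_agent and search_response_agent are the two targets, and the finish edges are left implicit.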
requirements.txt CHANGED
@@ -1 +1,10 @@
- huggingface_hub==0.25.2
+ huggingface_hub==0.25.2
+ gradio
+ langgraph
+ langchain-community
+ requests
+ groq
+ python-dotenv
+ PyMuPDF
+ googlesearch-python
+ beautifulsoup4
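
Note on the search dependency: app.py calls search(..., num_results=3), which matches the signature of the googlesearch-python package (the older `google` PyPI package exposes num/stop keywords instead), hence the pin above. A quick sanity check, assuming googlesearch-python is installed and network access is available:

from googlesearch import search

# Prints three result URLs; under the old `google` package this call would
# raise TypeError because its search() does not accept `num_results`.
for url in search("langgraph conditional edges", num_results=3):
    print(url)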