Hasnan Ramadhan committed on
Commit c7b8084 · 1 Parent(s): ce3a388

Update space

Files changed (2):
  1. app.py +361 -60
  2. requirements.txt +10 -1
app.py CHANGED
@@ -1,64 +1,365 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
- 
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
- 
- 
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
- 
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
- 
-     messages.append({"role": "user", "content": message})
- 
-     response = ""
- 
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
- 
-         response += token
-         yield response
- 
- 
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
 
  if __name__ == "__main__":
-     demo.launch()
 
  import gradio as gr
+ from langgraph.graph import StateGraph
+ from typing import TypedDict
+ from langchain_community.document_loaders import PyMuPDFLoader
+ import requests
+ from groq import Groq
+ import os
+ from dotenv import load_dotenv
+ import tempfile
+ from googlesearch import search
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ import re
 
+ load_dotenv()
+ 
+ # Shared state passed between the LangGraph agent nodes below; defined
+ # first so the agent annotations can reference it.
+ class DocumentState(TypedDict):
+     documents: list[dict]
+     summaries: list[str]
+     search_results: list[dict]
+     search_query: str
+     needs_search: bool
+ 
+ def get_llm_response(prompt):
+     url = "http://192.168.181.215:8081/llms"
+     headers = {"Content-Type": "application/json"}
+     payload = {
+         "messages": [{"role": "user", "content": prompt}],
+         "max_new_tokens": 2000,
+         "do_sample": True,
+         "temperature": 0.2,
+         "top_k": 10,
+         "top_p": 0.90
+     }
+     try:
+         response = requests.post(url, json=payload, headers=headers)
+         response.raise_for_status()
+         data = response.json()
+         return {
+             "response": data['choices'][0]['content'],
+             "usage": data.get('usage', {}),
+             "generation_time": data.get('generation_time', None)
+         }
+     except requests.exceptions.RequestException as e:
+         return {
+             "response": f"Error occurred: {str(e)}",
+             "usage": {},
+             "generation_time": None
+         }
+ 
+ def get_groq_response(prompt):
+     client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+     completion = client.chat.completions.create(
+         model="llama-3.1-8b-instant",
+         messages=[
+             {
+                 "role": "user",
+                 "content": prompt
+             }
+         ]
+     )
+     return completion.choices[0].message.content
+ 
+ def google_search_agent(state: DocumentState) -> DocumentState:
+     """Performs Google search and extracts content from results."""
+     if not state.get('search_query'):
+         return state
+ 
+     try:
+         search_results = []
+         # Get top 3 search results
+         for url in search(state['search_query'], num_results=3):
+             try:
+                 response = requests.get(url, timeout=10)
+                 response.raise_for_status()
+ 
+                 soup = BeautifulSoup(response.content, 'html.parser')
+ 
+                 # Remove script and style elements
+                 for script in soup(["script", "style"]):
+                     script.decompose()
+ 
+                 # Get text content
+                 text = soup.get_text()
+ 
+                 # Clean up text
+                 lines = (line.strip() for line in text.splitlines())
+                 chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+                 text = ' '.join(chunk for chunk in chunks if chunk)
+ 
+                 # Limit text length
+                 if len(text) > 1000:
+                     text = text[:1000] + "..."
+ 
+                 search_results.append({
+                     'url': url,
+                     'content': text,
+                     'title': soup.title.string if soup.title else "No title"
+                 })
+             except Exception as e:
+                 print(f"Error scraping {url}: {e}")
+                 continue
+ 
+         state['search_results'] = search_results
+     except Exception as e:
+         print(f"Error during search: {e}")
+         state['search_results'] = []
+ 
+     return state
+ 
+ def search_analyzer_agent(state: DocumentState) -> DocumentState:
+     """Analyzes the user query to determine if a web search is needed."""
+     if not state.get('search_query'):
+         return state
+ 
+     # Keywords that typically indicate a need for current information
+     search_indicators = [
+         'latest', 'recent', 'current', 'news', 'update', 'today', 'now',
+         'what is', 'who is', 'when did', 'where is', 'how to', 'definition',
+         'explain', 'information about', 'tell me about', 'research'
+     ]
+ 
+     query_lower = state['search_query'].lower()
+     state['needs_search'] = any(indicator in query_lower for indicator in search_indicators)
+ 
+     return state
+ 
+ def search_response_agent(state: DocumentState) -> DocumentState:
+     """Generates a response based on search results."""
+     if not state.get('search_results'):
+         # Fallback to regular LLM response
+         llm_response = get_llm_response(state['search_query'])
+         state['summaries'] = [llm_response['response']]
+         return state
+ 
+     # Prepare search results for the LLM
+     search_context = "\n\n".join([
+         f"Source: {result['title']} ({result['url']})\nContent: {result['content']}"
+         for result in state['search_results']
+     ])
+ 
+     prompt = f"""Based on the following search results, provide a comprehensive and accurate answer to the user's question: "{state['search_query']}"
+ 
+ Search Results:
+ {search_context}
+ 
+ Please provide a well-structured response that:
+ 1. Answers the user's question directly
+ 2. Cites the sources when relevant
+ 3. Is accurate and informative
+ 4. Is concise but comprehensive
+ 
+ Response:"""
+ 
+     llm_response = get_llm_response(prompt)
+     state['summaries'] = [llm_response['response']]
+     return state
+ 
+ def document_extractor_agent(state: DocumentState, pdf_path: str) -> DocumentState:
+     """Extracts documents from a PDF file."""
+     try:
+         loader = PyMuPDFLoader(pdf_path)
+         documents = loader.load()
+         state['documents'] = [
+             {
+                 'content': doc.page_content,
+                 'page': doc.metadata.get('page', 0) + 1,
+                 'source': doc.metadata.get('source', 'Unknown')
+             } for doc in documents
+         ]
+     except Exception as e:
+         print(f"Error loading PDF: {e}")
+         state['documents'] = []
+     return state
+ 
+ def document_summarizer_agent(state: DocumentState) -> DocumentState:
+     """Summarizes the extracted documents."""
+     truncated_docs = []
+     for doc in state['documents']:
+         content = doc['content'][:500]
+         truncated_docs.append(f"Page {doc['page']}: {content}")
+ 
+     prompt = f"""Summarize these documents in exactly 3 sentences. Include page citations (p. X).
+ 
+ Documents:
+ {chr(10).join(truncated_docs)}
+ 
+ Write exactly 3 sentences with page citations, using only information from the documents; do not add outside information or jump to conclusions."""
+ 
+     llm_response = get_llm_response(prompt)
+     summary = llm_response["response"]
+     state['summaries'] = [summary]
+     return state
+ 
+ def create_document_graph():
+     talking_documents = StateGraph(DocumentState)
+     talking_documents.add_node('document_extractor', document_extractor_agent)
+     talking_documents.add_node('document_summarizer', document_summarizer_agent)
+     talking_documents.set_entry_point('document_extractor')
+     talking_documents.add_edge('document_extractor', 'document_summarizer')
+     return talking_documents.compile()
+ 
+ def create_search_graph():
+     search_workflow = StateGraph(DocumentState)
+     search_workflow.add_node('search_analyzer', search_analyzer_agent)
+     search_workflow.add_node('google_search', google_search_agent)
+     search_workflow.add_node('search_response', search_response_agent)
+     search_workflow.set_entry_point('search_analyzer')
+ 
+     # Conditional edge based on search needs
+     def should_search(state):
+         return "search" if state.get('needs_search', False) else "response"
+ 
+     search_workflow.add_conditional_edges(
+         'search_analyzer',
+         should_search,
+         {
+             "search": "google_search",
+             "response": "search_response"
+         }
+     )
+     search_workflow.add_edge('google_search', 'search_response')
+     return search_workflow.compile()
+ 
+ def process_pdf_and_chat(pdf_file, message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
+     if pdf_file is None:
+         return history + [(message, "Please upload a PDF file first.")]
+ 
+     try:
+         # Create a temporary file path for the uploaded PDF
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+             tmp_file.write(pdf_file.read())
+             tmp_pdf_path = tmp_file.name
+ 
+         # Check if the user wants to search for additional information
+         search_keywords = ['search', 'find more', 'additional info', 'more information', 'research']
+         if enable_search and any(keyword in message.lower() for keyword in search_keywords):
+             # Use the search workflow for additional information
+             search_graph = create_search_graph()
+             search_state = {
+                 'documents': [],
+                 'summaries': [],
+                 'search_results': [],
+                 'search_query': message,
+                 'needs_search': True
+             }
+ 
+             search_result = search_graph.invoke(search_state)
+ 
+             # Also process the PDF
+             def document_extractor_with_path(state: DocumentState) -> DocumentState:
+                 return document_extractor_agent(state, tmp_pdf_path)
+ 
+             talking_documents = StateGraph(DocumentState)
+             talking_documents.add_node('document_extractor', document_extractor_with_path)
+             talking_documents.add_node('document_summarizer', document_summarizer_agent)
+             talking_documents.set_entry_point('document_extractor')
+             talking_documents.add_edge('document_extractor', 'document_summarizer')
+             pdf_graph = talking_documents.compile()
+ 
+             pdf_state = {'documents': [], 'summaries': []}
+             pdf_result = pdf_graph.invoke(pdf_state)
+ 
+             # Combine PDF and search results
+             combined_response = f"**PDF Summary:**\n{pdf_result['summaries'][0] if pdf_result['summaries'] else 'No summary available'}\n\n**Additional Information from Web:**\n{search_result['summaries'][0] if search_result['summaries'] else 'No additional information found'}"
+ 
+             response = combined_response
+         else:
+             # Regular PDF processing
+             def document_extractor_with_path(state: DocumentState) -> DocumentState:
+                 return document_extractor_agent(state, tmp_pdf_path)
+ 
+             talking_documents = StateGraph(DocumentState)
+             talking_documents.add_node('document_extractor', document_extractor_with_path)
+             talking_documents.add_node('document_summarizer', document_summarizer_agent)
+             talking_documents.set_entry_point('document_extractor')
+             talking_documents.add_edge('document_extractor', 'document_summarizer')
+             graph = talking_documents.compile()
+ 
+             state = {'documents': [], 'summaries': []}
+             final_state = graph.invoke(state)
+ 
+             if final_state['summaries']:
+                 response = final_state['summaries'][0]
+             else:
+                 response = "Unable to process the PDF. Please check the file format."
+ 
+         # Clean up the temporary file
+         os.unlink(tmp_pdf_path)
+ 
+         return history + [(message, response)]
+ 
+     except Exception as e:
+         return history + [(message, f"Error processing PDF: {str(e)}")]
+ 
+ def respond(message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
+     """Enhanced chat function with optional Google search."""
+     if enable_search:
+         # Use the search workflow
+         search_graph = create_search_graph()
+         state = {
+             'documents': [],
+             'summaries': [],
+             'search_results': [],
+             'search_query': message,
+             'needs_search': False
+         }
+ 
+         final_state = search_graph.invoke(state)
+ 
+         if final_state['summaries']:
+             response = final_state['summaries'][0]
+         else:
+             # Fallback to regular LLM response
+             prompt = f"{system_message}\n\nUser: {message}"
+             llm_response = get_llm_response(prompt)
+             response = llm_response["response"]
+     else:
+         # Regular chat without search
+         prompt = f"{system_message}\n\nUser: {message}"
+         llm_response = get_llm_response(prompt)
+         response = llm_response["response"]
+ 
+     return history + [(message, response)]
+ 
+ # Create the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Document Summarizer with Web Search")
+     gr.Markdown("Upload a PDF document and ask questions about it, or chat normally. Enable search for additional web information.")
+ 
+     with gr.Row():
+         with gr.Column(scale=1):
+             pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
+             enable_search = gr.Checkbox(label="Enable Google Search", value=False)
+             system_message = gr.Textbox(
+                 value="You are a helpful assistant for summarizing documents and finding related information when needed.",
+                 label="System message"
+             )
+             max_tokens = gr.Slider(minimum=1, maximum=2000, value=512, step=1, label="Max new tokens")
+             temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+             top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
+ 
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot()
+             msg = gr.Textbox(label="Message")
+             clear = gr.Button("Clear")
+ 
+     def user_input(message, history):
+         return "", history + [(message, None)]
+ 
+     def bot_response(history, pdf_file, enable_search, system_message, max_tokens, temperature, top_p):
+         message = history[-1][0]
+         if pdf_file is not None:
+             new_history = process_pdf_and_chat(pdf_file, message, history[:-1], system_message, max_tokens, temperature, top_p, enable_search)
+         else:
+             new_history = respond(message, history[:-1], system_message, max_tokens, temperature, top_p, enable_search)
+         return new_history
+ 
+     msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot_response, [chatbot, pdf_upload, enable_search, system_message, max_tokens, temperature, top_p], chatbot
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)
+ 

  if __name__ == "__main__":
+     demo.launch()
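
Note for reviewers unfamiliar with LangGraph: the conditional edge wired up in create_search_graph() follows the routing pattern sketched below. This is a minimal, self-contained illustration; the state fields, node names, and routing rule are made up for the example and are not code from this commit:

from typing import TypedDict
from langgraph.graph import StateGraph, END

class ToyState(TypedDict):
    query: str
    needs_search: bool
    answer: str

def analyze(state: ToyState) -> ToyState:
    # Toy routing rule: only queries mentioning "latest" trigger a search.
    state["needs_search"] = "latest" in state["query"].lower()
    return state

def do_search(state: ToyState) -> ToyState:
    state["answer"] = f"(web results for: {state['query']})"
    return state

def answer_node(state: ToyState) -> ToyState:
    if not state["answer"]:
        state["answer"] = "(answered directly, no search)"
    return state

workflow = StateGraph(ToyState)
workflow.add_node("analyze", analyze)
workflow.add_node("search", do_search)
workflow.add_node("respond", answer_node)
workflow.set_entry_point("analyze")
workflow.add_conditional_edges(
    "analyze",
    lambda s: "search" if s["needs_search"] else "respond",
    {"search": "search", "respond": "respond"},
)
workflow.add_edge("search", "respond")
workflow.add_edge("respond", END)

graph = workflow.compile()
print(graph.invoke({"query": "latest AI news", "needs_search": False, "answer": ""}))

In the commit itself, search_analyzer_agent plays the router's source role, google_search_agent and search_response_agent are the two targets, and the finish edges are left implicit.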
requirements.txt CHANGED
@@ -1 +1,10 @@
- huggingface_hub==0.25.2
+ huggingface_hub==0.25.2
+ gradio
+ langgraph
+ langchain-community
+ requests
+ groq
+ python-dotenv
+ PyMuPDF
+ googlesearch-python
+ beautifulsoup4
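
Note on the search dependency: app.py calls search(..., num_results=3), which matches the signature of the googlesearch-python package (the older `google` PyPI package exposes num/stop keywords instead), hence the pin above. A quick sanity check, assuming googlesearch-python is installed and network access is available:

from googlesearch import search

# Prints three result URLs; under the old `google` package this call would
# raise TypeError because its search() does not accept `num_results`.
for url in search("langgraph conditional edges", num_results=3):
    print(url)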