alex-i07 committed on
Commit
9e19567
·
1 Parent(s): 355ad4a

switch to wikipedia loader because of duckduckgo anti-bot protection

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -0
  2. tools.py +5 -17
requirements.txt CHANGED
@@ -10,3 +10,5 @@ langchain-openai
10
  langchain-anthropic
11
  langgraph
12
  certifi
 
 
 
10
  langchain-anthropic
11
  langgraph
12
  certifi
13
+ arxiv
14
+ pymupdf
tools.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
8
  from openai import OpenAI
9
  from pytubefix import YouTube
10
  from langchain_community.tools import tool
 
11
  from bs4 import BeautifulSoup, ResultSet, PageElement, Tag, NavigableString
12
 
13
  urllib3.disable_warnings()
@@ -84,22 +85,11 @@ def wiki_search(query: str) -> str | None:
84
  """
85
 
86
  try:
87
- ddg_results = []
88
  wiki_results = ""
89
- link_rows = _fetch_ddg_search_result_links(f"wikipedia {query}")
90
- print(query, link_rows)
91
- for link_row in link_rows:
92
- if not 'en.wikipedia.org' in link_row.attrs['href']:
93
- continue
94
-
95
- ddg_results.append({
96
- 'title': link_row.get_text(strip=True),
97
- 'url': link_row.attrs['href']
98
- })
99
-
100
- wiki_results += _fetch_specific_page(link_row.attrs['href'])
101
- if len(ddg_results) == 1:
102
- break
103
 
104
  return wiki_results
105
  except requests.exceptions.RequestException as e:
@@ -121,7 +111,6 @@ def archive_search(query: str) -> str | None:
121
  ddg_results = []
122
  archive_results = ""
123
  link_rows = _fetch_ddg_search_result_links(f"archive.org {query}")
124
- print(query, link_rows)
125
  for link_row in link_rows:
126
  if not 'archive.org' in link_row.attrs['href']:
127
  continue
@@ -268,7 +257,6 @@ def _fetch_ddg_search_result_links(query: str) -> ResultSet[PageElement | Tag |
268
 
269
  ddg_response = requests.get(url, headers=headers, params=params, verify=False)
270
  ddg_response.raise_for_status()
271
- print(ddg_response.text)
272
  soup = BeautifulSoup(ddg_response.text, 'html.parser')
273
  return soup.find_all('a', {'class': 'result-link'})
274
 
 
8
  from openai import OpenAI
9
  from pytubefix import YouTube
10
  from langchain_community.tools import tool
11
+ from langchain_community.document_loaders import WikipediaLoader
12
  from bs4 import BeautifulSoup, ResultSet, PageElement, Tag, NavigableString
13
 
14
  urllib3.disable_warnings()
 
85
  """
86
 
87
  try:
 
88
  wiki_results = ""
89
+ search_docs = WikipediaLoader(query=query, load_max_docs=1).load()
90
+ for doc in search_docs:
91
+ if "source" in doc.metadata and doc.metadata["source"]:
92
+ wiki_results += _fetch_specific_page(doc.metadata["source"])
 
 
 
 
 
 
 
 
 
 
93
 
94
  return wiki_results
95
  except requests.exceptions.RequestException as e:
 
111
  ddg_results = []
112
  archive_results = ""
113
  link_rows = _fetch_ddg_search_result_links(f"archive.org {query}")
 
114
  for link_row in link_rows:
115
  if not 'archive.org' in link_row.attrs['href']:
116
  continue
 
257
 
258
  ddg_response = requests.get(url, headers=headers, params=params, verify=False)
259
  ddg_response.raise_for_status()
 
260
  soup = BeautifulSoup(ddg_response.text, 'html.parser')
261
  return soup.find_all('a', {'class': 'result-link'})
262