dromerosm committed on
Commit
6061b77
·
verified ·
1 Parent(s): 80af058

Update api/index.py

Browse files
Files changed (1) hide show
  1. api/index.py +232 -21
api/index.py CHANGED
@@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict
3
  import requests
4
  import openai
5
  from duckduckgo_search import DDGS
 
6
  import concurrent.futures
7
  from docling.document_converter import DocumentConverter
8
  import dotenv
@@ -13,11 +14,18 @@ import fastapi
13
  import sys
14
  import pytz
15
  from datetime import datetime
 
 
16
 
17
- # OpenAI API Configuration
18
  dotenv.load_dotenv()
 
 
 
 
 
19
  openai_api_key = os.getenv("OPENAI_API_KEY")
20
- api_key = os.getenv("API_KEY") # Add this line to load the API key from the environment
21
  client = openai.Client()
22
 
23
  app = FastAPI(
@@ -109,6 +117,167 @@ class FetchResult(BaseModel):
109
  total_url_summaries_ok: int
110
  total_url_summaries_nok: int
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  @app.get("/",
113
  response_model=VersionInfo,
114
  summary="API Version Information",
@@ -142,30 +311,66 @@ async def get_version_info():
142
  )
143
  async def search(
144
  query: str = Query(..., description="The search query term or phrase"),
145
- timelimit: str = Query("m", description="Time range for results: 'd' (day), 'w' (week), 'm' (month), 'y' (year)"),
146
- region: str = Query("us-en", description="Geographic region for results (e.g., 'us-en', 'es-es')"),
147
  max_results: int = Query(3, description="Maximum number of results to return", ge=1, le=10),
148
  authorization: str = Header(..., description="API key for authorization", alias="Auth")
149
  ):
150
  if authorization != api_key:
151
  raise HTTPException(status_code=401, detail="Unauthorized")
152
 
153
- start_time = time.time() # Start of processing time
154
  madrid_tz = pytz.timezone('Europe/Madrid')
155
  current_time = datetime.now(madrid_tz).strftime('%Y-%m-%d %H:%M:%S %Z')
 
 
156
  total_input_tokens = 0
157
  total_completion_tokens = 0
158
  total_url_summaries_ok = 0
159
  total_url_summaries_nok = 0
160
 
 
 
 
 
161
  try:
162
- # Perform a search in DuckDuckGo using the following parameters
163
- ddgs = DDGS()
164
- search_results = ddgs.text(query, max_results=max_results, timelimit=timelimit, region=region)
 
 
 
 
 
165
  if not search_results:
166
- raise HTTPException(status_code=404, detail="No results found")
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  except Exception as e:
168
- raise HTTPException(status_code=500, detail=f"Error in search: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  # Instance to convert content with docling
171
  converter = DocumentConverter()
@@ -233,11 +438,11 @@ async def search(
233
  completion_tokens=completion_tokens
234
  )
235
 
236
- # Procesa los resultados de forma concurrente para ir más rápido
237
  with concurrent.futures.ThreadPoolExecutor() as executor:
238
  results = list(executor.map(process_result, search_results))
239
 
240
- processing_time = time.time() - start_time # Cálculo del tiempo total
241
 
242
  # Calculate total tokens
243
  total_input_tokens = sum(r.input_tokens for r in results)
@@ -362,14 +567,20 @@ async def fetch(
362
  total_url_summaries_nok=total_url_summaries_nok
363
  )
364
 
365
-
366
- # Local execution configuration
367
  if __name__ == "__main__":
368
- import uvicorn
369
-
370
- # Configurable variables
371
- HOST = "0.0.0.0"
372
- PORT = 8000
 
373
 
374
- # Start the Uvicorn server
375
- uvicorn.run("index:app", host=HOST, port=PORT, reload=True)
 
 
 
 
 
 
 
 
3
  import requests
4
  import openai
5
  from duckduckgo_search import DDGS
6
+ from googleapiclient.discovery import build
7
  import concurrent.futures
8
  from docling.document_converter import DocumentConverter
9
  import dotenv
 
14
  import sys
15
  import pytz
16
  from datetime import datetime
17
+ import uvicorn
18
+ import logging
19
 
20
+ # Load environment variables
21
  dotenv.load_dotenv()
22
+ SEARCH_ENGINE = os.getenv("SEARCH_ENGINE", "duckduckgo") # Default to DuckDuckGo
23
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
24
+ GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
25
+
26
+ # OpenAI API configuration
27
  openai_api_key = os.getenv("OPENAI_API_KEY")
28
+ api_key = os.getenv("API_KEY")
29
  client = openai.Client()
30
 
31
  app = FastAPI(
 
117
  total_url_summaries_ok: int
118
  total_url_summaries_nok: int
119
 
120
class SearchEngineFactory:
    """Builds the search backend selected by the SEARCH_ENGINE setting."""

    @staticmethod
    def create_search_engine():
        """Return a new search engine instance.

        A GoogleSearchEngine is created when SEARCH_ENGINE equals "google"
        (compared case-insensitively); any other value yields a
        DuckDuckGoSearchEngine.
        """
        wants_google = SEARCH_ENGINE.lower() == "google"
        # The conditional expression only evaluates the chosen class name.
        engine_cls = GoogleSearchEngine if wants_google else DuckDuckGoSearchEngine
        return engine_cls()
126
+
127
class SearchEngineBase:
    """Common interface that every search backend implements."""

    async def search(self, query: str, max_results: int, **kwargs) -> list:
        """Run a search and return the matching results.

        Args:
            query: The search term or phrase.
            max_results: Maximum number of results to return.
            **kwargs: Backend-specific options.

        Returns:
            A list of results, as produced by the concrete backend.

        Raises:
            NotImplementedError: Always; concrete backends must override
                this method.
        """
        # Fail loudly instead of silently returning None, which would
        # contradict the declared ``list`` return type.
        raise NotImplementedError(
            f"{type(self).__name__} must implement search()"
        )
130
+
131
class DuckDuckGoSearchEngine(SearchEngineBase):
    """Search backend that queries DuckDuckGo through the DDGS client."""

    async def search(self, query: str, max_results: int, timelimit: str = "m", region: str = "us-en", **kwargs) -> list:
        """Search DuckDuckGo and normalise the hits.

        Args:
            query: The search term or phrase.
            max_results: Maximum number of results to request.
            timelimit: Time range passed through to ``DDGS.text``.
            region: Region code passed through to ``DDGS.text``.
            **kwargs: Accepted for signature compatibility with the other
                backends; ignored here.

        Returns:
            A list of ``{"href": ..., "body": ...}`` dicts, empty when the
            search produced nothing.

        Raises:
            HTTPException: 500 when the DuckDuckGo request or result
                handling fails.
        """
        try:
            with DDGS() as ddgs:
                hits = ddgs.text(query, max_results=max_results, timelimit=timelimit, region=region)
                normalised = []
                for hit in hits or []:
                    # DDGS.text exposes the URL under "href"; "link" is kept
                    # only as a fallback for differently shaped results.
                    url = hit.get("href") or hit.get("link")
                    if not url:
                        # A hit without a URL cannot be fetched or summarised.
                        continue
                    normalised.append({"href": url, "body": hit.get("body", "")})
                return normalised
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"DuckDuckGo search error: {str(e)}") from e
141
+
142
class GoogleSearchEngine(SearchEngineBase):
    """Search backend that queries the Google Custom Search JSON API."""

    _log = logging.getLogger(__name__)

    # Results requested per API call; the search loop pages in steps of this size.
    _PAGE_SIZE = 10

    # Unit prefixes accepted in ``timelimit``; they are reused unchanged as the
    # unit of Google's ``dateRestrict`` value.
    _TIME_UNITS = frozenset("dwmy")

    # Language suffix of the region code -> value for Google's ``lr`` parameter.
    _LANG_MAP = {
        'ar': 'lang_ar',  # Arabic
        'bg': 'lang_bg',  # Bulgarian
        'ca': 'lang_ca',  # Catalan
        'cs': 'lang_cs',  # Czech
        'da': 'lang_da',  # Danish
        'de': 'lang_de',  # German
        'el': 'lang_el',  # Greek
        'en': 'lang_en',  # English
        'es': 'lang_es',  # Spanish
        'et': 'lang_et',  # Estonian
        'fi': 'lang_fi',  # Finnish
        'fr': 'lang_fr',  # French
        'hr': 'lang_hr',  # Croatian
        'hu': 'lang_hu',  # Hungarian
        'id': 'lang_id',  # Indonesian
        'is': 'lang_is',  # Icelandic
        'it': 'lang_it',  # Italian
        'iw': 'lang_iw',  # Hebrew
        'ja': 'lang_ja',  # Japanese
        'ko': 'lang_ko',  # Korean
        'lt': 'lang_lt',  # Lithuanian
        'lv': 'lang_lv',  # Latvian
        'nl': 'lang_nl',  # Dutch
        'no': 'lang_no',  # Norwegian
        'pl': 'lang_pl',  # Polish
        'pt': 'lang_pt',  # Portuguese
        'ro': 'lang_ro',  # Romanian
        'ru': 'lang_ru',  # Russian
        'sk': 'lang_sk',  # Slovak
        'sl': 'lang_sl',  # Slovenian
        'sr': 'lang_sr',  # Serbian
        'sv': 'lang_sv',  # Swedish
        'tr': 'lang_tr',  # Turkish
        'zh': 'lang_zh-CN',  # Default Chinese to Simplified
        # Aliases for DuckDuckGo region suffixes that differ from Google's
        # language codes (e.g. 'jp-jp', 'kr-kr', 'il-he', 'tw-tzh').
        'jp': 'lang_ja',
        'kr': 'lang_ko',
        'he': 'lang_iw',
        'tzh': 'lang_zh-TW',
    }

    def __init__(self):
        """Build the Custom Search client.

        Raises:
            ValueError: If GOOGLE_API_KEY or GOOGLE_CSE_ID is not set.
        """
        if not GOOGLE_API_KEY or not GOOGLE_CSE_ID:
            raise ValueError("Google API key and Custom Search Engine ID are required")
        self.service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)

    def _convert_timelimit(self, timelimit: str) -> "str | None":
        """Convert a DuckDuckGo-style timelimit into a Google dateRestrict value.

        The first character is the unit ('d', 'w', 'm' or 'y', any case) and
        the optional remainder is a count that defaults to 1, so "m" becomes
        "m1" and "d7" stays "d7".

        Returns:
            The dateRestrict string, or None when ``timelimit`` is empty, the
            unit is unknown, or the count is not a plain decimal number.
        """
        if not timelimit:
            return None

        unit = timelimit[0].lower()
        count = timelimit[1:] or '1'

        if unit not in self._TIME_UNITS:
            return None
        # Accept ASCII digits only: int() alone would also let through
        # values such as "-3" or " 3", which are not valid counts.
        if not (count.isascii() and count.isdigit()):
            return None
        return f"{unit}{int(count)}"

    def _convert_region_to_lang(self, region: str) -> "str | None":
        """Convert a DuckDuckGo region code into a Google ``lr`` value.

        The language is the second dash-separated part of the region
        (e.g. 'us-en' -> 'en' -> 'lang_en').

        Returns:
            The ``lr`` value, or None when ``region`` is empty, has no
            language part, or the language is not in the mapping.
        """
        if not region:
            return None

        parts = region.split('-')
        if len(parts) < 2:
            return None
        return self._LANG_MAP.get(parts[1].lower())

    async def search(self, query: str, max_results: int, timelimit: str = "m", region: str = "us-en", **kwargs) -> list:
        """Search Google Custom Search and normalise the hits.

        Args:
            query: The search term or phrase.
            max_results: Maximum number of results to return.
            timelimit: DuckDuckGo-style time range, converted to dateRestrict.
            region: DuckDuckGo-style region, converted to a language filter.
            **kwargs: Accepted for signature compatibility; ignored here.

        Returns:
            Up to ``max_results`` dicts of the form
            ``{"href": ..., "body": ...}``.

        Raises:
            HTTPException: 404 when no results are found, 500 when the
                request to Google fails.
        """
        results = []
        try:
            date_restrict = self._convert_timelimit(timelimit)
            language = self._convert_region_to_lang(region)

            for offset in range(0, max_results, self._PAGE_SIZE):
                search_params = {
                    'q': query,
                    'cx': GOOGLE_CSE_ID,
                    'start': offset + 1,  # Google result indices are 1-based
                    'num': min(self._PAGE_SIZE, max_results - offset),
                }
                if date_restrict:
                    search_params['dateRestrict'] = date_restrict
                if language:
                    search_params['lr'] = language

                self._log.debug("Google search params: %s", search_params)
                # NOTE(review): execute() is a blocking call inside an async
                # method; consider running it in a worker thread.
                response = self.service.cse().list(**search_params).execute()

                items = response.get("items")
                if not items:
                    # An empty page means later pages are empty too, so stop
                    # instead of spending further API calls.
                    self._log.info("No results found for query: %s", query)
                    break

                results.extend(
                    {"href": item["link"], "body": item.get("snippet", "")}
                    for item in items
                )
        except HTTPException:
            raise
        except Exception as e:
            self._log.error("Error during Google search: %s", e)
            raise HTTPException(status_code=500, detail=f"Google search error: {str(e)}") from e

        if not results:
            # Explicitly raise a 404 HTTPException when there are no results
            raise HTTPException(
                status_code=404,
                detail=f"No results found for query: {query}"
            )

        return results[:max_results]
280
+
281
  @app.get("/",
282
  response_model=VersionInfo,
283
  summary="API Version Information",
 
311
  )
312
  async def search(
313
  query: str = Query(..., description="The search query term or phrase"),
314
+ timelimit: str = Query("m", description="Time range for results (DuckDuckGo only): 'd' (day), 'w' (week), 'm' (month), 'y' (year)"),
315
+ region: str = Query("us-en", description="Geographic region for results (DuckDuckGo only, e.g., 'us-en', 'es-es')"),
316
  max_results: int = Query(3, description="Maximum number of results to return", ge=1, le=10),
317
  authorization: str = Header(..., description="API key for authorization", alias="Auth")
318
  ):
319
  if authorization != api_key:
320
  raise HTTPException(status_code=401, detail="Unauthorized")
321
 
322
+ start_time = time.time()
323
  madrid_tz = pytz.timezone('Europe/Madrid')
324
  current_time = datetime.now(madrid_tz).strftime('%Y-%m-%d %H:%M:%S %Z')
325
+
326
+ # Initialize counters
327
  total_input_tokens = 0
328
  total_completion_tokens = 0
329
  total_url_summaries_ok = 0
330
  total_url_summaries_nok = 0
331
 
332
+ # Create search engine instance
333
+ print(f"Using search engine: {SEARCH_ENGINE}")
334
+ search_engine = SearchEngineFactory.create_search_engine()
335
+
336
  try:
337
+ # Perform search using the selected engine
338
+ search_results = await search_engine.search(
339
+ query=query,
340
+ max_results=max_results,
341
+ timelimit=timelimit,
342
+ region=region
343
+ )
344
+
345
  if not search_results:
346
+ return SearchResponse(
347
+ query=query,
348
+ results=[],
349
+ processing_time=round(time.time() - start_time, 2),
350
+ timestamp=current_time,
351
+ total_input_tokens=0,
352
+ total_completion_tokens=0,
353
+ total_url_summaries_ok=0,
354
+ total_url_summaries_nok=0
355
+ )
356
+
357
+ except HTTPException as he:
358
+ # Re-raise HTTP exceptions (like 404) without wrapping
359
+ raise he
360
  except Exception as e:
361
+ # For other errors, check if it's a "no results" case
362
+ if "No results found" in str(e):
363
+ return SearchResponse(
364
+ query=query,
365
+ results=[],
366
+ processing_time=round(time.time() - start_time, 2),
367
+ timestamp=current_time,
368
+ total_input_tokens=0,
369
+ total_completion_tokens=0,
370
+ total_url_summaries_ok=0,
371
+ total_url_summaries_nok=0
372
+ )
373
+ raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
374
 
375
  # Instance to convert content with docling
376
  converter = DocumentConverter()
 
438
  completion_tokens=completion_tokens
439
  )
440
 
441
+ # Process the results concurrently to speed things up
442
  with concurrent.futures.ThreadPoolExecutor() as executor:
443
  results = list(executor.map(process_result, search_results))
444
 
445
+ processing_time = time.time() - start_time # Cálculo del tiempo total
446
 
447
  # Calculate total tokens
448
  total_input_tokens = sum(r.input_tokens for r in results)
 
567
  total_url_summaries_nok=total_url_summaries_nok
568
  )
569
 
 
 
570
if __name__ == "__main__":
    # Timestamped, level-tagged log lines for local runs
    log_settings = {
        "level": logging.INFO,
        "format": '%(asctime)s - %(levelname)s - %(message)s',
        "datefmt": '%Y-%m-%d %H:%M:%S',
    }
    logging.basicConfig(**log_settings)

    # Development server settings: hot reload enabled, watching the
    # current directory
    server_settings = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
        "reload_dirs": ["./"],
        "log_level": "info",
    }
    uvicorn.run("index:app", **server_settings)