Spaces:
Sleeping
Sleeping
Update api/index.py
Browse files- api/index.py +232 -21
api/index.py
CHANGED
|
@@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict
|
|
| 3 |
import requests
|
| 4 |
import openai
|
| 5 |
from duckduckgo_search import DDGS
|
|
|
|
| 6 |
import concurrent.futures
|
| 7 |
from docling.document_converter import DocumentConverter
|
| 8 |
import dotenv
|
|
@@ -13,11 +14,18 @@ import fastapi
|
|
| 13 |
import sys
|
| 14 |
import pytz
|
| 15 |
from datetime import datetime
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
#
|
| 18 |
dotenv.load_dotenv()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
openai_api_key = os.getenv("OPENAI_API_KEY")
|
| 20 |
-
api_key = os.getenv("API_KEY")
|
| 21 |
client = openai.Client()
|
| 22 |
|
| 23 |
app = FastAPI(
|
|
@@ -109,6 +117,167 @@ class FetchResult(BaseModel):
|
|
| 109 |
total_url_summaries_ok: int
|
| 110 |
total_url_summaries_nok: int
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
@app.get("/",
|
| 113 |
response_model=VersionInfo,
|
| 114 |
summary="API Version Information",
|
|
@@ -142,30 +311,66 @@ async def get_version_info():
|
|
| 142 |
)
|
| 143 |
async def search(
|
| 144 |
query: str = Query(..., description="The search query term or phrase"),
|
| 145 |
-
timelimit: str = Query("m", description="Time range for results: 'd' (day), 'w' (week), 'm' (month), 'y' (year)"),
|
| 146 |
-
region: str = Query("us-en", description="Geographic region for results (e.g., 'us-en', 'es-es')"),
|
| 147 |
max_results: int = Query(3, description="Maximum number of results to return", ge=1, le=10),
|
| 148 |
authorization: str = Header(..., description="API key for authorization", alias="Auth")
|
| 149 |
):
|
| 150 |
if authorization != api_key:
|
| 151 |
raise HTTPException(status_code=401, detail="Unauthorized")
|
| 152 |
|
| 153 |
-
start_time = time.time()
|
| 154 |
madrid_tz = pytz.timezone('Europe/Madrid')
|
| 155 |
current_time = datetime.now(madrid_tz).strftime('%Y-%m-%d %H:%M:%S %Z')
|
|
|
|
|
|
|
| 156 |
total_input_tokens = 0
|
| 157 |
total_completion_tokens = 0
|
| 158 |
total_url_summaries_ok = 0
|
| 159 |
total_url_summaries_nok = 0
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
try:
|
| 162 |
-
# Perform
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
if not search_results:
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
except Exception as e:
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
# Instance to convert content with docling
|
| 171 |
converter = DocumentConverter()
|
|
@@ -233,11 +438,11 @@ async def search(
|
|
| 233 |
completion_tokens=completion_tokens
|
| 234 |
)
|
| 235 |
|
| 236 |
-
# Procesa los resultados de forma concurrente para ir
|
| 237 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 238 |
results = list(executor.map(process_result, search_results))
|
| 239 |
|
| 240 |
-
processing_time = time.time() - start_time #
|
| 241 |
|
| 242 |
# Calculate total tokens
|
| 243 |
total_input_tokens = sum(r.input_tokens for r in results)
|
|
@@ -362,14 +567,20 @@ async def fetch(
|
|
| 362 |
total_url_summaries_nok=total_url_summaries_nok
|
| 363 |
)
|
| 364 |
|
| 365 |
-
|
| 366 |
-
# Local execution configuration
|
| 367 |
if __name__ == "__main__":
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
|
|
|
| 373 |
|
| 374 |
-
#
|
| 375 |
-
uvicorn.run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import requests
|
| 4 |
import openai
|
| 5 |
from duckduckgo_search import DDGS
|
| 6 |
+
from googleapiclient.discovery import build
|
| 7 |
import concurrent.futures
|
| 8 |
from docling.document_converter import DocumentConverter
|
| 9 |
import dotenv
|
|
|
|
| 14 |
import sys
|
| 15 |
import pytz
|
| 16 |
from datetime import datetime
|
| 17 |
+
import uvicorn
|
| 18 |
+
import logging
|
| 19 |
|
| 20 |
+
# Load environment variables
|
| 21 |
dotenv.load_dotenv()
|
| 22 |
+
SEARCH_ENGINE = os.getenv("SEARCH_ENGINE", "duckduckgo") # Default to DuckDuckGo
|
| 23 |
+
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 24 |
+
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
|
| 25 |
+
|
| 26 |
+
# OpenAI API configuration
|
| 27 |
openai_api_key = os.getenv("OPENAI_API_KEY")
|
| 28 |
+
api_key = os.getenv("API_KEY")
|
| 29 |
client = openai.Client()
|
| 30 |
|
| 31 |
app = FastAPI(
|
|
|
|
| 117 |
total_url_summaries_ok: int
|
| 118 |
total_url_summaries_nok: int
|
| 119 |
|
| 120 |
+
class SearchEngineFactory:
    """Builds the search backend selected by the module-level ``SEARCH_ENGINE`` setting."""

    @staticmethod
    def create_search_engine():
        """Return a new engine instance for the configured backend.

        The setting is compared case-insensitively. ``"google"`` selects
        ``GoogleSearchEngine``; every other value selects
        ``DuckDuckGoSearchEngine``.
        """
        wants_google = SEARCH_ENGINE.lower() == "google"
        engine_cls = GoogleSearchEngine if wants_google else DuckDuckGoSearchEngine
        return engine_cls()
|
| 126 |
+
|
| 127 |
+
class SearchEngineBase:
    """Common interface implemented by every search backend."""

    async def search(self, query: str, max_results: int, **kwargs) -> list:
        """Run a web search and return the hits.

        Args:
            query: The search term or phrase.
            max_results: Upper bound on the number of hits to return.
            **kwargs: Backend-specific options (the subclasses in this file
                accept ``timelimit`` and ``region``).

        Returns:
            A list of hits; the subclasses in this file return dicts with
            ``"href"`` and ``"body"`` keys.

        Raises:
            NotImplementedError: Always. Subclasses must override this
                method; failing loudly keeps a missing override from being
                mistaken for an empty result set.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement search()"
        )
|
| 130 |
+
|
| 131 |
+
class DuckDuckGoSearchEngine(SearchEngineBase):
    """Search backend that queries DuckDuckGo through ``DDGS``."""

    async def search(self, query: str, max_results: int, timelimit: str = "m", region: str = "us-en", **kwargs) -> list:
        """Search DuckDuckGo and return the hits as ``href``/``body`` dicts.

        Args:
            query: The search term or phrase.
            max_results: Maximum number of hits to request.
            timelimit: Time range passed through to ``DDGS.text``.
            region: Region code passed through to ``DDGS.text``.
            **kwargs: Ignored; accepted so the signature matches the other
                search backends.

        Returns:
            A list of ``{"href": ..., "body": ...}`` dicts, empty when the
            search yields nothing.

        Raises:
            HTTPException: With status 500 when the search or the result
                conversion fails for any reason.
        """
        try:
            # NOTE(review): DDGS.text is called synchronously inside this
            # coroutine, so the event loop waits while the request runs.
            with DDGS() as ddgs:
                hits = list(ddgs.text(query, max_results=max_results, timelimit=timelimit, region=region))
            return [
                {
                    # DDGS.text results carry the URL under "href"; "link" is
                    # kept only as a fallback for result shapes that use it.
                    "href": hit["href"] if "href" in hit else hit["link"],
                    "body": hit.get("body", ""),
                }
                for hit in hits
            ]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"DuckDuckGo search error: {str(e)}") from e
|
| 141 |
+
|
| 142 |
+
class GoogleSearchEngine(SearchEngineBase):
    """Search backend that queries the Google Custom Search API.

    Accepts the same ``timelimit`` and ``region`` arguments as the DuckDuckGo
    backend and translates them into the ``dateRestrict`` and ``lr`` request
    parameters.
    """

    _logger = logging.getLogger(__name__)

    # Units accepted in a timelimit; dateRestrict uses the same letters.
    _TIME_UNITS = frozenset("dwmy")

    # Number of results requested per API call.
    _PAGE_SIZE = 10

    # Language suffix of a DuckDuckGo region code -> value for the ``lr`` parameter.
    _LANG_MAP = {
        'ar': 'lang_ar',  # Arabic
        'bg': 'lang_bg',  # Bulgarian
        'ca': 'lang_ca',  # Catalan
        'cs': 'lang_cs',  # Czech
        'da': 'lang_da',  # Danish
        'de': 'lang_de',  # German
        'el': 'lang_el',  # Greek
        'en': 'lang_en',  # English
        'es': 'lang_es',  # Spanish
        'et': 'lang_et',  # Estonian
        'fi': 'lang_fi',  # Finnish
        'fr': 'lang_fr',  # French
        'hr': 'lang_hr',  # Croatian
        'hu': 'lang_hu',  # Hungarian
        'id': 'lang_id',  # Indonesian
        'is': 'lang_is',  # Icelandic
        'it': 'lang_it',  # Italian
        'iw': 'lang_iw',  # Hebrew
        'he': 'lang_iw',  # Hebrew (suffix of DuckDuckGo's 'il-he')
        'ja': 'lang_ja',  # Japanese
        'jp': 'lang_ja',  # Japanese (suffix of DuckDuckGo's 'jp-jp')
        'ko': 'lang_ko',  # Korean
        'kr': 'lang_ko',  # Korean (suffix of DuckDuckGo's 'kr-kr')
        'lt': 'lang_lt',  # Lithuanian
        'lv': 'lang_lv',  # Latvian
        'nl': 'lang_nl',  # Dutch
        'no': 'lang_no',  # Norwegian
        'pl': 'lang_pl',  # Polish
        'pt': 'lang_pt',  # Portuguese
        'ro': 'lang_ro',  # Romanian
        'ru': 'lang_ru',  # Russian
        'sk': 'lang_sk',  # Slovak
        'sl': 'lang_sl',  # Slovenian
        'sr': 'lang_sr',  # Serbian
        'sv': 'lang_sv',  # Swedish
        'tr': 'lang_tr',  # Turkish
        'zh': 'lang_zh-CN',  # Default Chinese to Simplified
        'tzh': 'lang_zh-TW',  # Traditional Chinese ('tw-tzh', 'hk-tzh')
    }

    def __init__(self):
        """Build the Custom Search client.

        Raises:
            ValueError: If ``GOOGLE_API_KEY`` or ``GOOGLE_CSE_ID`` is unset
                or empty.
        """
        if not GOOGLE_API_KEY or not GOOGLE_CSE_ID:
            raise ValueError("Google API key and Custom Search Engine ID are required")
        self.service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)

    def _convert_timelimit(self, timelimit: str) -> "str | None":
        """Convert DuckDuckGo timelimit format to Google dateRestrict format.

        The first character is the unit (``d``, ``w``, ``m`` or ``y``, any
        case); the rest is an optional count that defaults to 1.

        Returns:
            ``"<unit><count>"`` (for example ``"m1"``), or ``None`` when the
            value is empty, the unit is unknown, or the count is not a
            positive decimal number.
        """
        if not timelimit:
            return None

        unit = timelimit[0].lower()
        count = timelimit[1:] or '1'

        if unit not in self._TIME_UNITS:
            return None

        # int() alone would also accept "-1", "+5", " 3" or "1_0", none of
        # which is a usable count, so require plain ASCII digits.
        if not (count.isascii() and count.isdigit()) or int(count) < 1:
            return None

        return f"{unit}{int(count)}"

    def _convert_region_to_lang(self, region: str) -> "str | None":
        """Convert DuckDuckGo region format to Google Search API language restriction.

        Returns:
            The ``lang_xx`` value for the language part of the region
            (``'us-en'`` -> ``'lang_en'``), or ``None`` when the region is
            empty, has no ``-`` separator, or names an unmapped language.
        """
        if not region:
            return None

        parts = region.split('-')
        if len(parts) < 2:
            return None

        lang_code = parts[1].lower()
        self._logger.debug("Converting region %s to language code %s", region, lang_code)
        return self._LANG_MAP.get(lang_code)

    async def search(self, query: str, max_results: int, timelimit: str = "m", region: str = "us-en", **kwargs) -> list:
        """Search Google and return the hits as ``href``/``body`` dicts.

        Args:
            query: The search term or phrase.
            max_results: Maximum number of hits to return.
            timelimit: DuckDuckGo-style time range; converted to
                ``dateRestrict`` when valid, otherwise omitted.
            region: DuckDuckGo-style region; converted to ``lr`` when the
                language is mapped, otherwise omitted.
            **kwargs: Ignored.

        Returns:
            A list of at most ``max_results`` dicts of the form
            ``{"href": ..., "body": ...}``.

        Raises:
            HTTPException: With status 404 when the search yields no hits,
                or status 500 when the request fails.
        """
        try:
            date_restrict = self._convert_timelimit(timelimit)
            language = self._convert_region_to_lang(region)

            results = []
            for offset in range(0, max_results, self._PAGE_SIZE):
                search_params = {
                    'q': query,
                    'cx': GOOGLE_CSE_ID,
                    'start': offset + 1,
                    'num': min(self._PAGE_SIZE, max_results - offset),
                }
                if date_restrict:
                    search_params['dateRestrict'] = date_restrict
                if language:
                    search_params['lr'] = language

                # NOTE(review): execute() is called synchronously inside this
                # coroutine, so the event loop waits while the request runs.
                response = self.service.cse().list(**search_params).execute()

                items = response.get("items")
                if not items:
                    # An empty page means later pages are empty as well, so
                    # stop instead of spending more API calls.
                    self._logger.info("No results found for query: %s", query)
                    break

                results.extend(
                    {"href": item["link"], "body": item.get("snippet", "")}
                    for item in items
                )

            if not results:
                # Explicitly raise a 404 HTTPException when there are no results.
                raise HTTPException(
                    status_code=404,
                    detail=f"No results found for query: {query}"
                )

            return results[:max_results]

        except HTTPException:
            # Pass HTTP errors (such as the 404 above) through unchanged.
            raise
        except Exception as e:
            self._logger.exception("Error during Google search")
            raise HTTPException(status_code=500, detail=f"Google search error: {str(e)}") from e
|
| 280 |
+
|
| 281 |
@app.get("/",
|
| 282 |
response_model=VersionInfo,
|
| 283 |
summary="API Version Information",
|
|
|
|
| 311 |
)
|
| 312 |
async def search(
|
| 313 |
query: str = Query(..., description="The search query term or phrase"),
|
| 314 |
+
timelimit: str = Query("m", description="Time range for results (DuckDuckGo only): 'd' (day), 'w' (week), 'm' (month), 'y' (year)"),
|
| 315 |
+
region: str = Query("us-en", description="Geographic region for results (DuckDuckGo only, e.g., 'us-en', 'es-es')"),
|
| 316 |
max_results: int = Query(3, description="Maximum number of results to return", ge=1, le=10),
|
| 317 |
authorization: str = Header(..., description="API key for authorization", alias="Auth")
|
| 318 |
):
|
| 319 |
if authorization != api_key:
|
| 320 |
raise HTTPException(status_code=401, detail="Unauthorized")
|
| 321 |
|
| 322 |
+
start_time = time.time()
|
| 323 |
madrid_tz = pytz.timezone('Europe/Madrid')
|
| 324 |
current_time = datetime.now(madrid_tz).strftime('%Y-%m-%d %H:%M:%S %Z')
|
| 325 |
+
|
| 326 |
+
# Initialize counters
|
| 327 |
total_input_tokens = 0
|
| 328 |
total_completion_tokens = 0
|
| 329 |
total_url_summaries_ok = 0
|
| 330 |
total_url_summaries_nok = 0
|
| 331 |
|
| 332 |
+
# Create search engine instance
|
| 333 |
+
print(f"Using search engine: {SEARCH_ENGINE}")
|
| 334 |
+
search_engine = SearchEngineFactory.create_search_engine()
|
| 335 |
+
|
| 336 |
try:
|
| 337 |
+
# Perform search using the selected engine
|
| 338 |
+
search_results = await search_engine.search(
|
| 339 |
+
query=query,
|
| 340 |
+
max_results=max_results,
|
| 341 |
+
timelimit=timelimit,
|
| 342 |
+
region=region
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
if not search_results:
|
| 346 |
+
return SearchResponse(
|
| 347 |
+
query=query,
|
| 348 |
+
results=[],
|
| 349 |
+
processing_time=round(time.time() - start_time, 2),
|
| 350 |
+
timestamp=current_time,
|
| 351 |
+
total_input_tokens=0,
|
| 352 |
+
total_completion_tokens=0,
|
| 353 |
+
total_url_summaries_ok=0,
|
| 354 |
+
total_url_summaries_nok=0
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
except HTTPException as he:
|
| 358 |
+
# Re-raise HTTP exceptions (like 404) without wrapping
|
| 359 |
+
raise he
|
| 360 |
except Exception as e:
|
| 361 |
+
# For other errors, check if it's a "no results" case
|
| 362 |
+
if "No results found" in str(e):
|
| 363 |
+
return SearchResponse(
|
| 364 |
+
query=query,
|
| 365 |
+
results=[],
|
| 366 |
+
processing_time=round(time.time() - start_time, 2),
|
| 367 |
+
timestamp=current_time,
|
| 368 |
+
total_input_tokens=0,
|
| 369 |
+
total_completion_tokens=0,
|
| 370 |
+
total_url_summaries_ok=0,
|
| 371 |
+
total_url_summaries_nok=0
|
| 372 |
+
)
|
| 373 |
+
raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
|
| 374 |
|
| 375 |
# Instance to convert content with docling
|
| 376 |
converter = DocumentConverter()
|
|
|
|
| 438 |
completion_tokens=completion_tokens
|
| 439 |
)
|
| 440 |
|
| 441 |
+
# Process the results concurrently to speed things up
|
| 442 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 443 |
results = list(executor.map(process_result, search_results))
|
| 444 |
|
| 445 |
+
processing_time = time.time() - start_time # Cálculo del tiempo total
|
| 446 |
|
| 447 |
# Calculate total tokens
|
| 448 |
total_input_tokens = sum(r.input_tokens for r in results)
|
|
|
|
| 567 |
total_url_summaries_nok=total_url_summaries_nok
|
| 568 |
)
|
| 569 |
|
|
|
|
|
|
|
| 570 |
if __name__ == "__main__":
    # Local execution only: set up timestamped logging, then start the
    # development server.
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    logging.basicConfig(level=logging.INFO, format=log_format, datefmt=timestamp_format)

    # Server options; reload is on, so watched source changes restart the app.
    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": True,
        "reload_dirs": ["./"],
        "log_level": "info",
    }
    uvicorn.run("index:app", **server_options)
|