MANOJSEQ committed
Commit 067fead · verified · 1 Parent(s): 1553332

Upload 4 files

Files changed (4)
  1. Dockerfile +35 -0
  2. domain_country_map.py +87 -0
  3. main.py +2022 -0
  4. requirements.txt +14 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ # ---- Base image ----
+ FROM python:3.10-slim
+
+ ENV PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     HF_HUB_DISABLE_TELEMETRY=1 \
+     PORT=7860
+
+ # (optional) handy tools for healthchecks & logs
+ RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
+     rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # ---- Python deps ----
+ COPY requirements.txt ./
+ RUN python -m pip install --upgrade pip && \
+     # Install CPU-only PyTorch first (lighter & reliable on Spaces)
+     pip install torch --index-url https://download.pytorch.org/whl/cpu && \
+     pip install -r requirements.txt
+
+ # ---- App code ----
+ COPY . .
+
+ # (optional) warm the sentence-transformers model into the image layer
+ RUN python - <<'PY'
+ from sentence_transformers import SentenceTransformer
+ SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ print("✅ SBERT model cached")
+ PY
+
+ EXPOSE 7860
+
+ # ---- Run ----
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
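For quick local testing outside the container, a minimal sketch equivalent to the CMD above (assuming uvicorn and the app's requirements are installed in the local environment):

import uvicorn

if __name__ == "__main__":
    # Same interface and port as the Dockerfile's EXPOSE and CMD.
    uvicorn.run("main:app", host="0.0.0.0", port=7860)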
domain_country_map.py ADDED
@@ -0,0 +1,87 @@
+ domain_country_map = {
+     "bbc": "United Kingdom",
+     "cnn": "United States",
+     "reuters": "United Kingdom",
+     "nytimes": "United States",
+     "theguardian": "United Kingdom",
+     "aljazeera": "Qatar",
+     "timesofindia": "India",
+     "indiatimes": "India",
+     "hindustantimes": "India",
+     "ndtv": "India",
+     "abc": "United States",
+     "foxnews": "United States",
+     "nbcnews": "United States",
+     "washingtonpost": "United States",
+     "bloomberg": "United States",
+     "forbes": "United States",
+     "cbsnews": "United States",
+     "dailymail": "United Kingdom",
+     "telegraph": "United Kingdom",
+     "usatoday": "United States",
+     "latimes": "United States",
+     "sydneynews": "Australia",
+     "smh": "Australia",
+     "afr": "Australia",
+     "thehindu": "India",
+     "deccanherald": "India",
+     "japantimes": "Japan",
+     "lemonde": "France",
+     "dw": "Germany",
+     "zeit": "Germany",
+     "globaltimes": "China",
+     "scmp": "Hong Kong",
+     "straitstimes": "Singapore",
+     "globeandmail": "Canada",
+     "torontostar": "Canada",
+     "cbc": "Canada",
+     "ctvnews": "Canada",
+     "koreatimes": "South Korea",
+     "yonhapnews": "South Korea",
+     "vanguardngr": "Nigeria",
+     "punchng": "Nigeria",
+     "saharareporters": "Nigeria",
+     "theeastafrican": "Kenya",
+     "nation": "Kenya",
+     "gulfnews": "United Arab Emirates",
+     "khaleejtimes": "United Arab Emirates",
+     "businessinsider": "United States",
+     "buzzfeed": "United States",
+     "vice": "United States",
+     "huffpost": "United States",
+     "wired": "United States",
+     "verge": "United States",
+     "techcrunch": "United States",
+     "mashable": "United States",
+     "gizmodo": "United States",
+     "bleedingcool": "United States",
+     "ibtimes": "United Kingdom",
+     "ibtimes.com.au": "Australia",
+     "nativenewsonline": "United States",
+     "slashdot": "United States",
+     "bgr": "United States",
+     "biztoc": "United States",
+     "abcnews": "Australia",
+     "abcnews.go": "United States",
+     "ft": "United Kingdom",
+     "deadline": "United States",
+     "apnews": "United States",
+     "pbs": "United States",
+     "cna": "Singapore",
+     "chinadaily": "China",
+     "jakartapost": "Indonesia",
+     "bangkokpost": "Thailand",
+     "manilabulletin": "Philippines",
+     "philstar": "Philippines",
+     "rappler": "Philippines",
+     "eluniversal": "Mexico",
+     "clarin": "Argentina",
+     "folha": "Brazil",
+     "oglobo": "Brazil",
+     "haaretz": "Israel",
+     "timesofisrael": "Israel",
+     "arabnews": "Saudi Arabia",
+     "dawn": "Pakistan",
+     "thenews": "Pakistan",
+     "geo": "Pakistan"
+ }
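As a usage sketch (not part of the upload), the map resolves an outlet's country from an article URL via tldextract, mirroring how main.py consults it inside geocode_source; the helper name resolve_outlet_country below is illustrative:

import tldextract
from domain_country_map import domain_country_map

def resolve_outlet_country(url: str) -> str:
    # tldextract splits "www.bbc.co.uk" into subdomain "www", domain "bbc", suffix "co.uk"
    ext = tldextract.extract(url)
    return domain_country_map.get(ext.domain, "Unknown")

print(resolve_outlet_country("https://www.bbc.co.uk/news/world"))  # United Kingdom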
main.py ADDED
@@ -0,0 +1,2022 @@
1
+ from fastapi import FastAPI, Query, HTTPException, Body
2
+ from typing import Optional, List, Dict, Any, Tuple, Set
3
+ import os
4
+ import time
5
+ import socket
6
+ import logging
7
+ import hashlib
8
+ from functools import lru_cache
9
+ from collections import Counter
10
+ import requests
11
+ import tldextract
12
+ import math
13
+ import nltk
14
+ from nltk.sentiment import SentimentIntensityAnalyzer
15
+ from geopy.geocoders import Nominatim
16
+ from geopy.exc import GeocoderUnavailable, GeocoderTimedOut
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from countryinfo import CountryInfo
19
+ from sentence_transformers import SentenceTransformer, util
20
+ from domain_country_map import domain_country_map
21
+ from time import monotonic
22
+ from langdetect import detect, DetectorFactory
23
+ import re
24
+ from urllib.parse import urlparse, urlunparse, parse_qsl
25
+ from concurrent.futures import ThreadPoolExecutor, as_completed
26
+ from html import unescape
27
+ import threading
28
+ import difflib
29
+ from starlette.middleware.gzip import GZipMiddleware
30
+ from transformers import pipeline as hf_pipeline
31
+ import os
32
+ os.environ.setdefault("OMP_NUM_THREADS", "1")
33
+ from fastapi import Path
34
+
35
+ import torch
36
+ torch.set_num_threads(2)
37
+
38
+ # Optional runtime check for local OPUS tokenizers
39
+ try:
40
+ import sentencepiece as _spm # noqa: F401
41
+ _HAS_SENTENCEPIECE = True
42
+ except Exception:
43
+ _HAS_SENTENCEPIECE = False
44
+
45
+
46
+ from enum import Enum
47
+ class Speed(str, Enum):
48
+ fast = "fast"
49
+ balanced = "balanced"
50
+ max = "max"
51
+
52
+ _local_pipes = {}
53
+ _news_clf = None
54
+ _sbert = None
55
+
56
+ # --- Translation runtime flags / caches ---
57
+ ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
58
+ _hf_bad_models: Set[str] = set()
59
+
60
+
61
+ def _translate_local(text: str, src: str, tgt: str) -> Optional[str]:
62
+ if not _HAS_SENTENCEPIECE:
63
+ # Avoid attempting to download/instantiate Marian tokenizers without sentencepiece
64
+ return None
65
+ model_id = opus_model_for(src, tgt)
66
+ if not model_id:
67
+ return None
68
+ key = model_id
69
+ try:
70
+ if key not in _local_pipes:
71
+ _local_pipes[key] = hf_pipeline("translation", model=model_id)
72
+ out = _local_pipes[key](text, max_length=512)
73
+ return out[0]["translation_text"]
74
+ except Exception as e:
75
+ log.warning("Local translate failed for %s: %s", model_id, e)
76
+ return None
77
+
78
+
79
+ def fetch_gdelt_multi(limit=120, query=None, language=None, timespan="48h", category=None, speed: Speed = Speed.balanced):
80
+ # If user forced a language, honor it (but add a small English boost for coverage)
81
+ if language:
82
+ primary = fetch_gdelt_articles(limit=limit, query=query, language=language, timespan=timespan, category=category)
83
+ # tiny English booster to catch global wires
84
+ booster = fetch_gdelt_articles(limit=max(10, limit // 6), query=query, language="en", timespan=timespan, category=category)
85
+ return primary + booster
86
+
87
+ # Otherwise rotate across multiple languages
88
+ if speed == Speed.fast:
89
+ langs = LANG_ROTATION[:3] # quicker
90
+ timespan = "24h"
91
+ elif speed == Speed.balanced:
92
+ langs = LANG_ROTATION[:8] # good mix
93
+ timespan = "48h"
94
+ else:
95
+ langs = LANG_ROTATION # max coverage
96
+ timespan = "3d"
97
+
98
+ per_lang = max(8, math.ceil(limit / len(langs)))
99
+ out = []
100
+ for lg in langs:
101
+ out.extend(fetch_gdelt_articles(limit=per_lang, query=query, language=lg, timespan=timespan, category=category))
102
+
103
+ # Optional: add a few English pulls biased to different source countries (broadens outlets)
104
+ if speed != Speed.fast:
105
+ per_cc = max(4, limit // 30) if speed == Speed.max else max(2, limit // 40)
106
+ for cc in COUNTRY_SEEDS[: (8 if speed == Speed.balanced else 16)]:
107
+ out.extend(
108
+ fetch_gdelt_articles(
109
+ limit=per_cc,
110
+ query=query,
111
+ language="en",
112
+ timespan=timespan,
113
+ category=category,
114
+ extra_tokens=[f"sourcecountry:{cc}"]
115
+ )
116
+ )
117
+
118
+ return out
119
+
120
+
121
+ def get_news_clf():
122
+ global _news_clf
123
+ if _news_clf is None:
124
+ _news_clf = hf_pipeline(
125
+ "text-classification",
126
+ model="cardiffnlp/tweet-topic-21-multi",
127
+ top_k=1,
128
+ )
129
+ return _news_clf
130
+
131
+ def get_sbert():
132
+ global _sbert
133
+ if _sbert is None:
134
+ _sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
135
+ return _sbert
136
+
137
+ # globals
138
+ SESSION = requests.Session()
139
+ ADAPTER = requests.adapters.HTTPAdapter(pool_connections=64, pool_maxsize=64, max_retries=2)
140
+ SESSION.mount("http://", ADAPTER)
141
+ SESSION.mount("https://", ADAPTER)
142
+
143
+ def _session_get(url, **kwargs):
144
+ headers = kwargs.pop("headers", {})
145
+ headers.setdefault("User-Agent", "Mozilla/5.0 (compatible; NewsGlobe/1.0)")
146
+ return SESSION.get(url, headers=headers, timeout=kwargs.pop("timeout", 12), **kwargs)
147
+
148
+ def _try_jina_reader(url: str, timeout: int) -> Optional[str]:
149
+ try:
150
+ u = url.strip()
151
+ if not u.startswith(("http://", "https://")):
152
+ u = "https://" + u
153
+ r = _session_get(f"https://r.jina.ai/{u}", timeout=timeout)
154
+ if r.status_code == 200:
155
+ txt = _clean_text(r.text)
156
+ sents = _split_sentences(txt)
157
+ if sents:
158
+ best = " ".join(sents[:2])
159
+ return best if len(best) >= 80 else (sents[0] if sents else None)
160
+ except Exception:
161
+ pass
162
+ return None
163
+
164
+
165
+
166
+ # --- description cleanup helpers ---
167
+
168
+ BOILER_DESC = re.compile(
169
+ r"(subscribe|sign in|sign up|enable javascript|cookies? (policy|settings)|"
170
+ r"privacy (policy|notice)|continue reading|read more|click here|"
171
+ r"accept (cookies|the terms)|by continuing|newsletter|advertisement|adblock)",
172
+ re.I
173
+ )
174
+
175
+ def _split_sentences(text: str) -> List[str]:
176
+ # light-weight splitter good enough for news blurbs
177
+ parts = re.split(r"(?<=[\.\?\!])\s+(?=[A-Z0-9])", (text or "").strip())
178
+ # also break on " • " and long dashes if present
179
+ out = []
180
+ for p in parts:
181
+ out.extend(re.split(r"\s+[•–—]\s+", p))
182
+ return [p.strip() for p in out if p and len(p.strip()) >= 2]
183
+
184
+ def _tidy_description(title: str, desc: str, source_name: str, max_chars: int = 240) -> str:
185
+ if not desc:
186
+ return ""
187
+
188
+ # remove repeated title
189
+ desc = _dedupe_title_from_desc(title, desc)
190
+
191
+ # strip obvious boilerplate
192
+ desc = BOILER_DESC.sub("", desc)
193
+ desc = re.sub(r"\s+", " ", desc).strip(" -–:•|")
194
+
195
+ # choose first 1–2 sentences that look like summary
196
+ sents = _split_sentences(desc)
197
+ if not sents:
198
+ sents = [desc]
199
+
200
+ best = " ".join(sents[:2]).strip()
201
+
202
+ # soft truncate at sentence boundary
203
+ if len(best) > max_chars:
204
+ # try only first sentence
205
+ if len(sents[0]) <= max_chars * 0.9:
206
+ best = sents[0]
207
+ else:
208
+ best = best[:max_chars].rsplit(" ", 1)[0].rstrip(",;:-–—")
209
+
210
+ # avoid parroting the headline
211
+ if _too_similar(title, best):
212
+ # try next sentence if we have it
213
+ for alt in sents[1:3]:
214
+ if not _too_similar(title, alt):
215
+ best = alt
216
+ break
217
+
218
+ # ensure it ends neatly
219
+ if best and best[-1] not in ".!?":
220
+ best += "."
221
+ return best
222
+
223
+
224
+
225
+ def _too_similar(a: str, b: str, thresh: float = 0.92) -> bool:
226
+ """Return True if strings are near-duplicates (or one contains the other)."""
227
+ a = (a or "").strip()
228
+ b = (b or "").strip()
229
+ if not a or not b:
230
+ return False
231
+ if a.lower() == b.lower():
232
+ return True
233
+ if a.lower() in b.lower() or b.lower() in a.lower():
234
+ return True
235
+ ratio = difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()
236
+ return ratio >= thresh
237
+
238
+ def _dedupe_title_from_desc(title: str, desc: str) -> str:
239
+ """If the description contains the title, strip it and tidy up."""
240
+ t = (title or "").strip()
241
+ d = (desc or "").strip()
242
+ if not t or not d:
243
+ return d
244
+ # Remove exact leading title
245
+ if d.lower().startswith(t.lower()):
246
+ d = d[len(t):].lstrip(" -–:•|")
247
+ # Remove inner repeats
248
+ d = d.replace(t, "").strip(" -–:•|")
249
+ d = _clean_text(d)
250
+ return d
251
+
252
+
253
+ # Prevent duplicate upstream fetches when identical requests arrive together
254
+ _inflight_locks: Dict[Tuple, threading.Lock] = {}
255
+ _inflight_global_lock = threading.Lock()
256
+
257
+
258
+ def _get_inflight_lock(key: Tuple) -> threading.Lock:
259
+ with _inflight_global_lock:
260
+ lk = _inflight_locks.get(key)
261
+ if lk is None:
262
+ lk = threading.Lock()
263
+ _inflight_locks[key] = lk
264
+ return lk
265
+
266
+
267
+ DESC_CACHE_LOCK = threading.Lock()
268
+
269
+ try:
270
+ from bs4 import BeautifulSoup # optional but nice to have
271
+ except Exception:
272
+ BeautifulSoup = None
273
+
274
+
275
+ # -------- Description fetching config --------
276
+ DESC_FETCH_TIMEOUT = 3 # seconds per URL
277
+ DESC_MIN_LEN = 100 # consider shorter text as "weak" and try to upgrade
278
+ DESC_CACHE_TTL = 24 * 3600 # 24h
279
+ MAX_DESC_FETCHES = 24 # cap number of fetches per request
280
+ DESC_WORKERS = 12 # parallel workers
281
+
282
+ # url -> {"text": str, "t": monotonic()}
283
+ DESC_CACHE: Dict[str, Dict[str, Any]] = {}
284
+
285
+
286
+ def _now_mono():
287
+ try:
288
+ return monotonic()
289
+ except Exception:
290
+ return time.time()
291
+
292
+
293
+ def _clean_text(s: str) -> str:
294
+ s = unescape(s or "")
295
+ s = re.sub(r"\s+", " ", s).strip()
296
+ return s
297
+
298
+
299
+ def _extract_desc_from_ld_json(html: str) -> Optional[str]:
300
+ if not html or not BeautifulSoup:
301
+ return None
302
+ try:
303
+ soup = BeautifulSoup(html, "html.parser")
304
+ for tag in soup.find_all("script", {"type": "application/ld+json"}):
305
+ try:
306
+ import json
307
+
308
+ data = json.loads(tag.string or "")
309
+ except Exception:
310
+ continue
311
+
312
+ def find_desc(obj):
313
+ if not isinstance(obj, (dict, list)):
314
+ return None
315
+ if isinstance(obj, list):
316
+ for it in obj:
317
+ v = find_desc(it)
318
+ if v:
319
+ return v
320
+ return None
321
+ # dict
322
+ for key in ("description", "abstract", "articleBody"):
323
+ val = obj.get(key)
324
+ if isinstance(val, str):
325
+ txt = _clean_text(val)
326
+ if len(txt) >= 40:
327
+ return txt
328
+ # nested
329
+ for k, v in obj.items():
330
+ if isinstance(v, (dict, list)):
331
+ got = find_desc(v)
332
+ if got:
333
+ return got
334
+ return None
335
+
336
+ d = find_desc(data)
337
+ if d and len(d) >= 40:
338
+ return d
339
+ except Exception:
340
+ pass
341
+ return None
342
+
343
+
344
+ CONSENT_HINTS = re.compile(r"(consent|gdpr|privacy choices|before you continue|we value your privacy)", re.I)
345
+
346
+
347
+ def _looks_like_consent_wall(html: str) -> bool:
348
+ if not html:
349
+ return False
350
+ if "consent.yahoo.com" in html.lower(): # common interstitial
351
+ return True
352
+ # generic phrasing
353
+ return bool(CONSENT_HINTS.search(html))
354
+
355
+
356
+ def _extract_desc_from_html(html: str) -> Optional[str]:
357
+ html = html or ""
358
+ if BeautifulSoup:
359
+ soup = BeautifulSoup(html, "html.parser")
360
+
361
+ # ✅ JSON-LD early
362
+ ld = _extract_desc_from_ld_json(html)
363
+ if ld:
364
+ txt = _clean_text(ld)
365
+ if 40 <= len(txt) <= 480:
366
+ return txt
367
+
368
+
369
+ for sel, attr in [
370
+ ('meta[property="og:description"]', "content"),
371
+ ('meta[name="twitter:description"]', "content"),
372
+ ('meta[name="description"]', "content"),
373
+ ]:
374
+ tag = soup.select_one(sel)
375
+ if tag:
376
+ txt = _clean_text(tag.get(attr, ""))
377
+ if len(txt) >= 40:
378
+ return txt
379
+ # Fallback: first meaningful <p>
380
+ for p in soup.find_all("p"):
381
+ txt = _clean_text(p.get_text(" "))
382
+ if len(txt) >= 80:
383
+ return txt
384
+ else:
385
+ # regex fallbacks (as you had)
386
+ for pat in [
387
+ r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\']([^"\']+)["\']',
388
+ r'<meta[^>]+name=["\']twitter:description["\'][^>]+content=["\']([^"\']+)["\']',
389
+ r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)["\']',
390
+ ]:
391
+ m = re.search(pat, html, flags=re.I | re.S)
392
+ if m:
393
+ txt = _clean_text(m.group(1))
394
+ if len(txt) >= 40:
395
+ return txt
396
+ m = re.search(r"<p[^>]*>(.*?)</p>", html, flags=re.I | re.S)
397
+ if m:
398
+ txt = _clean_text(re.sub("<[^>]+>", " ", m.group(1)))
399
+ if len(txt) >= 80:
400
+ return txt
401
+ # JSON-LD as last regex-free fallback not available w/o bs4
402
+ return None
403
+
404
+
405
+ def _desc_cache_get(url: str) -> Optional[str]:
406
+ if not url:
407
+ return None
408
+ entry = DESC_CACHE.get(url)
409
+ if not entry:
410
+ return None
411
+ if _now_mono() - entry["t"] > DESC_CACHE_TTL:
412
+ DESC_CACHE.pop(url, None)
413
+ return None
414
+ return entry["text"]
415
+
416
+
417
+ def _desc_cache_put(url: str, text: str):
418
+ if url and text:
419
+ with DESC_CACHE_LOCK:
420
+ DESC_CACHE[url] = {"text": text, "t": _now_mono()}
421
+
422
+
423
+ def _attempt_fetch(url: str, timeout: int) -> Optional[str]:
424
+ headers = {
425
+ "User-Agent": "Mozilla/5.0 (compatible; NewsGlobe/1.0; +mailto:[email protected])",
426
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
427
+ "Accept-Language": "en-US,en;q=0.9",
428
+ }
429
+ try:
430
+ r = _session_get(url, headers=headers, timeout=timeout, allow_redirects=True)
431
+ if r.status_code != 200:
432
+ return None
433
+ ct = (r.headers.get("Content-Type") or "").lower()
434
+ txt = r.text or ""
435
+ if "html" not in ct and "<html" not in txt.lower():
436
+ return None
437
+ if _looks_like_consent_wall(txt):
438
+ # jump straight to Jina if a consent wall is detected
439
+ jd = _try_jina_reader(url, timeout)
440
+ if jd:
441
+ return jd
442
+ return None
443
+ desc = _extract_desc_from_html(txt)
444
+ if desc and 40 <= len(desc) <= 480:
445
+ return desc
446
+ except Exception:
447
+ # fall through to Jina
448
+ pass
449
+
450
+ # Last-resort: Jina reader
451
+ jd = _try_jina_reader(url, timeout)
452
+ if jd and 40 <= len(jd) <= 480:
453
+ return jd
454
+ return None
455
+
456
+
457
+
458
+ def fetch_page_description(url: str) -> Optional[str]:
459
+ """Fetch and cache a best-effort article description from the page (incl. AMP retries)."""
460
+ if not url:
461
+ return None
462
+ cached = _desc_cache_get(url)
463
+ if cached:
464
+ return cached
465
+
466
+ # Try the original URL first
467
+ desc = _attempt_fetch(url, DESC_FETCH_TIMEOUT)
468
+ if not desc:
469
+ # Try common AMP variants
470
+ amp_candidates = []
471
+ try:
472
+ p = urlparse(url)
473
+ # /amp path
474
+ if not p.path.endswith("/amp"):
475
+ amp_candidates.append(urlunparse(p._replace(path=(p.path.rstrip("/") + "/amp"))))
476
+ # ?amp
477
+ q = p.query
478
+ amp_candidates.append(urlunparse(p._replace(query=(q + ("&" if q else "") + "amp=1"))))
479
+ # ?outputType=amp (CNN, some US sites)
480
+ amp_candidates.append(urlunparse(p._replace(query=(q + ("&" if q else "") + "outputType=amp"))))
481
+ except Exception:
482
+ pass
483
+ for amp_url in amp_candidates:
484
+ desc = _attempt_fetch(amp_url, DESC_FETCH_TIMEOUT)
485
+ if desc:
486
+ break
487
+
488
+ if desc:
489
+ _desc_cache_put(url, desc)
490
+ return desc
491
+ return None
492
+
493
+
494
+ def _needs_desc_upgrade(a: Dict[str, Any]) -> bool:
495
+ url = a.get("url") or ""
496
+ if not url:
497
+ return False
498
+ title = (a.get("title") or "").strip()
499
+ desc = (a.get("description") or "").strip()
500
+ if not desc or desc.lower().startswith("no description"):
501
+ return True
502
+ if len(desc) < DESC_MIN_LEN:
503
+ return True
504
+ # NEW: if desc ≈ title, trigger upgrade
505
+ if _too_similar(title, desc):
506
+ return True
507
+ return False
508
+
509
+
510
+ def prefetch_descriptions(raw_articles: List[Dict[str, Any]], speed: Speed = Speed.balanced):
511
+ candidates, seen = [], set()
512
+ max_fetches = 6 if speed == Speed.fast else 8 if speed == Speed.balanced else 16
513
+ timeout = 1 if speed == Speed.fast else 2
514
+ workers = 3 if speed == Speed.fast else 4 if speed == Speed.balanced else 8
515
+
516
+
517
+ for a in raw_articles:
518
+ url = a.get("url");
519
+ if not url or url in seen: continue
520
+ seen.add(url)
521
+ if _needs_desc_upgrade(a) and not _desc_cache_get(url):
522
+ candidates.append(url)
523
+ if len(candidates) >= max_fetches: break
524
+
525
+ if not candidates: return
526
+ with ThreadPoolExecutor(max_workers=workers) as ex:
527
+ futs = [ex.submit(fetch_page_description, u) for u in candidates]
528
+ for _ in as_completed(futs): pass
529
+
530
+
531
+ def prefetch_descriptions_async(raw_articles, speed: Speed = Speed.balanced):
532
+ threading.Thread(target=prefetch_descriptions, args=(raw_articles, speed), daemon=True).start()
533
+
534
+ # news_clf = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi", top_k=1)
535
+ DetectorFactory.seed = 0 # deterministic
536
+
537
+ SECTION_HINTS = {
538
+ "sports": "sports",
539
+ "sport": "sports",
540
+ "business": "business",
541
+ "money": "business",
542
+ "market": "business",
543
+ "tech": "technology",
544
+ "technology": "technology",
545
+ "sci": "science",
546
+ "science": "science",
547
+ "health": "health",
548
+ "wellness": "health",
549
+ "entertainment": "entertainment",
550
+ "culture": "entertainment",
551
+ "showbiz": "entertainment",
552
+ "crime": "crime",
553
+ "world": "general",
554
+ "weather": "weather",
555
+ "environment": "environment",
556
+ "climate": "environment",
557
+ "travel": "travel",
558
+ "politics": "politics",
559
+ "election": "politics",
560
+ }
561
+
562
+ KEYWORDS = {
563
+ "sports": r"\b(NBA|NFL|MLB|NHL|Olympic|goal|match|tournament|coach|transfer)\b",
564
+ "business": r"\b(stocks?|earnings|IPO|merger|acquisition|revenue|inflation|market)\b",
565
+ "technology": r"\b(AI|software|chip|semiconductor|app|startup|cyber|hack|quantum|robot)\b",
566
+ "science": r"\b(researchers?|study|physics|astronomy|genome|spacecraft|telescope)\b",
567
+ "health": r"\b(virus|vaccine|disease|hospital|doctor|public health|covid)\b",
568
+ "entertainment": r"\b(movie|film|box office|celebrity|series|show|album|music)\b",
569
+ "crime": r"\b(arrested|charged|police|homicide|fraud|theft|court|lawsuit)\b",
570
+ "weather": r"\b(hurricane|storm|flood|heatwave|blizzard|tornado|forecast)\b",
571
+ "environment": r"\b(climate|emissions|wildfire|deforestation|biodiversity)\b",
572
+ "travel": r"\b(flight|airline|airport|tourism|visa|cruise|hotel)\b",
573
+ "politics": r"\b(president|parliament|congress|minister|policy|campaign|election)\b",
574
+ }
575
+
576
+
577
+ def _infer_category_from_url_path(url_path: str) -> Optional[str]:
578
+ parts = [p for p in url_path.lower().split("/") if p]
579
+ for p in parts:
580
+ if p in SECTION_HINTS:
581
+ return SECTION_HINTS[p]
582
+ # also try hyphenated tokens like 'us-business' or 'tech-news'
583
+ for p in parts:
584
+ for tok in re.split(r"[-_]", p):
585
+ if tok in SECTION_HINTS:
586
+ return SECTION_HINTS[tok]
587
+ return None
588
+
589
+
590
+ def _infer_category_from_text(text: str) -> Optional[str]:
591
+ if not text:
592
+ return None
593
+ for cat, pat in KEYWORDS.items():
594
+ if re.search(pat, text, flags=re.I):
595
+ return cat
596
+ return None
597
+
598
+
599
+ def infer_category(article_url, title, description, provided):
600
+ if provided:
601
+ p = provided.strip().lower()
602
+ if p:
603
+ return p
604
+ # url rules
605
+ try:
606
+ p = urlparse(article_url).path or ""
607
+ cat = _infer_category_from_url_path(p)
608
+ if cat:
609
+ return cat
610
+ except Exception:
611
+ pass
612
+ # keyword rules
613
+ text = f"{title or ''} {description or ''}".strip()
614
+ cat = _infer_category_from_text(text)
615
+ if cat:
616
+ return cat
617
+ try:
618
+ preds = get_news_clf()(text[:512]) # lazy-loaded
619
+ if isinstance(preds[0], list):
620
+ label = preds[0][0]["label"]
621
+ else:
622
+ label = preds[0]["label"]
623
+ return label.lower()
624
+ except Exception as e:
625
+ log.warning(f"ML category failed: {e}")
626
+ return "general"
627
+
628
+
629
+
630
+ BOILER = re.compile(r"\b(live updates|breaking|what we know|in pictures|opinion)\b", re.I)
631
+
632
+
633
+ def _norm_text(s: str) -> str:
634
+ s = (s or "").strip()
635
+ s = re.sub(r"\s+", " ", s)
636
+ return s
637
+
638
+
639
+ def _cluster_text(a):
640
+ base = f"{a.get('orig_title') or a.get('title') or ''} {a.get('orig_description') or a.get('description') or ''}"
641
+ base = BOILER.sub("", base)
642
+ base = re.sub(r"\b(\d{1,2}:\d{2}\s?(AM|PM))|\b(\d{1,2}\s\w+\s\d{4})", "", base, flags=re.I)
643
+ return _norm_text(base)
644
+
645
+
646
+ def _canonical_url(u: str) -> str:
647
+ if not u:
648
+ return u
649
+ p = urlparse(u)
650
+ # drop tracking params
651
+ qs = [(k, v) for (k, v) in parse_qsl(p.query, keep_blank_values=False) if not k.lower().startswith(("utm_", "fbclid", "gclid"))]
652
+ clean = p._replace(query="&".join([f"{k}={v}" for k, v in qs]), fragment="")
653
+ # some sites add trailing slashes inconsistently
654
+ path = clean.path.rstrip("/") or "/"
655
+ clean = clean._replace(path=path)
656
+ return urlunparse(clean)
657
+
658
+
659
+ def detect_lang(text: str) -> Optional[str]:
660
+ try:
661
+ return detect(text) # returns 'en','fr','de',...
662
+ except Exception:
663
+ return None
664
+
665
+
666
+ def _embed_texts(texts: List[str]):
667
+ embs = get_sbert().encode(texts, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
668
+ return embs
669
+
670
+
671
+ # ---- cache helpers ----
672
+ CACHE_TTL_SECS = 900
673
+ SIM_THRESHOLD = 0.6
674
+ _events_cache: Dict[Tuple, Dict[str, Any]] = {}
675
+
676
+
677
+ def cache_key_for(q, category, language, limit_each, translate=False, target_lang=None, speed=Speed.balanced):
678
+ return (q or "", category or "", language or "", int(limit_each or 50),
679
+ bool(translate), (target_lang or "").lower(), speed.value)
680
+
681
+
682
+ _first_real_build = True # module-global
683
+
684
+ def get_or_build_events_cache(q, category, language, translate, target_lang, limit_each, speed=Speed.balanced):
685
+ global _first_real_build
686
+ key = cache_key_for(q, category, language, limit_each, translate, target_lang, speed)
687
+ now = monotonic()
688
+
689
+ if speed == Speed.fast:
690
+ use_timespan, use_limit = "24h", min(limit_each, 20)
691
+ elif speed == Speed.balanced:
692
+ use_timespan, use_limit = "48h", min(limit_each, 150)
693
+ else: # max
694
+ use_timespan, use_limit = "3d", limit_each
695
+
696
+ entry = _events_cache.get(key)
697
+ if entry and now - entry["t"] < CACHE_TTL_SECS:
698
+ log.info(f"CACHE HIT for {key}")
699
+ return key, entry["enriched"], entry["clusters"]
700
+
701
+ lock = _get_inflight_lock(key)
702
+ with lock:
703
+ entry = _events_cache.get(key)
704
+ if entry and now - entry["t"] < CACHE_TTL_SECS:
705
+ log.info(f"CACHE HIT (post-lock) for {key}")
706
+ return key, entry["enriched"], entry["clusters"]
707
+
708
+ if _first_real_build:
709
+ use_timespan = "24h" if use_timespan != "24h" else use_timespan
710
+ use_limit = min(use_limit, 100)
711
+
712
+ log.info(f"CACHE MISS for {key} β€” fetching (timespan={use_timespan}, limit_each={use_limit})")
713
+
714
+ raw = combine_raw_articles(
715
+ category=category, # providers may use it; inference ignores it
716
+ query=q,
717
+ language=language,
718
+ limit_each=use_limit,
719
+ timespan=use_timespan,
720
+ speed=speed,
721
+ )
722
+ prefetch_descriptions_async(raw, speed)
723
+
724
+ enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
725
+
726
+ if category:
727
+ cat_norm = (category or "").strip().lower()
728
+ enriched = [e for e in enriched_all if (e.get("category") or "").lower() == cat_norm]
729
+ else:
730
+ enriched = enriched_all
731
+
732
+ clusters = cluster_articles(enriched, sim_threshold=SIM_THRESHOLD, speed=speed)
733
+
734
+ _events_cache[key] = {"t": monotonic(), "enriched": enriched, "clusters": clusters}
735
+ _first_real_build = False
736
+ return key, enriched, clusters
737
+
738
+
739
+ # Which languages to rotate when user didn't restrict language
740
+ LANG_ROTATION = ["en", "es", "fr", "de", "ar", "ru", "pt", "zh", "hi", "ja", "ko"]
741
+
742
+ # A few sourcecountry seeds for English to diversify outlets (optional)
743
+ COUNTRY_SEEDS = ["US", "GB", "IN", "CA", "AU", "ZA", "SG", "NG", "DE", "FR", "BR", "MX", "ES", "RU", "JP", "KR", "CN"]
744
+
745
+
746
+ # ----------------- Config / Keys -----------------
747
+ USE_GNEWS_API = False
748
+ USE_NEWSDATA_API = False
749
+ USE_GDELT_API = True
750
+ USE_NEWSAPI = False
751
+
752
+ NEWSAPI_KEY = os.getenv("NEWSAPI_KEY", "ea734c66dc4044fa8e4501ad7b90e753")
753
+ GNEWS_API_KEY = os.getenv("GNEWS_API_KEY", "5419897c95e8a4b21074e0d3fe95a3dd")
754
+ NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY", "pub_1feb49a71a844719af68d0844fb43a61")
755
+ HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
756
+
757
+ logging.basicConfig(
758
+ level=logging.WARNING,
759
+ format="%(levelname)s:%(name)s:%(message)s",
760
+ )
761
+
762
+ log = logging.getLogger("newsglobe")
763
+ log.setLevel(logging.WARNING)
764
+
765
+ fetch_log = logging.getLogger("newsglobe.fetch_summary")
766
+ fetch_log.setLevel(logging.INFO)
767
+ _fetch_handler = logging.StreamHandler()
768
+ _fetch_handler.setLevel(logging.INFO)
769
+ _fetch_handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
770
+ fetch_log.addHandler(_fetch_handler)
771
+ fetch_log.propagate = False # don't pass to root (which is WARNING)
772
+
773
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
774
+ logging.getLogger("urllib3.connectionpool").setLevel(logging.WARNING)
775
+ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARNING)
776
+ logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
777
+ logging.getLogger("transformers").setLevel(logging.WARNING)
778
+
779
+ for name in ("urllib3", "urllib3.connectionpool", "requests.packages.urllib3"):
780
+ lg = logging.getLogger(name)
781
+ lg.setLevel(logging.ERROR)
782
+ lg.propagate = False
783
+
784
+
785
+ def _newsapi_enabled() -> bool:
786
+ if not NEWSAPI_KEY:
787
+ log.warning("NewsAPI disabled: missing NEWSAPI_KEY env var")
788
+ return False
789
+ return True
790
+
791
+
792
+ def cluster_id(cluster, enriched_articles):
793
+ urls = sorted([(enriched_articles[i].get("url") or "") for i in cluster["indices"] if enriched_articles[i].get("url")])
794
+ base = "|".join(urls) if urls else "empty"
795
+ return hashlib.md5(base.encode("utf-8")).hexdigest()[:10]
796
+
797
+
798
+ # ----------------- NLTK / VADER -----------------
799
+ try:
800
+ nltk.data.find("sentiment/vader_lexicon")
801
+ except LookupError:
802
+ nltk.download("vader_lexicon") # one-time fetch in a fresh container
803
+
804
+ try:
805
+ _vader = SentimentIntensityAnalyzer()
806
+ except Exception:
807
+ _vader = None
808
+
809
+ def classify_sentiment(text: str) -> str:
810
+ if not text:
811
+ return "neutral"
812
+ if _vader is None:
813
+ return "neutral"
814
+ scores = _vader.polarity_scores(text)
815
+ c = scores["compound"]
816
+ return "positive" if c >= 0.2 else "negative" if c <= -0.2 else "neutral"
817
+
818
+
819
+ # ----------------- Geocode helpers -----------------
820
+ def get_country_centroid(country_name):
821
+ if not country_name or country_name == "Unknown":
822
+ return {"lat": 0, "lon": 0, "country": "Unknown"}
823
+ try:
824
+ country = CountryInfo(country_name)
825
+ latlng = country.capital_latlng()
826
+ return {"lat": latlng[0], "lon": latlng[1], "country": country_name}
827
+ except Exception as e:
828
+ log.info(f"Could not get centroid for {country_name}: {e}")
829
+ return {"lat": 0, "lon": 0, "country": country_name or "Unknown"}
830
+
831
+
832
+ def resolve_domain_to_ip(domain):
833
+ if not domain:
834
+ return None
835
+ try:
836
+ return socket.gethostbyname(domain)
837
+ except socket.gaierror:
838
+ return None
839
+
840
+
841
+ def geolocate_ip(ip):
842
+ try:
843
+ r = _session_get(f"https://ipwho.is/{ip}?fields=success,country,latitude,longitude", timeout=8)
844
+ j = r.json()
845
+ if j.get("success"):
846
+ return {"lat": j["latitude"], "lon": j["longitude"], "country": j["country"]}
847
+ except Exception:
848
+ pass
849
+ return None
850
+
851
+
852
+ geolocator = Nominatim(user_agent="newsglobe-app (contact: [email protected])")
853
+ domain_geo_cache: Dict[str, Dict[str, Any]] = {}
854
+
855
+ MAJOR_OUTLETS = {
856
+ "bbc.co.uk": "United Kingdom",
857
+ "theguardian.com": "United Kingdom",
858
+ "reuters.com": "United States",
859
+ "aljazeera.com": "Qatar",
860
+ "lemonde.fr": "France",
861
+ "dw.com": "Germany",
862
+ "abc.net.au": "Australia",
863
+ "ndtv.com": "India",
864
+ "globo.com": "Brazil",
865
+ "elpais.com": "Spain",
866
+ "lefigaro.fr": "France",
867
+ "kyodonews.net": "Japan",
868
+ "straitstimes.com": "Singapore",
869
+ "thesun.my": "Malaysia", # <-- add this
870
+ }
871
+
872
+
873
+ def geocode_source(source_text: str, domain: str = "", do_network: bool = False):
874
+ cache_key = f"{source_text}|{domain}"
875
+ if cache_key in domain_geo_cache:
876
+ return domain_geo_cache[cache_key]
877
+
878
+ ext = tldextract.extract(domain or "")
879
+ fqdn = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
880
+
881
+ # 0) Major outlets / domain map
882
+ if fqdn in MAJOR_OUTLETS:
883
+ coords = get_country_centroid(MAJOR_OUTLETS[fqdn]); domain_geo_cache[cache_key] = coords; return coords
884
+ if ext.domain in domain_country_map:
885
+ coords = get_country_centroid(domain_country_map[ext.domain]); domain_geo_cache[cache_key] = coords; return coords
886
+
887
+ # 1) Suffix fallback (instant)
888
+ coords = get_country_centroid(_suffix_country(ext.suffix))
889
+ domain_geo_cache[cache_key] = coords
890
+
891
+ # 2) Optional async refinement (never block hot path)
892
+ if do_network:
893
+ threading.Thread(target=_refine_geo_async, args=(cache_key, source_text, fqdn), daemon=True).start()
894
+
895
+ return coords
896
+
897
+ def _suffix_country(suffix: Optional[str]) -> str:
898
+ s = (suffix or "").split(".")[-1]
899
+ m = {
900
+ "au":"Australia","uk":"United Kingdom","gb":"United Kingdom","ca":"Canada","in":"India","us":"United States",
901
+ "ng":"Nigeria","de":"Germany","fr":"France","jp":"Japan","sg":"Singapore","za":"South Africa","nz":"New Zealand",
902
+ "ie":"Ireland","it":"Italy","es":"Spain","se":"Sweden","ch":"Switzerland","nl":"Netherlands","br":"Brazil",
903
+ "my":"Malaysia","id":"Indonesia","ph":"Philippines","th":"Thailand","vn":"Vietnam","sa":"Saudi Arabia",
904
+ "ae":"United Arab Emirates","tr":"Turkey","mx":"Mexico","ar":"Argentina","cl":"Chile","co":"Colombia",
905
+ "il":"Israel","kr":"South Korea","cn":"China","tw":"Taiwan","hk":"Hong Kong"
906
+ }
907
+ return m.get(s, "United States" if s in ("com","org","net") else "Unknown")
908
+
909
+
910
+
911
+ def _refine_geo_async(cache_key, source_text, fqdn):
912
+ try:
913
+ # Try IP geo (cheap)
914
+ ip = resolve_domain_to_ip(fqdn) if fqdn else None
915
+ if ip:
916
+ coords = geolocate_ip(ip)
917
+ if coords:
918
+ domain_geo_cache[cache_key] = coords
919
+ return
920
+ # Try Nominatim FAST (lower timeout)
921
+ location = geolocator.geocode(f"{source_text} News Headquarters", timeout=2)
922
+ if location and hasattr(location, "raw"):
923
+ coords = {
924
+ "lat": location.latitude,
925
+ "lon": location.longitude,
926
+ "country": location.raw.get("address", {}).get("country", "Unknown"),
927
+ }
928
+ domain_geo_cache[cache_key] = coords
929
+ except Exception:
930
+ pass
931
+
932
+ # ----------------- HuggingFace translate (optional) -----------------
933
+ HF_MODEL_PRIMARY = None # disable NLLB remote (avoids 404 spam); use OPUS + pivot/LibreTranslate
934
+
935
+ # 2-letter ISO -> NLLB codes
936
+ NLLB_CODES = {
937
+ "en": "eng_Latn",
938
+ "es": "spa_Latn",
939
+ "fr": "fra_Latn",
940
+ "de": "deu_Latn",
941
+ "it": "ita_Latn",
942
+ "pt": "por_Latn",
943
+ "zh": "zho_Hans",
944
+ "ru": "rus_Cyrl",
945
+ "ar": "arb_Arab",
946
+ "hi": "hin_Deva",
947
+ "ja": "jpn_Jpan",
948
+ "ko": "kor_Hang",
949
+ }
950
+
951
+
952
+ # OPUS-MT model map for common pairs (expand as needed)
953
+ def opus_model_for(src2: str, tgt2: str) -> Optional[str]:
954
+ pairs = {
955
+ ("es", "en"): "Helsinki-NLP/opus-mt-es-en",
956
+ ("en", "es"): "Helsinki-NLP/opus-mt-en-es",
957
+ ("fr", "en"): "Helsinki-NLP/opus-mt-fr-en",
958
+ ("en", "fr"): "Helsinki-NLP/opus-mt-en-fr",
959
+ ("de", "en"): "Helsinki-NLP/opus-mt-de-en",
960
+ ("en", "de"): "Helsinki-NLP/opus-mt-en-de",
961
+ ("pt", "en"): "Helsinki-NLP/opus-mt-pt-en",
962
+ ("en", "pt"): "Helsinki-NLP/opus-mt-en-pt",
963
+ ("it", "en"): "Helsinki-NLP/opus-mt-it-en",
964
+ ("en", "it"): "Helsinki-NLP/opus-mt-en-it",
965
+ ("ru", "en"): "Helsinki-NLP/opus-mt-ru-en",
966
+ ("en", "ru"): "Helsinki-NLP/opus-mt-en-ru",
967
+ ("zh", "en"): "Helsinki-NLP/opus-mt-zh-en",
968
+ ("en", "zh"): "Helsinki-NLP/opus-mt-en-zh",
969
+ ("ja", "en"): "Helsinki-NLP/opus-mt-ja-en",
970
+ ("en", "ja"): "Helsinki-NLP/opus-mt-en-ja",
971
+ ("ko", "en"): "Helsinki-NLP/opus-mt-ko-en",
972
+ ("en", "ko"): "Helsinki-NLP/opus-mt-en-ko",
973
+ ("hi", "en"): "Helsinki-NLP/opus-mt-hi-en",
974
+ ("en", "hi"): "Helsinki-NLP/opus-mt-en-hi",
975
+ ("ar", "en"): "Helsinki-NLP/opus-mt-ar-en",
976
+ ("en", "ar"): "Helsinki-NLP/opus-mt-en-ar",
977
+ }
978
+ return pairs.get((src2, tgt2))
979
+
980
+
981
+ SUPPORTED = {"en", "fr", "de", "es", "it", "hi", "ar", "ru", "ja", "ko", "pt", "zh"}
982
+
983
+ LIBRETRANSLATE_URL = os.getenv("LIBRETRANSLATE_URL") # e.g., http://127.0.0.1:5000
984
+
985
+ def _translate_via_libre(text: str, src: str, tgt: str) -> Optional[str]:
986
+ url = LIBRETRANSLATE_URL
987
+ if not url or not text or src == tgt:
988
+ return None
989
+ try:
990
+ r = SESSION.post(
991
+ f"{url.rstrip('/')}/translate",
992
+ json={"q": text, "source": src, "target": tgt, "format": "text"},
993
+ timeout=6
994
+ )
995
+ if r.status_code == 200:
996
+ j = r.json()
997
+ out = j.get("translatedText")
998
+ return out if isinstance(out, str) and out else None
999
+ else:
1000
+ log.warning("LibreTranslate HTTP %s: %s", r.status_code, r.text[:200])
1001
+ except Exception as e:
1002
+ log.warning("LibreTranslate failed: %s", e)
1003
+ return None
1004
+
1005
+
1006
+ def _hf_call(model_id: str, payload: dict) -> Optional[str]:
1007
+ # require both a token and explicit opt-in
1008
+ if not (HUGGINGFACE_API_TOKEN and ALLOW_HF_REMOTE):
1009
+ return None
1010
+ if model_id in _hf_bad_models:
1011
+ return None
1012
+
1013
+ url = f"https://api-inference.huggingface.co/models/{model_id}"
1014
+ headers = {
1015
+ "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}",
1016
+ "HF-API-KEY": HUGGINGFACE_API_TOKEN,
1017
+ "Accept": "application/json",
1018
+ "Content-Type": "application/json",
1019
+ }
1020
+ try:
1021
+ r = requests.post(url, headers=headers, json=payload, timeout=25)
1022
+ if r.status_code != 200:
1023
+ if r.status_code == 404:
1024
+ _hf_bad_models.add(model_id)
1025
+ log.warning("HF %s -> 404: Not Found (disabled for this process)", model_id)
1026
+ else:
1027
+ log.warning("HF %s -> %s: %s", model_id, r.status_code, r.text[:300])
1028
+ return None
1029
+ j = r.json()
1030
+ except Exception as e:
1031
+ log.warning("HF request failed: %s", e)
1032
+ return None
1033
+
1034
+ if isinstance(j, list) and j and isinstance(j[0], dict):
1035
+ if "generated_text" in j[0]:
1036
+ return j[0]["generated_text"]
1037
+ if "translation_text" in j[0]:
1038
+ return j[0]["translation_text"]
1039
+ if isinstance(j, dict) and "generated_text" in j:
1040
+ return j["generated_text"]
1041
+ if isinstance(j, str):
1042
+ return j
1043
+ return None
1044
+
1045
+ @lru_cache(maxsize=4096)
1046
+ def _translate_cached(text: str, src: str, tgt: str) -> str:
1047
+ if not text or src == tgt:
1048
+ return text
1049
+
1050
+ # 0) Local LibreTranslate (fast & free, if running)
1051
+ out = _translate_via_libre(text, src, tgt)
1052
+ if out:
1053
+ return out
1054
+
1055
+ # 1) OPUS serverless (direct pair) – try this first
1056
+ opus_model = opus_model_for(src, tgt)
1057
+ if opus_model:
1058
+ out = _hf_call(opus_model, {"inputs": text})
1059
+ if out:
1060
+ return out
1061
+
1062
+ # 2) NLLB serverless (optional; disabled if HF_MODEL_PRIMARY is None)
1063
+ try:
1064
+ if HF_MODEL_PRIMARY and (src in NLLB_CODES) and (tgt in NLLB_CODES):
1065
+ out = _hf_call(
1066
+ HF_MODEL_PRIMARY,
1067
+ {
1068
+ "inputs": text,
1069
+ "parameters": {"src_lang": NLLB_CODES[src], "tgt_lang": NLLB_CODES[tgt]},
1070
+ "options": {"wait_for_model": True},
1071
+ },
1072
+ )
1073
+ if out:
1074
+ return out
1075
+ except Exception:
1076
+ pass
1077
+
1078
+ # 3) Two-hop pivot via English for non-English↔non-English
1079
+ if src != "en" and tgt != "en":
1080
+ step_en = _translate_cached(text, src, "en")
1081
+ if step_en and step_en != text:
1082
+ out = _translate_cached(step_en, "en", tgt)
1083
+ if out:
1084
+ return out
1085
+
1086
+ # 4) Local OPUS fallback (direct pair with local pipeline)
1087
+ out = _translate_local(text, src, tgt)
1088
+ if out:
1089
+ return out
1090
+
1091
+ log.warning("All translate paths failed (%s->%s); returning original.", src, tgt)
1092
+ return text
1093
+
1094
+
1095
+
1096
+
1097
+ def translate_text(text: str, target_lang: Optional[str], fallback_src: Optional[str] = None) -> str:
1098
+ if not text or not target_lang:
1099
+ return text
1100
+ tgt = target_lang.lower()
1101
+ if tgt not in SUPPORTED:
1102
+ return text
1103
+ src = (fallback_src or detect_lang(text) or "en").lower()
1104
+ if src == tgt:
1105
+ return text
1106
+ if src not in SUPPORTED:
1107
+ if src.startswith("zh"):
1108
+ src = "zh"
1109
+ elif src.startswith("pt"):
1110
+ src = "pt"
1111
+ elif src[:2] in SUPPORTED:
1112
+ src = src[:2]
1113
+ else:
1114
+ src = "en"
1115
+ return _translate_cached(text, src, tgt)
1116
+
1117
+
1118
+ # ----------------- FastAPI -----------------
1119
+ app = FastAPI()
1120
+ app.add_middleware(
1121
+ CORSMiddleware,
1122
+ allow_origins=["*"],
1123
+ allow_credentials=False,
1124
+ allow_methods=["*"],
1125
+ allow_headers=["*"],
1126
+ )
1127
+
1128
+ app.add_middleware(GZipMiddleware, minimum_size=500)
1129
+
1130
+ # === Warm config ===
1131
+ WARM_LIMIT_EACH = 20 # smaller bite to prime caches
1132
+ WARM_TIMESPAN = "24h" # narrower GDELT window for faster first fetch
1133
+ WARM_PREFETCH_DESCRIPTIONS = False
1134
+
1135
+ def _fmt_mmss(ms: float) -> str:
1136
+ total_sec = int(round(ms / 1000.0))
1137
+ m, s = divmod(total_sec, 60)
1138
+ return f"{m}:{s:02d}"
1139
+
1140
+ def _warm_once():
1141
+ try:
1142
+ log.info("WARM: starting background warm-up (limit_each=%d, timespan=%s)", WARM_LIMIT_EACH, WARM_TIMESPAN)
1143
+ t0 = time.perf_counter()
1144
+
1145
+ # models (you already call these in startup, but keep them here too)
1146
+ get_sbert()
1147
+ get_news_clf()
1148
+
1149
+ # fetch a small set with shorter timespan
1150
+ t1 = time.perf_counter()
1151
+ raw = combine_raw_articles(
1152
+ category=None, query=None, language="en",
1153
+ limit_each=WARM_LIMIT_EACH, timespan=WARM_TIMESPAN,
1154
+ log_summary=False # ← silence warm-up summary
1155
+ )
1156
+ t_fetch = (time.perf_counter() - t1) * 1000
1157
+
1158
+ # optional: skip description prefetch during warm to save time
1159
+ if WARM_PREFETCH_DESCRIPTIONS:
1160
+ prefetch_descriptions_async(raw)
1161
+
1162
+ # enrich + cluster once (no translation on warm)
1163
+ t2 = time.perf_counter()
1164
+ enriched = [enrich_article(a, language="en", translate=False, target_lang=None) for a in raw]
1165
+ t_enrich = (time.perf_counter() - t2) * 1000
1166
+
1167
+ t3 = time.perf_counter()
1168
+ clusters = cluster_articles(enriched, sim_threshold=SIM_THRESHOLD)
1169
+ t_cluster = (time.perf_counter() - t3) * 1000
1170
+
1171
+ # stash in cache under the common default key so /news and /events hit warm data
1172
+ key = cache_key_for(q=None, category=None, language="en",
1173
+ limit_each=WARM_LIMIT_EACH, translate=False, target_lang=None,
1174
+ speed=Speed.balanced) # add speed
1175
+
1176
+ _events_cache[key] = {"t": monotonic(), "enriched": enriched, "clusters": clusters}
1177
+
1178
+ t_total = (time.perf_counter() - t0) * 1000
1179
+ log.info(
1180
+ "WARM: fetch=%s, enrich=%s, cluster=%s, total=%s (raw=%d, enriched=%d, clusters=%d)",
1181
+ _fmt_mmss(t_fetch), _fmt_mmss(t_enrich), _fmt_mmss(t_cluster), _fmt_mmss(t_total),
1182
+ len(raw), len(enriched), len(clusters),
1183
+ )
1184
+ except Exception as e:
1185
+ log.warning(f"WARM: failed: {e}")
1186
+
1187
+ @app.on_event("startup")
1188
+ def warm():
1189
+ # keep your existing model warms
1190
+ get_sbert()
1191
+ get_news_clf()
1192
+ # fire-and-forget warm in a background thread so startup stays snappy
1193
+ threading.Thread(target=_warm_once, daemon=True).start()
1194
+
1195
+ # ----------------- Providers -----------------
1196
+ # ISO -> GDELT 'sourcelang:' names (keep yours)
1197
+ _GDELT_LANG = {
1198
+ "en": "english",
1199
+ "es": "spanish",
1200
+ "fr": "french",
1201
+ "de": "german",
1202
+ "it": "italian",
1203
+ "pt": "portuguese",
1204
+ "ru": "russian",
1205
+ "ar": "arabic",
1206
+ "hi": "hindi",
1207
+ "ja": "japanese",
1208
+ "ko": "korean",
1209
+ "zh": "chinese",
1210
+ }
1211
+
1212
+
1213
+ def _gdelt_safe_query(user_q, language):
1214
+ parts = []
1215
+ if user_q:
1216
+ q = user_q.strip()
1217
+ if len(q) < 3:
1218
+ q = f'"{q}" news'
1219
+ parts.append(q)
1220
+ if language and (lg := _GDELT_LANG.get(language.lower())):
1221
+ parts.append(f"sourcelang:{lg}")
1222
+ if not parts:
1223
+ # rotate or randomly choose one to diversify
1224
+ parts.append("sourcelang:english")
1225
+ return " ".join(parts)
1226
+
1227
+
1228
+ def fetch_gdelt_articles(
1229
+ limit=50,
1230
+ query=None,
1231
+ language=None,
1232
+ timespan="3d",
1233
+ category=None,
1234
+ extra_tokens: Optional[List[str]] = None
1235
+ ):
1236
+ q = _gdelt_safe_query(query, language)
1237
+ if extra_tokens:
1238
+ q = f"{q} " + " ".join(extra_tokens)
1239
+ url = "https://api.gdeltproject.org/api/v2/doc/doc"
1240
+ params = {
1241
+ "query": q,
1242
+ "mode": "ArtList",
1243
+ "format": "json",
1244
+ "sort": "DateDesc",
1245
+ "maxrecords": int(min(250, max(1, limit))),
1246
+ "timespan": timespan,
1247
+ }
1248
+ headers = {
1249
+ "User-Agent": "Mozilla/5.0 (compatible; NewsGlobe/1.0; +mailto:[email protected])",
1250
+ "Accept": "application/json",
1251
+ }
1252
+
1253
+ def _do_request(p):
1254
+ r = _session_get(url, params=p, timeout=10)
1255
+ log.info(f"GDELT URL: {r.url} (status={r.status_code})")
1256
+ if r.status_code != 200:
1257
+ log.warning(f"GDELT HTTP {r.status_code}: {r.text[:400]}")
1258
+ return None
1259
+ try:
1260
+ return r.json()
1261
+ except Exception:
1262
+ ct = r.headers.get("Content-Type", "")
1263
+ log.warning(f"GDELT non-JSON response. CT={ct}. Body[:400]: {r.text[:400]}")
1264
+ return None
1265
+
1266
+ data = _do_request(params)
1267
+ if data is None:
1268
+ # Retry narrower and smaller if needed
1269
+ p2 = {**params, "timespan": "24h", "maxrecords": min(100, params["maxrecords"])}
1270
+ data = _do_request(p2)
1271
+ if not data:
1272
+ return []
1273
+
1274
+ arts = data.get("articles") or []
1275
+ results = []
1276
+ for a in arts:
1277
+ desc = a.get("description") or a.get("content") or ""
1278
+ title = a.get("title") or ""
1279
+ if desc and (
1280
+ desc.strip().lower() == title.strip().lower() or
1281
+ (len(desc) <= 60 and _too_similar(title, desc))
1282
+ ):
1283
+ desc = ""
1284
+ desc = desc or "No description available"
1285
+ results.append(
1286
+ {
1287
+ "title": title,
1288
+ "url": a.get("url"),
1289
+ "source": {"name": a.get("domain") or "GDELT"},
1290
+ "description": desc,
1291
+ "publishedAt": a.get("seendate"),
1292
+ "api_source": "gdelt",
1293
+ "gdelt_sourcecountry": a.get("sourcecountry"),
1294
+ # Keep the user's chosen category only for debugging/reference; do NOT use for inference.
1295
+ "requested_category": category,
1296
+ }
1297
+ )
1298
+ log.info(f"GDELT returned {len(results)}")
1299
+ return results
1300
+
1301
+
1302
+
1303
+ def fetch_newsdata_articles(category=None, limit=20, query=None, language=None):
1304
+ base_url = "https://newsdata.io/api/1/news"
1305
+ allowed = [
1306
+ "business",
1307
+ "entertainment",
1308
+ "environment",
1309
+ "food",
1310
+ "health",
1311
+ "politics",
1312
+ "science",
1313
+ "sports",
1314
+ "technology",
1315
+ "top",
1316
+ "world",
1317
+ ]
1318
+ params = {"apikey": NEWSDATA_API_KEY, "language": (language or "en")}
1319
+ if category and category in allowed:
1320
+ params["category"] = category
1321
+ if query:
1322
+ params["q"] = query
1323
+
1324
+ all_articles, next_page = [], None
1325
+ while len(all_articles) < limit:
1326
+ if next_page:
1327
+ params["page"] = next_page
1328
+ resp = _session_get(base_url, params=params, timeout=12)
1329
+ if resp.status_code != 200:
1330
+ break
1331
+ data = resp.json()
1332
+ articles = data.get("results", [])
1333
+ for a in articles:
1334
+ a["api_source"] = "newsdata"
1335
+ all_articles.extend(articles)
1336
+ next_page = data.get("nextPage")
1337
+ if not next_page:
1338
+ break
1339
+
1340
+ # normalize timestamps if available
1341
+ for a in all_articles:
1342
+ a["publishedAt"] = a.get("pubDate")
1343
+ return all_articles[:limit]
1344
+
1345
+
1346
+ def fetch_gnews_articles(limit=20, query=None, language=None):
1347
+ url = f"https://gnews.io/api/v4/top-headlines?lang={(language or 'en')}&max={limit}&token={GNEWS_API_KEY}"
1348
+ if query:
1349
+ url += f"&q={requests.utils.quote(query)}"
1350
+ try:
1351
+ r = _session_get(url, timeout=12)
1352
+ if r.status_code != 200:
1353
+ return []
1354
+ arts = r.json().get("articles", [])
1355
+ for a in arts:
1356
+ a["api_source"] = "gnews"
1357
+ return arts
1358
+ except Exception:
1359
+ return []
1360
+
1361
+
1362
+ NEWSAPI_COUNTRIES = ["us", "gb", "ca", "au", "in", "za", "sg", "ie", "nz"]
1363
+
1364
+
1365
+ def fetch_newsapi_headlines_multi(limit=50, language=None):
1366
+ if not _newsapi_enabled():
1367
+ return []
1368
+ all_ = []
1369
+ per = max(1, math.ceil(limit / max(1, len(NEWSAPI_COUNTRIES))))
1370
+ per = min(per, 100) # NewsAPI pageSize cap
1371
+ for c in NEWSAPI_COUNTRIES:
1372
+ url = f"https://newsapi.org/v2/top-headlines?country={c}&pageSize={per}&apiKey={NEWSAPI_KEY}"
1373
+ r = _session_get(url, timeout=12)
1374
+ if r.status_code != 200:
1375
+ log.warning(f"NewsAPI top-headlines {c} -> HTTP {r.status_code}: {r.text[:200]}")
1376
+ continue
1377
+ arts = r.json().get("articles", [])
1378
+ for a in arts:
1379
+ a["api_source"] = "newsapi"
1380
+ all_.extend(arts)
1381
+ time.sleep(0.2)
1382
+ return all_[:limit] # ✅ enforce exact limit
1383
+
1384
+
1385
+ def fetch_newsapi_articles(category=None, limit=20, query=None, language=None):
1386
+ if not _newsapi_enabled():
1387
+ return []
1388
+
1389
+ # If a query is provided, use /everything (language allowed here)
1390
+ if query:
1391
+ url = f"https://newsapi.org/v2/everything?pageSize={limit}&apiKey={NEWSAPI_KEY}&q={requests.utils.quote(query)}"
1392
+ if language:
1393
+ url += f"&language={language}"
1394
+ try:
1395
+ r = _session_get(url, timeout=12)
1396
+ if r.status_code != 200:
1397
+ log.warning(f"NewsAPI /everything HTTP {r.status_code}: {r.text[:200]}")
1398
+ return []
1399
+ arts = r.json().get("articles", [])
1400
+ for a in arts:
1401
+ a["api_source"] = "newsapi"
1402
+ # DO NOT stamp category here; we infer later
1403
+ return arts[:limit]
1404
+ except Exception as e:
1405
+ log.warning(f"NewsAPI /everything request failed: {e}")
1406
+ return []
1407
+
1408
+ # Otherwise, rotate /top-headlines by country (no language param)
1409
+ results = []
1410
+ per_country = max(5, limit // len(NEWSAPI_COUNTRIES))
1411
+ for c in NEWSAPI_COUNTRIES:
1412
+ url = f"https://newsapi.org/v2/top-headlines?country={c}&pageSize={per_country}&apiKey={NEWSAPI_KEY}"
1413
+ if category:
1414
+ url += f"&category={category}"
1415
+ try:
1416
+ r = _session_get(url, timeout=12)
1417
+ if r.status_code != 200:
1418
+ log.warning(f"NewsAPI top-headlines {c} -> HTTP {r.status_code}: {r.text[:200]}")
1419
+ continue
1420
+ arts = r.json().get("articles", [])
1421
+ for a in arts:
1422
+ a["api_source"] = "newsapi"
1423
+ # DO NOT stamp category here; we infer later
1424
+ results.extend(arts)
1425
+ except Exception as e:
1426
+ log.warning(f"NewsAPI top-headlines {c} failed: {e}")
1427
+ time.sleep(0.2)
1428
+ return results[:limit]
1429
+
1430
+
1431
+
1432
+ def normalize_newsdata_article(article):
1433
+ return {
1434
+ "title": article.get("title"),
1435
+ "url": article.get("link"),
1436
+ "source": {"name": article.get("source_id", "NewsData.io")},
1437
+ "description": article.get("description") or "No description available",
1438
+ "publishedAt": article.get("publishedAt"),
1439
+ "api_source": article.get("api_source", "newsdata"),
1440
+ "category": ((article.get("category") or [None])[0] if isinstance(article.get("category"), list) else article.get("category")),
1441
+ }
1442
+
1443
+
1444
+ # ----------------- Enrichment -----------------
1445
+ def enrich_article(a, language=None, translate=False, target_lang=None):
1446
+ # Normalize source name
1447
+ source_name = (a.get("source", {}) or {}).get("name", "").strip() or "Unknown"
1448
+ s_lower = source_name.lower()
1449
+ if "newsapi" in s_lower:
1450
+ source_name = "NewsAPI"
1451
+ elif "gnews" in s_lower:
1452
+ source_name = "GNews"
1453
+ elif "newsdata" in s_lower:
1454
+ source_name = "NewsData.io"
1455
+
1456
+ # Canonicalize URL & derive domain
1457
+ article_url = _canonical_url(a.get("url") or "")
1458
+ try:
1459
+ ext = tldextract.extract(article_url)
1460
+ domain = ".".join([p for p in (ext.domain, ext.suffix) if p]) if (ext.domain or ext.suffix) else ""
1461
+ except Exception:
1462
+ domain = ""
1463
+
1464
+ # Country guess (GDELT provides ISO2)
1465
+ country_guess = None
1466
+ if a.get("api_source") == "gdelt":
1467
+ sc = a.get("gdelt_sourcecountry")
1468
+ if sc:
1469
+ iso2map = {
1470
+ "US": "United States", "GB": "United Kingdom", "AU": "Australia", "CA": "Canada", "IN": "India",
1471
+ "DE": "Germany", "FR": "France", "IT": "Italy", "ES": "Spain", "BR": "Brazil", "JP": "Japan",
1472
+ "CN": "China", "RU": "Russia", "KR": "South Korea", "ZA": "South Africa", "NG": "Nigeria",
1473
+ "MX": "Mexico", "AR": "Argentina", "CL": "Chile", "CO": "Colombia", "NL": "Netherlands",
1474
+ "SE": "Sweden", "NO": "Norway", "DK": "Denmark", "FI": "Finland", "IE": "Ireland", "PL": "Poland",
1475
+ "PT": "Portugal", "GR": "Greece", "TR": "Turkey", "IL": "Israel", "SA": "Saudi Arabia",
1476
+ "AE": "United Arab Emirates", "SG": "Singapore", "MY": "Malaysia", "TH": "Thailand",
1477
+ "PH": "Philippines", "ID": "Indonesia", "NZ": "New Zealand",
1478
+ }
1479
+ country_guess = iso2map.get(str(sc).upper(), sc if len(str(sc)) > 2 else None)
1480
+
1481
+ coords = get_country_centroid(country_guess) if country_guess else geocode_source(source_name, domain, do_network=False)
1482
+
1483
+ # Title / description (raw)
1484
+ title = (a.get("title") or "").strip() or "(untitled)"
1485
+ description = (a.get("description") or "").strip()
1486
+
1487
+ if description.lower().startswith("no description"):
1488
+ description = ""
1489
+
1490
+ # Prefer cached page summary when weak/title-like
1491
+ cached_desc = _desc_cache_get(article_url)
1492
+ need_upgrade = (
1493
+ (not description)
1494
+ or description.lower().startswith("no description")
1495
+ or len(description) < DESC_MIN_LEN
1496
+ or _too_similar(title, description)
1497
+ )
1498
+ if need_upgrade and cached_desc:
1499
+ description = cached_desc
1500
+
1501
+ if description:
1502
+ description = _tidy_description(title, description, source_name)
1503
+ if (not description) or _too_similar(title, description):
1504
+ description = f"Quick take: {title.rstrip('.')}."
1505
+
1506
+ # Save originals for categorization and debug
1507
+ orig_title = title
1508
+ orig_description = description
1509
+
1510
+ # Language detection / sentiment
1511
+ detected_lang = (detect_lang(f"{title} {description}") or "").lower()
1512
+ ml_text = f"{orig_title}. {orig_description}".strip()
1513
+ sentiment = classify_sentiment(f"{orig_title} {orig_description}")
1514
+
1515
+ # Stable id & category (ALWAYS infer; ignore provider/requested categories)
1516
+ seed = f"{source_name}|{article_url}|{title}"
1517
+ uid = hashlib.md5(seed.encode("utf-8")).hexdigest()[:12]
1518
+ cat = infer_category(article_url, orig_title, orig_description, None)
1519
+
1520
+ return {
1521
+ "id": uid,
1522
+ "title": title,
1523
+ "url": article_url,
1524
+ "source": source_name,
1525
+ "lat": coords["lat"],
1526
+ "lon": coords["lon"],
1527
+ "country": coords.get("country", "Unknown"),
1528
+ "description": description,
1529
+ "sentiment": sentiment,
1530
+ "api_source": a.get("api_source", "unknown"),
1531
+ "publishedAt": a.get("publishedAt"),
1532
+ "_ml_text": ml_text,
1533
+ "orig_title": orig_title,
1534
+ "orig_description": orig_description,
1535
+ "detected_lang": detected_lang,
1536
+ "translated": False,
1537
+ "category": cat,
1538
+ }
1539
+
1540
+
1541
+
1542
+
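enrich_article keys each article by an MD5 of source, canonical URL and title, truncated to 12 hex characters, so the same story keeps the same id across fetches. The scheme in isolation (the sample values below are made up):

import hashlib

def article_id(source, url, title):
    seed = f"{source}|{url}|{title}"
    return hashlib.md5(seed.encode("utf-8")).hexdigest()[:12]

# Identical inputs always produce the same 12-character id.
print(article_id("Reuters", "https://example.com/story", "Example headline"))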
1543
+ # ----------------- Clustering into Events -----------------
1544
+ # sbert_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
1545
+
1546
+ # cluster_articles()
1547
+ def cluster_articles(articles: List[Dict[str, Any]], sim_threshold=SIM_THRESHOLD, speed=Speed.balanced):
1548
+ if speed == Speed.fast:
1549
+ articles = articles[:150] # early cap
1550
+ sim_threshold = max(sim_threshold, 0.64)
1551
+ elif speed == Speed.balanced:
1552
+ articles = articles[:]
1553
+ sim_threshold = max(sim_threshold, 0.62)
1554
+ texts = [_cluster_text(a) for a in articles]
1555
+ embs = get_sbert().encode(texts, convert_to_tensor=True, normalize_embeddings=True, show_progress_bar=False)
1556
+ clusters = [] # [{indices:[...], centroid:tensor}]
1557
+ centroids = []
1558
+
1559
+ for i, emb in enumerate(embs):
1560
+ best_idx, best_sim = -1, -1.0
1561
+ for ci, c_emb in enumerate(centroids):
1562
+ sim = util.cos_sim(emb, c_emb).item()
1563
+ if sim > sim_threshold and sim > best_sim:
1564
+ best_sim, best_idx = sim, ci
1565
+ if best_idx >= 0:
1566
+ clusters[best_idx]["indices"].append(i)
1567
+ idxs = clusters[best_idx]["indices"]
1568
+ new_c = embs[idxs].mean(dim=0)
1569
+ new_c = new_c / new_c.norm()
1570
+ centroids[best_idx] = new_c
1571
+ clusters[best_idx]["centroid"] = new_c
1572
+ else:
1573
+ # no match above the threshold: seed a new cluster from this article's embedding
1574
+ event_id = hashlib.md5(texts[i].encode("utf-8")).hexdigest()[:10]
1575
+ clusters.append({"id": event_id, "indices": [i], "centroid": emb})
1576
+ centroids.append(emb)
1577
+
1578
+ # second-pass merge to reduce fragmenting
1579
+ merged = _merge_close_clusters(clusters, embs, threshold=0.70)
1580
+
1581
+ # keep ids stable: recompute with URLs of member articles
1582
+ for c in merged:
1583
+ c["id"] = cluster_id(c, articles)
1584
+
1585
+ return merged
1586
+
1587
+
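cluster_articles is a single-pass greedy clusterer: each article embedding joins the most similar existing centroid when the cosine similarity clears the threshold, otherwise it seeds a new cluster, and the winning centroid is re-averaged and re-normalised. A condensed, standalone sketch of the same idea on raw sentences (the 0.62 threshold mirrors the balanced-mode floor above; the model name is an assumption, not a value read from this file):

from sentence_transformers import SentenceTransformer, util

def greedy_cluster(texts, threshold=0.62):
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # assumed model
    embs = model.encode(texts, convert_to_tensor=True, normalize_embeddings=True)
    clusters, centroids = [], []
    for i, emb in enumerate(embs):
        best, best_sim = -1, threshold
        for ci, cent in enumerate(centroids):
            sim = util.cos_sim(emb, cent).item()
            if sim > best_sim:
                best, best_sim = ci, sim
        if best >= 0:
            clusters[best].append(i)
            cent = embs[clusters[best]].mean(dim=0)   # recompute the centroid
            centroids[best] = cent / cent.norm()      # keep it unit length
        else:
            clusters.append([i])
            centroids.append(emb)
    return clusters

print(greedy_cluster([
    "Central bank raises interest rates",
    "Interest rates hiked by central bank",
    "Local team wins championship final",
]))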
1588
+ def event_payload_from_cluster(cluster, enriched_articles):
1589
+ idxs = cluster["indices"]
1590
+ arts = [enriched_articles[i] for i in idxs]
1591
+ title_counts = Counter([a["title"] for a in arts])
1592
+ canonical_title = title_counts.most_common(1)[0][0]
1593
+ keywords = list({w.lower() for t in title_counts for w in t.split() if len(w) > 3})[:8]
1594
+ sources = {a["source"] for a in arts}
1595
+ countries = {a["country"] for a in arts if a["country"] and a["country"] != "Unknown"}
1596
+ ts = [a.get("publishedAt") for a in arts if a.get("publishedAt")]
1597
+ return {
1598
+ "event_id": cluster_id(cluster, enriched_articles), # <-- stable id
1599
+ "title": canonical_title,
1600
+ "keywords": keywords,
1601
+ "article_count": len(arts),
1602
+ "source_count": len(sources),
1603
+ "country_count": len(countries),
1604
+ "time_range": {"min": min(ts) if ts else None, "max": max(ts) if ts else None},
1605
+ "sample_urls": [a["url"] for a in arts[:3] if a.get("url")],
1606
+ }
1607
+
1608
+
1609
+ def aggregate_event_by_country(cluster, enriched_articles):
1610
+ idxs = cluster["indices"]
1611
+ arts = [enriched_articles[i] for i in idxs]
1612
+ by_country: Dict[str, Dict[str, Any]] = {}
1613
+
1614
+ for a in arts:
1615
+ c = a.get("country") or "Unknown"
1616
+ if c not in by_country:
1617
+ coords = get_country_centroid(c)
1618
+ by_country[c] = {"country": c, "lat": coords["lat"], "lon": coords["lon"], "articles": []}
1619
+ by_country[c]["articles"].append(a)
1620
+
1621
+ # summarize per country
1622
+ results = []
1623
+ for c, block in by_country.items():
1624
+ arr = block["articles"]
1625
+ # avg sentiment mapped to -1/0/+1
1626
+ to_num = {"negative": -1, "neutral": 0, "positive": 1}
1627
+ vals = [to_num.get(a["sentiment"], 0) for a in arr]
1628
+ avg = sum(vals) / max(len(vals), 1)
1629
+ avg_sent = "positive" if avg > 0.15 else "negative" if avg < -0.15 else "neutral"
1630
+ top_sources = [s for s, _ in Counter([a["source"] for a in arr]).most_common(3)]
1631
+ # tiny extractive summary: top 2 headlines
1632
+ summary = " β€’ ".join([a["title"] for a in arr[:2]])
1633
+ results.append(
1634
+ {
1635
+ "country": c,
1636
+ "lat": block["lat"],
1637
+ "lon": block["lon"],
1638
+ "count": len(arr),
1639
+ "avg_sentiment": avg_sent,
1640
+ "top_sources": top_sources,
1641
+ "summary": summary,
1642
+ "samples": [
1643
+ {
1644
+ "title": a["title"],
1645
+ "orig_title": a.get("orig_title"),
1646
+ "orig_description": a.get("orig_description"), # πŸ‘ˆ add this
1647
+ "url": a["url"],
1648
+ "source": a["source"],
1649
+ "sentiment": a["sentiment"],
1650
+ "detected_lang": a.get("detected_lang"),
1651
+ }
1652
+ for a in arr[:5]
1653
+ ],
1654
+ }
1655
+ )
1656
+ return results
1657
+
1658
+
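aggregate_event_by_country reduces each country's sentiment labels to a single label by mapping them to -1/0/+1, averaging, and applying a Β±0.15 dead zone around neutral. The same rule in isolation:

def average_sentiment(labels):
    to_num = {"negative": -1, "neutral": 0, "positive": 1}
    vals = [to_num.get(label, 0) for label in labels]
    avg = sum(vals) / max(len(vals), 1)
    return "positive" if avg > 0.15 else "negative" if avg < -0.15 else "neutral"

print(average_sentiment(["positive", "neutral", "positive"]))   # -> positive (avg = 0.67)
print(average_sentiment(["positive", "negative", "neutral"]))   # -> neutral  (avg = 0.0)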
1659
+ def _merge_close_clusters(clusters, embs, threshold=0.68):
1660
+ # clusters: [{"indices": [...], "centroid": tensor}, ...]; centroids are set by the first clustering pass
1661
+ merged = []
1662
+ used = set()
1663
+ for i in range(len(clusters)):
1664
+ if i in used:
1665
+ continue
1666
+ base = clusters[i]
1667
+ group = [i]
1668
+ for j in range(i + 1, len(clusters)):
1669
+ if j in used:
1670
+ continue
1671
+ sim = util.cos_sim(base["centroid"], clusters[j]["centroid"]).item()
1672
+ if sim >= threshold:
1673
+ group.append(j)
1674
+ # merge those groups
1675
+ all_idx = []
1676
+ cents = []
1677
+ for g in group:
1678
+ used.add(g)
1679
+ all_idx.extend(clusters[g]["indices"])
1680
+ cents.append(clusters[g]["centroid"])
1681
+ # new centroid
1682
+ newc = torch.stack(cents, dim=0).mean(dim=0)
1683
+ newc = newc / newc.norm()
1684
+ merged.append({"indices": sorted(set(all_idx)), "centroid": newc})
1685
+ return merged
1686
+
1687
+
1688
+ # ----------------- Endpoints -----------------
1689
+ prefetch = False
1690
+
1691
+ @app.get("/events")
1692
+ def get_events(
1693
+ q: Optional[str] = Query(None),
1694
+ category: Optional[str] = Query(None),
1695
+ language: Optional[str] = Query(None),
1696
+ translate: Optional[bool] = Query(False),
1697
+ target_lang: Optional[str] = Query(None),
1698
+ limit_each: int = Query(150, ge=5, le=250),
1699
+ max_events: int = Query(15, ge=5, le=50),
1700
+ min_countries: int = Query(2, ge=1, le=50),
1701
+ min_articles: int = Query(2, ge=1, le=200),
1702
+ speed: Speed = Query(Speed.balanced),
1703
+ ):
1704
+
1705
+ # always build cache on untranslated data
1706
+ cache_key, enriched, clusters = get_or_build_events_cache(
1707
+ q, category, language, False, None, limit_each, speed=speed
1708
+ )
1709
+
1710
+ # optional post-translate view (does not mutate cache)
1711
+ view = enriched
1712
+ if translate and target_lang:
1713
+ view = [dict(i) for i in enriched]
1714
+ for i in view:
1715
+ src_hint = i.get("detected_lang")
1716
+ i["title"] = translate_text(i.get("title") or "", target_lang, fallback_src=src_hint)
1717
+ i["description"] = translate_text(i.get("description") or "", target_lang, fallback_src=src_hint)
1718
+ i["translated"] = True
1719
+
1720
+ events = [event_payload_from_cluster(c, view) for c in clusters]
1721
+ events = [e for e in events if (e["country_count"] >= min_countries and e["article_count"] >= min_articles)]
1722
+ events.sort(key=lambda e: e["article_count"], reverse=True)
1723
+
1724
+ return {"events": events[:max_events], "cache_key": "|".join(map(str, cache_key))}
1725
+
1726
+ @app.get("/event/{event_id}")
1727
+ def get_event_details(
1728
+ event_id: str,
1729
+ cache_key: Optional[str] = Query(None),
1730
+ q: Optional[str] = Query(None),
1731
+ category: Optional[str] = Query(None),
1732
+ language: Optional[str] = Query(None),
1733
+ translate: Optional[bool] = Query(False),
1734
+ target_lang: Optional[str] = Query(None),
1735
+ limit_each: int = Query(150, ge=5, le=250),
1736
+ ):
1737
+ # /event/{event_id}
1738
+ if cache_key:
1739
+ parts = cache_key.split("|")
1740
+ if len(parts) != 7:
1741
+ raise HTTPException(status_code=400, detail="Bad cache_key")
1742
+ speed_str = parts[6]
1743
+ try:
1744
+ speed_obj = Speed(speed_str) # "fast" | "balanced" | "max"
1745
+ except ValueError:
1746
+ speed_obj = Speed.balanced
1747
+ key_tuple = (parts[0], parts[1], parts[2], int(parts[3]),
1748
+ parts[4] == "True", parts[5].lower(), speed_str)
1749
+ else:
1750
+ speed_obj = Speed.balanced
1751
+ key_tuple = cache_key_for(q, category, language, limit_each, translate, target_lang, speed=speed_obj)
1752
+
1753
+ entry = _events_cache.get(key_tuple)
1754
+ if not entry:
1755
+ # always build untranslated
1756
+ _, enriched, clusters = get_or_build_events_cache(
1757
+ q, category, language, False, None, limit_each, speed=speed_obj
1758
+ )
1759
+ else:
1760
+ enriched, clusters = entry["enriched"], entry["clusters"]
1761
+
1762
+ # optional post-translate view (do not mutate cache)
1763
+ eview = enriched
1764
+ if translate and target_lang:
1765
+ eview = [dict(i) for i in enriched]
1766
+ for i in eview:
1767
+ src_hint = i.get("detected_lang")
1768
+ i["title"] = translate_text(i.get("title") or "", target_lang, fallback_src=src_hint)
1769
+ i["description"] = translate_text(i.get("description") or "", target_lang, fallback_src=src_hint)
1770
+ i["translated"] = True
1771
+
1772
+ cluster = next((c for c in clusters if cluster_id(c, enriched) == event_id), None)
1773
+ if not cluster:
1774
+ raise HTTPException(status_code=404, detail="Event not found with current filters")
1775
+
1776
+ payload = event_payload_from_cluster(cluster, eview)
1777
+ countries = aggregate_event_by_country(cluster, eview)
1778
+ payload["articles_in_event"] = sum(c["count"] for c in countries)
1779
+ return {"event": payload, "countries": countries}
1780
+
1781
+
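Both /event/{event_id} and /news accept the pipe-delimited cache_key returned by /events: seven fields in the order q|category|language|limit_each|translate|target_lang|speed. A client-side parser showing the expected shape (the sample value is illustrative; the server keeps its own cache_key_for helper):

def parse_cache_key(cache_key):
    """Split 'q|category|language|limit_each|translate|target_lang|speed'."""
    parts = cache_key.split("|")
    if len(parts) != 7:
        raise ValueError("cache_key must have exactly 7 '|'-separated fields")
    q, category, language, limit_each, translate, target_lang, speed = parts
    return (q, category, language, int(limit_each),
            translate == "True", target_lang.lower(), speed)

print(parse_cache_key("None|None|None|150|False|None|balanced"))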
1782
+ @app.get("/news")
1783
+ def get_news(
1784
+ cache_key: Optional[str] = Query(None),
1785
+ category: Optional[str] = Query(None),
1786
+ sentiment: Optional[str] = Query(None),
1787
+ q: Optional[str] = Query(None),
1788
+ language: Optional[str] = Query(None),
1789
+ translate: Optional[bool] = Query(False),
1790
+ target_lang: Optional[str] = Query(None),
1791
+ limit_each: int = Query(100, ge=5, le=100),
1792
+ lite: bool = Query(True),
1793
+ speed: Speed = Query(Speed.balanced),
1794
+ page: int = Query(1, ge=1),
1795
+ page_size: int = Query(120, ge=5, le=300),
1796
+ ):
1797
+ enriched: List[Dict[str, Any]] = []
1798
+
1799
+ # Pull from cache if provided
1800
+ if cache_key:
1801
+ parts = cache_key.split("|")
1802
+ if len(parts) == 7:
1803
+ key_tuple = (
1804
+ parts[0], # q
1805
+ parts[1], # category
1806
+ parts[2], # language
1807
+ int(parts[3]), # limit_each
1808
+ parts[4] == "True", # translate
1809
+ parts[5].lower(), # target_lang
1810
+ parts[6], # speed
1811
+ )
1812
+ entry = _events_cache.get(key_tuple)
1813
+ if entry:
1814
+ enriched = entry["enriched"]
1815
+
1816
+ # Fetch fresh if no cached items
1817
+ if not enriched:
1818
+ raw = combine_raw_articles(category=category, query=q, language=language, limit_each=limit_each, speed=speed)
1819
+ prefetch_descriptions_async(raw, speed)
1820
+ enriched_all = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
1821
+ if category:
1822
+ cat_norm = (category or "").strip().lower()
1823
+ enriched = [e for e in enriched_all if (e.get("category") or "").lower() == cat_norm]
1824
+ else:
1825
+ enriched = enriched_all
1826
+ else:
1827
+ # Cached items: still enforce the selected category filter
1828
+ if category:
1829
+ cat_norm = (category or "").strip().lower()
1830
+ enriched = [e for e in enriched if (e.get("category") or "").lower() == cat_norm]
1831
+
1832
+ # Translation (optional)
1833
+ if translate and target_lang:
1834
+ enriched = [dict(i) for i in enriched]
1835
+ for i in enriched:
1836
+ i["original_title"] = i.get("orig_title") or i.get("title")
1837
+ i["original_description"] = i.get("orig_description") or i.get("description")
1838
+ src_hint = i.get("detected_lang")
1839
+ i["title"] = translate_text(i.get("title") or "", target_lang, fallback_src=src_hint)
1840
+ i["description"] = translate_text(i.get("description") or "", target_lang, fallback_src=src_hint)
1841
+ i["translated"] = True
1842
+ i["translated_from"] = (src_hint or "").lower()
1843
+ i["translated_to"] = target_lang.lower()
1844
+
1845
+ # Optional sentiment filter
1846
+ if sentiment:
1847
+ s = sentiment.strip().lower()
1848
+ enriched = [i for i in enriched if i.get("sentiment", "").lower() == s]
1849
+
1850
+ # Pagination
1851
+ total = len(enriched)
1852
+ start = (page - 1) * page_size
1853
+ end = start + page_size
1854
+ items = [dict(i) for i in enriched[start:end]]
1855
+
1856
+ # Trim debug fields
1857
+ if lite:
1858
+ drop = {"_ml_text"}
1859
+ for i in items:
1860
+ for k in drop:
1861
+ i.pop(k, None)
1862
+
1863
+ return {
1864
+ "items": items,
1865
+ "total": total,
1866
+ "page": page,
1867
+ "page_size": page_size
1868
+ }
1869
+
1870
+
1871
+
1872
+
1873
+
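/news is paginated, so a client collects pages until it has all `total` items. A minimal consumer sketch (the base URL and filter values are placeholders, assuming the service is reachable locally):

import requests

def fetch_all_news(base_url="http://localhost:8000", page_size=100, **filters):
    items, page = [], 1
    while True:
        params = {"page": page, "page_size": page_size, **filters}
        data = requests.get(f"{base_url}/news", params=params, timeout=30).json()
        items.extend(data["items"])
        if not data["items"] or len(items) >= data["total"]:
            break
        page += 1
    return items

# e.g. fetch_all_news(category="technology", sentiment="positive")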
1874
+ def combine_raw_articles(category=None, query=None, language=None, limit_each=30,
1875
+ timespan="3d", speed=Speed.balanced, log_summary: bool = True):
1876
+ if speed == Speed.fast:
1877
+ timespan = "24h"
1878
+ limit_each = min(limit_each, 20)
1879
+ elif speed == Speed.balanced:
1880
+ timespan = "48h"
1881
+ limit_each = min(limit_each, 150)
1882
+
1883
+ a1 = []
1884
+ if USE_NEWSAPI:
1885
+ if not query:
1886
+ a1 = fetch_newsapi_headlines_multi(limit=limit_each, language=language)
1887
+ else:
1888
+ a1 = fetch_newsapi_articles(category=category, limit=limit_each, query=query, language=language)
1889
+
1890
+ a2 = []
1891
+ if USE_NEWSDATA_API:
1892
+ a2 = [
1893
+ normalize_newsdata_article(a)
1894
+ for a in fetch_newsdata_articles(category=category, limit=limit_each, query=query, language=language)
1895
+ if a.get("link")
1896
+ ]
1897
+
1898
+ a3 = fetch_gnews_articles(limit=limit_each, query=query, language=language) if USE_GNEWS_API else []
1899
+ # a4 = fetch_gdelt_articles(
1900
+ # limit=min(100, limit_each * 2),
1901
+ # query=query,
1902
+ # language=language,
1903
+ # timespan=timespan,
1904
+ # category=category
1905
+ # )
1906
+ gdelt_limit = limit_each
1907
+ a4 = fetch_gdelt_multi(
1908
+ limit=gdelt_limit,
1909
+ query=query,
1910
+ language=language, # if provided, we honor it (with small EN boost)
1911
+ timespan=timespan,
1912
+ category=category,
1913
+ speed=speed,
1914
+ )
1915
+
1916
+ # Dedup by canonical URL (maintain source precedence)
1917
+ seen, merged = set(), []
1918
+ for a in a1 + a3 + a2 + a4:
1919
+ if a.get("url"):
1920
+ a["url"] = _canonical_url(a["url"])
1921
+ url = a.get("url")
1922
+ if url and url not in seen:
1923
+ seen.add(url)
1924
+ merged.append(a)
1925
+
1926
+ if log_summary:
1927
+ fetch_log.info("----- Article Fetch Summary -----")
1928
+ fetch_log.info(f"πŸ“Š NewsAPI returned: {len(a1)} articles")
1929
+ fetch_log.info(f"πŸ“Š NewsData.io returned: {len(a2)} articles")
1930
+ fetch_log.info(f"πŸ“Š GNews returned: {len(a3)} articles")
1931
+ fetch_log.info(f"πŸ“Š GDELT returned: {len(a4)} articles")
1932
+ fetch_log.info(f"βœ… Total merged articles after deduplication: {len(merged)}")
1933
+ fetch_log.info("---------------------------------")
1934
+
1935
+ return merged
1936
+
1937
+
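combine_raw_articles merges the providers in a fixed precedence order and keeps only the first article seen per canonical URL. The sketch below shows the same order-preserving dedup with a simplified canonicaliser (lowercased host, fragment and utm_* parameters dropped); main.py's _canonical_url may normalise differently:

from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

def canonical_url(url):
    """Illustrative canonicaliser, not the one used in main.py."""
    parts = urlsplit(url)
    query = [(k, v) for k, v in parse_qsl(parts.query) if not k.startswith("utm_")]
    return urlunsplit((parts.scheme, parts.netloc.lower(), parts.path,
                       urlencode(query), ""))

def dedup_by_url(*article_lists):
    seen, merged = set(), []
    for articles in article_lists:            # earlier lists win on conflicts
        for a in articles:
            url = canonical_url(a.get("url") or "")
            if url and url not in seen:
                seen.add(url)
                a["url"] = url
                merged.append(a)
    return merged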
1938
+ @app.get("/related")
1939
+ def related_articles(
1940
+ id: Optional[str] = Query(None, description="article id from /news"),
1941
+ title: Optional[str] = Query(None),
1942
+ description: Optional[str] = Query(None),
1943
+ q: Optional[str] = Query(None),
1944
+ category: Optional[str] = Query(None),
1945
+ language: Optional[str] = Query(None),
1946
+ limit_each: int = Query(50, ge=5, le=100),
1947
+ k: int = Query(10, ge=1, le=50),
1948
+ ):
1949
+ # ensure we have a working article list (enriched) to search over
1950
+ raw = combine_raw_articles(category=category, query=q, language=language, limit_each=limit_each)
1951
+ enriched = [enrich_article(a, language=language, translate=False, target_lang=None) for a in raw]
1952
+ if not enriched:
1953
+ return {"items": []}
1954
+
1955
+ # pick the query vector
1956
+ if id:
1957
+ base = next((a for a in enriched if a.get("id") == id), None)
1958
+ if not base:
1959
+ raise HTTPException(404, "article id not found in current fetch")
1960
+ query_text = base["_ml_text"]
1961
+ else:
1962
+ text = f"{title or ''} {description or ''}".strip()
1963
+ if not text:
1964
+ raise HTTPException(400, "provide either id or title/description")
1965
+ query_text = text
1966
+
1967
+ corpus_texts = [a["_ml_text"] for a in enriched]
1968
+ corpus_embs = _embed_texts(corpus_texts)
1969
+ query_emb = _embed_texts([query_text])[0]
1970
+
1971
+ # cosine similarities
1972
+ sims = util.cos_sim(query_emb, corpus_embs).cpu().numpy().flatten()
1973
+
1974
+ # take top-k excluding the query itself (if id provided)
1975
+ idxs = sims.argsort()[::-1]
1976
+ items = []
1977
+ for idx in idxs:
1978
+ a = enriched[idx]
1979
+ if id and a["id"] == id:
1980
+ continue
1981
+ items.append({**a, "similarity": float(sims[idx])})
1982
+ if len(items) >= k:
1983
+ break
1984
+
1985
+ return {"items": items}
1986
+
1987
+ @app.middleware("http")
1988
+ async def timing_middleware(request, call_next):
1989
+ start = time.perf_counter()
1990
+ response = None
1991
+ try:
1992
+ response = await call_next(request)
1993
+ return response
1994
+ finally:
1995
+ dur_ms = (time.perf_counter() - start) * 1000
1996
+ # log.info(f"{request.method} {request.url.path} -> {dur_ms:.1f} ms ({_fmt_mmss(dur_ms)})")
1997
+ if response is not None:
1998
+ try:
1999
+ response.headers["X-Process-Time-ms"] = f"{dur_ms:.1f}"
2000
+ except Exception:
2001
+ pass
2002
+
2003
+ @app.post("/client-metric")
2004
+ def client_metric(payload: Dict[str, Any] = Body(...)):
2005
+ name = (payload.get("name") or "").strip()
2006
+ # Drop redraw spam if it ever slips through again
2007
+ if name in {"Load all article markers on globe", "Load event country markers on globe"}:
2008
+ return {"ok": True}
2009
+ return {"ok": True}
2010
+
2011
+ @app.get("/diag/translate")
2012
+ def diag_translate():
2013
+ remote = _hf_call("Helsinki-NLP/opus-mt-es-en", {"inputs":"Hola mundo"})
2014
+ local = _translate_local("Hola mundo", "es", "en")
2015
+ libre = _translate_via_libre("Hola mundo", "es", "en")
2016
+ return {
2017
+ "token": bool(HUGGINGFACE_API_TOKEN),
2018
+ "remote_ok": bool(remote),
2019
+ "local_ok": bool(local),
2020
+ "libre_ok": bool(libre),
2021
+ "sample": libre or remote or local
2022
+ }
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.111.0
2
+ uvicorn[standard]==0.30.1
3
+ requests>=2.31.0
4
+ tldextract>=5.1.2
5
+ nltk>=3.8.1
6
+ geopy>=2.4.1
7
+ countryinfo>=0.1.2
8
+ sentence-transformers>=2.5.1
9
+ transformers>=4.42.0
10
+ torch>=2.1.0
11
+ langdetect>=1.0.9
12
+ beautifulsoup4>=4.12.3
13
+ sentencepiece>=0.2.0
14
+ starlette==0.37.2
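With these pins installed (pip install -r requirements.txt), the API can also be started programmatically; a minimal launcher, assuming the FastAPI instance in main.py is exposed as app and that port 8000 is free:

# run_local.py
import uvicorn

if __name__ == "__main__":
    # reload=True is convenient during development; drop it for production-style runs
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)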