elly99 committed on
Commit
a7be358
·
verified ·
1 Parent(s): 198ea1b

Create science/scientific_analysis.py

Browse files
Files changed (1) hide show
  1. src/science/scientific_analysis.py +390 -0
src/science/scientific_analysis.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # © 2025 Elena Marziali — Code released under Apache 2.0 license.
2
+ # See LICENSE in the repository for details.
3
+ # Removal of this copyright is prohibited.
4
+
5
# === Asynchronous Functions ===
# NOTE(review): this module was missing its imports — asyncio is used on the
# line below, and logging / ElementTree / numpy are used throughout the file.
# aiohttp is also used by every coroutine here but is never imported anywhere
# in this file; it must be added as well (third-party dependency).
import asyncio
import logging
import xml.etree.ElementTree as ET

import numpy as np

# At most MAX_REQUESTS API calls may run concurrently; all request
# coroutines share this one semaphore.
MAX_REQUESTS = 5
API_SEMAPHORE = asyncio.Semaphore(MAX_REQUESTS)
8
+
9
async def safe_api_request(url):
    """Rate-limited JSON GET helper.

    Acquires the shared API_SEMAPHORE (at most MAX_REQUESTS concurrent
    calls), fetches *url* with a 10-second timeout, and returns the parsed
    JSON body. Any error is logged and mapped to a None return value.
    """
    async with API_SEMAPHORE:
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, timeout=10) as response:
                    response.raise_for_status()
                    payload = await response.json()
            except Exception as e:
                logging.error(f"API request error: {e}")
                return None
            return payload
19
+
20
# Connection pooling
async def safe_api_request(url):
    """Fetch *url* and return its JSON body, or None on failure.

    NOTE(review): this redefines safe_api_request from above and therefore
    shadows it. The first version throttled requests through API_SEMAPHORE;
    this redefinition had silently dropped that limit. The semaphore is
    restored here so the effective (last) definition keeps the MAX_REQUESTS
    concurrency cap.
    """
    async with API_SEMAPHORE:
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, timeout=10) as response:
                    response.raise_for_status()
                    return await response.json()
            except Exception as e:
                logging.error(f"API request error: {e}")
                return None
30
+
31
# Smart timeout
import asyncio

async def timeout_handler(task, timeout=20):
    """Await *task* with an upper time bound.

    Returns the task's result, or None (after logging an error) when it
    does not finish within *timeout* seconds.
    """
    try:
        result = await asyncio.wait_for(task, timeout)
    except asyncio.TimeoutError:
        logging.error("API request timed out")
        return None
    return result
40
+
41
import requests

# Import-time connectivity probe against the arXiv API.
# NOTE(review): this runs as a module-level side effect on import; consider
# moving it under a __main__ guard. It is now wrapped in try/except so a
# network failure no longer aborts the whole module import.
url = "http://export.arxiv.org/api/query?search_query=all:physics&start=0&max_results=1"
try:
    response = requests.get(url, timeout=50)
    if response.status_code == 200:
        print("Connection to arXiv OK")
    else:
        print(f"Connection error: {response.status_code}")
except requests.RequestException as e:
    print(f"Connection error: {e}")
50
+
51
# Advanced parallelization
async def fetch_multiple_data(urls):
    """Fetch every URL in *urls* concurrently via safe_api_request.

    Results come back in input order; an exception from any single request
    is captured in its result slot instead of cancelling the whole batch.
    """
    pending = (safe_api_request(u) for u in urls)
    return await asyncio.gather(*pending, return_exceptions=True)
56
+
57
# Retrieve scientific sources from Zenodo
async def search_zenodo_async(query, max_results=5):
    """
    Searches for open access articles and resources from Zenodo using their public API.

    Returns a list of {"title", "authors", "abstract", "url"} dicts, a
    one-element error dict when the search finds nothing, or [] when the
    request itself fails (the failure is logged).
    """
    url = f"https://zenodo.org/api/records/?q={query}&size={max_results}"

    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=10) as response:
                response.raise_for_status()
                data = await response.json()

            articles = []
            for hit in data.get("hits", {}).get("hits", []):
                metadata = hit.get("metadata", {})
                articles.append({
                    "title": metadata.get("title", "Title not available"),
                    "authors": ", ".join(c.get("name", "") for c in metadata.get("creators", [])),
                    "abstract": metadata.get("description", "Abstract not available"),
                    "url": hit.get("links", {}).get("html", "No link"),
                })

            return articles if articles else [{"error": "No results found on Zenodo."}]

        except Exception as e:
            # Previously this swallowed the exception silently (bare
            # `return []` with `e` unused); log it so failures are
            # diagnosable, keeping the [] error contract.
            logging.error(f"Zenodo search error: {e}")
            return []
88
+
89
# Retrieve scientific sources from PubMed
async def search_pubmed_async(query, max_results=5):
    """Asynchronously retrieves scientific article links from PubMed.

    Returns a list of article URLs built from the PubMed IDs in the
    esearch XML response, or [] on any failure (logged).
    """
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={query}&retmax={max_results}&retmode=xml"

    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=10) as response:
                response.raise_for_status()
                content = await response.text()
            root = ET.fromstring(content)

            # esearch returns the matching PMIDs as <Id> elements.
            return [
                f"https://pubmed.ncbi.nlm.nih.gov/{id_element.text}/"
                for id_element in root.findall(".//Id")
            ]
        except Exception as e:
            # The old code returned an error *string* here, which the caller
            # (search_multi_database) discards as an invalid source. Log the
            # error and return an empty list so the failure path has the
            # same type as the success path.
            logging.error(f"PubMed error: {e}")
            return []
108
+
109
+
110
# Function to handle asynchronous responses from arXiv
def parse_arxiv_response(content):
    """Extracts titles and abstracts from an arXiv Atom feed.

    Returns a list of {"title", "abstract"} dicts; [] when the XML cannot
    be parsed or contains no entries.
    """
    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        logging.error("Error parsing arXiv XML.")
        return []

    articles = []
    # arXiv responses are Atom XML with a default namespace
    # (http://www.w3.org/2005/Atom). The previous un-namespaced search
    # ".//entry" never matched any element, so parsing always produced [].
    # The {*} wildcard matches the tag in any (or no) namespace.
    for entry in root.findall(".//{*}entry"):
        title_el = entry.find("{*}title")
        summary_el = entry.find("{*}summary")
        articles.append({
            "title": title_el.text if title_el is not None else "Title not available",
            "abstract": summary_el.text if summary_el is not None else "Abstract not available",
        })

    return articles
126
+
127
# === Asynchronous search on arXiv ===
# Queries the arXiv API to retrieve scientific articles.
async def search_arxiv_async(query, max_results=3, retry_attempts=3, timeout=20):
    """ Retrieves scientific articles from arXiv with advanced error handling.

    Retries up to *retry_attempts* times with exponential backoff plus
    random jitter (capped at 10 s); returns the parsed article list, or []
    once every attempt has failed.
    """
    url = f"http://export.arxiv.org/api/query?search_query=all:{query}&start=0&max_results={max_results}"

    async with aiohttp.ClientSession() as session:
        for attempt in range(retry_attempts):
            try:
                async with session.get(url, timeout=timeout) as response:
                    response.raise_for_status()
                    content = await response.text()

                if not content.strip():
                    raise ValueError("Error: Empty response from arXiv.")

                return parse_arxiv_response(content)

            except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as e:
                # Exponential backoff with jitter, never more than 10 s.
                wait_time = min(2 ** attempt + np.random.uniform(0, 1), 10)
                logging.error(f"Attempt {attempt+1}: Error - {e}. Retrying in {wait_time:.1f} seconds...")
                await asyncio.sleep(wait_time)

    logging.error("Error: Unable to retrieve data from arXiv after multiple attempts.")
    return []
152
+
153
# === Asynchronous search on OpenAlex ===
# Retrieves scientific articles with complete metadata (title, authors, abstract, DOI)
async def search_openalex_async(query, max_results=5):
    """ Safely retrieves scientific articles from OpenAlex.

    Returns a list of {"title", "authors", "abstract", "url"} dicts, or []
    on failure (the error is logged).
    """
    url = f"https://api.openalex.org/works?filter=title.search:{query}&per-page={max_results}"

    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=10) as response:
                response.raise_for_status()
                data = await response.json()

            articles = []
            for record in data.get("results", []):
                authors = ", ".join(
                    aut.get("display_name", "Unknown author")
                    for aut in record.get("authorships", [])
                )
                # NOTE(review): OpenAlex work objects expose
                # "abstract_inverted_index" rather than a plain "abstract"
                # field, so this default likely always triggers — verify.
                articles.append({
                    "title": record.get("title", "Title not available"),
                    "authors": authors,
                    "abstract": record.get("abstract", "Abstract not available"),
                    "url": record.get("doi") or record.get("id", "No link"),
                })

            return articles

        except Exception as e:
            # Was: `return f"OpenAlex error: {e}"` — a string, which
            # search_multi_database rejects as an invalid source. Log the
            # error and return the list type callers expect.
            logging.error(f"OpenAlex error: {e}")
            return []
188
+
189
+
190
# === Synchronous search on BASE ===
# Queries the BASE engine for open-access articles.
def search_base(query, max_results=5):
    """Search the BASE engine and return a Markdown-formatted result string.

    Returns the formatted hits, a "no results" message, or an error string
    when the request or parsing fails.
    """
    url = f"https://api.base-search.net/cgi-bin/BaseHttpSearchInterface?q={query}&num={max_results}&format=json"

    try:
        # Timeout added: the original call had none, so a hung connection
        # would block this (synchronous) function forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()

        results = []
        for record in data.get("docs", []):
            title = record.get("dcTitle", ["Title not available"])[0]
            link = record.get("link", ["No link available"])[0]
            results.append(f"**{title}**\n[Link to article]({link})\n")

        return "\n\n".join(results) if results else "No results found."

    except Exception as e:
        return f"Error during BASE search: {e}"
210
+
211
# === Distributed search across multiple databases ===
# Executes parallel queries on arXiv, OpenAlex, PubMed, Zenodo.
async def search_multi_database(query):
    """Query all supported databases in parallel.

    Returns an (articles, formatted_search) pair: the merged, normalized
    article dicts plus their human-readable rendering. On internal failure
    returns ([], "Internal error").
    """
    try:
        searches = (
            search_arxiv_async(query),
            search_openalex_async(query),
            search_pubmed_async(query),
            search_zenodo_async(query),
        )
        results = await asyncio.gather(*searches, return_exceptions=True)

        articles = []
        for source in results:
            if isinstance(source, list):
                articles.extend(source)
            else:
                # Exceptions and non-list payloads are dropped, not raised.
                logging.warning(f"Invalid source: {type(source)} → {source}")

        # Normalize immediately after merging so downstream code only ever
        # sees a flat list of dicts.
        articles = normalize_articles(articles)

        if isinstance(articles, list) and all(isinstance(a, dict) for a in articles):
            formatted_search = format_articles(articles)
        else:
            logging.error(f"Error: 'articles' is not a valid list. Type received: {type(articles)} - Value: {repr(articles)}")
            formatted_search = "Unable to format search: response not properly structured."

        return articles, formatted_search

    except Exception as e:
        logging.error(f"Error during multi-database search: {e}")
        return [], "Internal error"
244
+
245
+
246
# === Scientific Source Integration ===
# Selects the first N valid articles and formats them as Markdown references.
async def integrate_sources_from_database(concept, max_sources=5):
    """Return up to *max_sources* articles about *concept* as Markdown links."""
    articles, formatted_search = await search_multi_database(concept)

    well_formed = isinstance(articles, list) and all(isinstance(a, dict) for a in articles)
    if not well_formed:
        logging.warning("Invalid 'articles' structure. No sources will be displayed.")
        return "No valid sources available."

    references = []
    for article in articles[:max_sources]:
        title = article.get("title", "Title not available")
        url = article.get("url", "#")
        # Only render entries whose URL is a usable non-empty string.
        if url and isinstance(url, str):
            references.append(f"- [{title}]({url})")

    return "\n".join(references) if references else "No relevant sources found."
263
+
264
+
265
# === Data Normalization ===
# Converts heterogeneous input (dicts, strings, links) into a consistent list of articles.
def normalize_source(source):
    """Coerce one raw source into a list of article dicts.

    A list of dicts passes through unchanged, a lone dict is wrapped in a
    list, and anything else (free text, wrong types) is logged and dropped.
    """
    if isinstance(source, dict):
        # Single article: wrap it so callers always get a list back.
        return [source]
    if isinstance(source, list) and all(isinstance(x, dict) for x in source):
        return source
    if isinstance(source, str):
        logging.warning(f"Ignored textual source: {source[:50]}...")
        return []
    logging.warning(f"Invalid source type: {type(source)}")
    return []
278
+
279
def normalize_articles(article_list):
    """Filter a mixed result list down to article dicts.

    Dicts are kept as-is; bare PubMed URLs are promoted to minimal article
    dicts; everything else is logged and discarded.
    """
    valid_articles = []
    for item in article_list:
        if isinstance(item, dict):
            valid_articles.append(item)
        elif isinstance(item, str) and "pubmed.ncbi.nlm.nih.gov" in item:
            # Promote a raw PubMed link to a placeholder article record.
            stub = {
                "title": "PubMed Link",
                "abstract": "Not available",
                "url": item,
                "authors": "Unknown",
            }
            valid_articles.append(stub)
        else:
            logging.warning(f"Ignored: {repr(item)}")
    return valid_articles
294
+
295
# NOTE(review): this used a bare `await` at module level, which is a
# SyntaxError in a regular Python file (the module could not even be
# imported). Drive the demo search through asyncio.run() instead, and only
# when executed as a script, so importing this module no longer triggers
# network calls.
if __name__ == "__main__":
    articles, formatted_search = asyncio.run(search_multi_database("quantum physics"))
    print(formatted_search)
297
+
298
+
299
# === Async Task Protection Wrapper ===
# Handles timeouts and errors during asynchronous function execution.
def protect_async_task(func):
    """Decorator: run *func* with a 20-second timeout and never let it raise.

    Cancellation is logged as a warning; any other exception (including a
    timeout) is logged as an error. Both cases yield a None return value.
    """
    from functools import wraps  # local import: this file keeps imports near use

    @wraps(func)  # preserve func.__name__/__doc__ for logging and debugging
    async def wrapper(*args, **kwargs):
        try:
            return await asyncio.wait_for(func(*args, **kwargs), timeout=20)
        except asyncio.CancelledError:
            logging.warning("Task cancelled.")
            return None
        except Exception as e:
            logging.error(f"Error during execution of {func.__name__}: {e}")
            return None
    return wrapper
312
+
313
# === Asynchronous Scientific Explanation Generation ===
# Builds the prompt and invokes the LLM model.
async def generate_explanation_async(problem, level, concept, topic):
    """Generates the explanation using the LLM asynchronously."""
    # Fill the shared prompt template, then push the blocking LLM call onto
    # a worker thread so the event loop stays responsive.
    prompt = prompt_template.format(
        problem=problem,
        concept=concept,
        topic=topic,
        level=level,
    )
    try:
        return await asyncio.to_thread(llm.invoke, prompt.strip())
    except Exception as e:
        logging.error(f"LLM API error: {e}")
        return "Error generating the response."
329
+
330
# === Conditional Interactive Chart Generation ===
# Generates a chart based on the analyzed problem if requested.
def generate_conditional_chart(problem, chart_choice):
    """Generates an interactive chart if requested."""
    # Guard clause: anything other than an explicit yes means "no chart".
    if chart_choice.lower() not in ("yes", "y"):
        return None
    fig = None
    try:
        fig = generate_interactive_chart(problem)
        if fig is None:
            raise ValueError("Chart not generated correctly.")
        print("Chart generated successfully!")
    except Exception as e:
        logging.error(f"Chart error: {e}")
    return fig
344
+
345
# === Structured Output: Text + Chart ===
# Combines the generated explanation with the graphical visualization.
async def generate_complete_result(problem, level, concept, topic, chart_choice):
    """Combines explanation and chart to generate a structured output."""
    explanation = await generate_explanation_async(problem, level, concept, topic)
    figure = generate_conditional_chart(problem, chart_choice)
    return {"response": explanation, "chart": figure}
355
+
356
+
357
# === Scientific Article Validation ===
# Checks that each article has a title, abstract, and URL.
def validate_articles(raw_articles, max_articles=5):
    """
    Validates and filters the list of articles received from an AI or API source.
    Returns a clean list of dictionaries containing at least 'title', 'abstract', and 'url'.
    """
    if not isinstance(raw_articles, list):
        logging.warning(f"[validate_articles] Invalid input: expected list, received {type(raw_articles)}")
        return []

    valid_articles = []
    for i, art in enumerate(raw_articles):
        if not isinstance(art, dict):
            logging.warning(f"[validate_articles] Invalid element at position {i}: {type(art)}")
            continue

        fields = (art.get("title"), art.get("abstract"), art.get("url"))
        if all(fields):
            title, abstract, url = fields
            valid_articles.append({
                "title": str(title).strip(),
                "abstract": str(abstract).strip(),
                "url": str(url).strip(),
            })
        else:
            logging.info(f"[validate_articles] Article discarded due to incomplete data (i={i}).")

    if not valid_articles:
        logging.warning("[validate_articles] No valid articles after filtering.")

    return valid_articles[:max_articles]