from newspaper import Article
from bs4 import BeautifulSoup
import nltk
import requests

# newspaper3k's .nlp() step needs these NLTK tokenizer models.
nltk.download('punkt')
nltk.download('punkt_tab')


def generate_related_urls1(title):
    """Alternative search backend using DuckDuckGo (not called by extract_data).

    :param title: str
    :return: list of result URLs
    """
    from duckduckgo_search import DDGS
    num_results = 11
    with DDGS() as ddgs:
        results = ddgs.text(title, max_results=num_results)
        return [result["href"] for result in results]


def generate_related_urls(title):
    """Search Google for the title and collect HTTPS result links.

    :param title: str
    :return: list of result URLs
    """
    from googlesearch import search
    urls_list = []
    num_results = 11
    for url in search(title, num_results=num_results):
        # Skip non-HTTPS links and Google's own search pages.
        if url.startswith("https") and "google.com/search" not in url:
            urls_list.append(url)
    return urls_list


def extract_data(title):
    """Fetch and parse the articles found for a title.

    :param title: str
    :return: list of dicts, one per successfully processed article
    """
    urls_list = generate_related_urls(title)
    articles_data = []
    for url in urls_list[:11]:
        print(f"Processing URL: {url}")
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                print(f"Success: {url}\n")
                # Reuse the response already fetched; a second request is unnecessary.
                soup = BeautifulSoup(response.text, "html.parser")
                # Collect every heading on the page as a rough topic list.
                heading_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                topics_list = [tag.text.strip() for tag in heading_tags]
                # Let newspaper3k download and parse the article; .nlp()
                # produces the keywords and summary fields.
                article = Article(url, language="en")
                article.download()
                article.parse()
                article.nlp()
                articles_data.append({
                    "url": url,
                    "title": article.title,
                    "text": article.text,
                    "authors": article.authors,
                    "published_date": str(article.publish_date) if article.publish_date else "Unknown",
                    "top_image": article.top_image,
                    "videos": article.movies,
                    "keywords": article.keywords,
                    "summary": article.summary,
                    "topics": topics_list,
                })
            elif response.status_code == 404:
                print(f"Error: 404 Not Found - {url}\n")
            elif response.status_code == 403:
                print(f"Error: 403 Forbidden - {url}. Access Denied.\n")
        except Exception as e:
            print(f"Failed to process {url}: {str(e)}\n")
        finally:
            print("=" * 50 + "\n")
    return articles_data
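
# Usage sketch: one way to drive extract_data from the command line. The query
# string "artificial intelligence" is an illustrative assumption, not part of
# the original script.
if __name__ == "__main__":
    results = extract_data("artificial intelligence")
    print(f"Collected {len(results)} articles")
    for entry in results:
        print(f"- {entry['title']} ({entry['url']})")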