Commit 0b7fd0d
1 Parent(s): ba88389
Added PDFs in urls.txt
rag.py
CHANGED
@@ -9,7 +9,7 @@ from langchain_chroma import Chroma
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import WebBaseLoader
+from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
 from langchain.schema import Document
 import requests
 import json
@@ -145,69 +145,77 @@ class GitHubLoader(WebBaseLoader):
         return text.strip()

     def _scrape(self, url: str, *args, **kwargs) -> str:
-        """Scrape data from URL and clean it.
-
-        Args:
-            url: The URL to scrape
-            *args: Additional positional arguments
-            **kwargs: Additional keyword arguments including bs_kwargs
-
-        Returns:
-            str: The cleaned content
-        """
         response = requests.get(url)
         response.raise_for_status()

         # For directory listings (tree URLs), use the API
         if '/tree/' in url:
-            # Parse URL components
             parts = url.replace("https://github.com/", "").split("/")
             owner = parts[0]
             repo = parts[1]
             branch = parts[3] # usually 'main' or 'master'
             path = "/".join(parts[4:]) if len(parts) > 4 else ""
-
-            # Construct API URL
             api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
             api_response = requests.get(api_url)
             api_response.raise_for_status()
-
-            # Parse directory listing
             contents = api_response.json()
             if isinstance(contents, list):
-                # Format directory contents
                 files = [f"{item['name']} ({item['type']})" for item in contents]
                 return "Directory contents:\n" + "\n".join(files)
             else:
                 return f"Error: Unexpected API response for {url}"

-        # For regular files, parse HTML
         soup = BeautifulSoup(response.text, 'html.parser')

         # For README and markdown files
         readme_content = soup.find('article', class_='markdown-body')
-        if readme_content:
+        if readme_content and hasattr(readme_content, 'get_text'):
             return self.clean_text(readme_content.get_text())

         # For code files
         code_content = soup.find('table', class_='highlight')
-        if code_content:
+        if code_content and hasattr(code_content, 'get_text'):
             return self.clean_text(code_content.get_text())
-
+
         # For other content, get main content
         main_content = soup.find('main')
-        if main_content:
+        if main_content and hasattr(main_content, 'get_text'):
             return self.clean_text(main_content.get_text())

-        # Final fallback
-
+        # Final fallback: get all text from soup
+        if hasattr(soup, 'get_text'):
+            return self.clean_text(soup.get_text())
+        else:
+            return self.clean_text(str(soup))
+
+    def load(self):
+        docs = []
+        for url in self.web_paths:
+            text = self._scrape(url)
+            docs.append(Document(page_content=text, metadata={"source": url}))
+        return docs
+
+class RawContentLoader(WebBaseLoader):
+    """Loader for raw content from GitHub (Python files, etc.)"""
+
+    def _scrape(self, url: str, *args, **kwargs) -> str:
+        response = requests.get(url)
+        response.raise_for_status()
+        return response.text
+
+    def load(self):
+        docs = []
+        for url in self.web_paths:
+            text = self._scrape(url)
+            docs.append(Document(page_content=text, metadata={"source": url}))
+        return docs

 # Load documentation from urls
 def load_docs():
     # Get urls
     urlsfile = open("urls.txt")
     urls = urlsfile.readlines()
-    urls = [url.replace("\n","") for url in urls]
+    urls = [url.replace("\n","") for url in urls if not url.strip().startswith("#") and url.strip()]
     urlsfile.close()

     # Load documents from URLs
@@ -218,17 +226,40 @@ def load_docs():
         if not url:
             continue

+        # Handle PDF files
+        if url.endswith('.pdf'):
+            print(f"Loading PDF: {url}")
+            try:
+                loader = PyPDFLoader(url)
+                pdf_docs = loader.load()
+                for doc in pdf_docs:
+                    doc.metadata['source'] = url
+                docs.extend(pdf_docs)
+            except Exception as e:
+                print(f"Error loading PDF {url}: {str(e)}")
         # Check if URL is a Jupyter notebook
-        if url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
+        elif url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
             print(f"Loading notebook: {url}")
             notebook_docs = load_github_notebook(url)
             docs.extend(notebook_docs)
-        # Handle
+        # Handle raw content URLs (already in raw.githubusercontent.com format)
+        elif 'raw.githubusercontent.com' in url:
+            print(f"Loading raw content: {url}")
+            try:
+                loader = RawContentLoader([url])
+                web_docs = loader.load()
+                # Preserve original URL in metadata
+                for doc in web_docs:
+                    doc.metadata['source'] = url
+                docs.extend(web_docs)
+            except Exception as e:
+                print(f"Error loading {url}: {str(e)}")
+        # Handle Python and Markdown files using raw content (convert from blob to raw)
         elif url.endswith(('.py', '.md')) and 'github.com' in url and '/blob/' in url:
             print(f"Loading raw content: {url}")
             try:
                 raw_url = github_to_raw(url)
-                loader =
+                loader = RawContentLoader([raw_url])
                 web_docs = loader.load()
                 # Preserve original URL in metadata
                 for doc in web_docs:
@@ -285,11 +316,25 @@ def load_docs():
     return docs

 def extract_reference(url):
-    """Extract a reference keyword from the
+    """Extract a reference keyword from the URL for display in citations."""
+    # Handle GitHub blob URLs
     if "blob/main" in url:
         return url.split("blob/main/")[-1]
+    # Handle GitHub tree URLs
     elif "tree/main" in url:
         return url.split("tree/main/")[-1] or "root"
+    # Handle raw.githubusercontent.com URLs
+    elif "raw.githubusercontent.com" in url:
+        # Example: https://raw.githubusercontent.com/user/repo/branch/path/to/file.py
+        parts = url.split("raw.githubusercontent.com/")[-1].split("/")
+        if len(parts) > 3:
+            # Remove user, repo, branch
+            return "/".join(parts[3:])
+        else:
+            return url
+    # For arXiv PDFs and other URLs, just use the filename
+    elif url.endswith('.pdf') or url.endswith('.ipynb') or url.endswith('.py') or url.endswith('.md'):
+        return url.split("/")[-1]
     return url

 # Join content pages for processing
urls.txt
CHANGED
@@ -1,20 +1,37 @@
+# Repository Structure
 https://github.com/JayWadekar/gwIAS-HM/tree/main
 https://github.com/JayWadekar/gwIAS-HM/tree/main/Pipeline
 https://github.com/JayWadekar/gwIAS-HM/blob/main/README.md
-
-
-https://
-https://
-https://
-https://
-https://
-
-
-https://
-https://
-https://
-
-
-https://
-https://
-
+
+# Core Pipeline Components
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/coherent_score_hm_search.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/coherent_score_mz_fast.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/coincidence_HM.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/triggers_single_detector_HM.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/ranking_HM.py
+
+# Data Handling
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/data_operations.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/download_data.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/readligo.py
+
+# Template Bank Generation
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/template_bank_generator_HM.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/template_bank_params_O3a_HM.py
+
+# Machine Learning Components
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/ML_modules.py
+
+# Utilities and Configuration
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/params.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/python_utils.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/utils.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/gw_detect_file.py
+https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/triggering_on_cluster.py
+
+# Tutorials and Documentation
+https://github.com/JayWadekar/gwIAS-HM/blob/main/Tutorial_notebooks/4.Trig_Coin_on_cluster.ipynb
+
+# Research Papers
+https://arxiv.org/pdf/1902.10341.pdf
+https://arxiv.org/pdf/2405.17400v2.pdf