Rohil Bansal committed
Commit 821284f · 1 Parent(s): 8778311

search improved

course_search/app/run.py CHANGED
@@ -10,6 +10,7 @@ if str(project_root) not in sys.path:
     sys.path.append(str(project_root))
 
 from course_search.search_system.data_pipeline import DataPipeline
+from course_search.search_system.rag_system import RAGSystem
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
@@ -32,10 +33,21 @@ def main():
     # Setup paths
     project_root, data_dir = setup_paths()
 
+    # Create cache directory
+    cache_dir = data_dir / 'cache'
+    cache_dir.mkdir(exist_ok=True)
+
     # Run data pipeline
     logger.info("Running data pipeline...")
     pipeline = DataPipeline()
-    pipeline.run_pipeline(save_path=str(data_dir / 'courses_with_embeddings.pkl'))
+    df = pipeline.run_pipeline(
+        save_path=str(data_dir / 'courses.pkl'),
+        force_scrape=False  # Set to True to force new scraping
+    )
+
+    # Initialize RAG system with caching
+    rag_system = RAGSystem()
+    rag_system.load_and_process_data(df, cache_dir=cache_dir)
 
     # Run Gradio app
     logger.info("Starting Gradio app...")
@@ -43,7 +55,7 @@ def main():
 
     if not gradio_path.exists():
         raise FileNotFoundError(f"Gradio app not found at: {gradio_path}")
-
+
     # Change to project root directory before running
     os.chdir(str(project_root))
 
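Taken together, the run.py changes turn startup into a three-step flow: load or scrape the course data, build the RAG index from it, then launch Gradio. Below is a minimal sketch of that flow; it assumes, as the diff does, that run_pipeline returns a DataFrame and that RAGSystem.load_and_process_data(df, cache_dir=...) exists as called above. The bootstrap wrapper name is hypothetical.

    from pathlib import Path

    from course_search.search_system.data_pipeline import DataPipeline
    from course_search.search_system.rag_system import RAGSystem

    def bootstrap(data_dir: Path) -> RAGSystem:
        # Hypothetical helper mirroring what main() now does before Gradio starts.
        # Reuse the pickled scrape when it exists; force_scrape=True refreshes it.
        df = DataPipeline().run_pipeline(
            save_path=str(data_dir / 'courses.pkl'),
            force_scrape=False,
        )

        # Derived artifacts (embeddings, index) live in a separate cache folder
        cache_dir = data_dir / 'cache'
        cache_dir.mkdir(exist_ok=True)

        rag = RAGSystem()
        rag.load_and_process_data(df, cache_dir=cache_dir)
        return rag
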
course_search/search_system/data_pipeline.py CHANGED
@@ -1,47 +1,35 @@
 import pandas as pd
-from typing import Optional
-from course_search.scraper.course_scraper import CourseScraper
-from course_search.search_system.embeddings import EmbeddingGenerator
-from course_search.search_system.vector_store import FAISSManager
+from pathlib import Path
 import logging
+from course_search.scraper.course_scraper import CourseScraper
 
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 class DataPipeline:
     def __init__(self):
         self.scraper = CourseScraper()
-        self.embedding_generator = EmbeddingGenerator()
-        self.vector_store = FAISSManager()
-
-    def run_pipeline(self, save_path: Optional[str] = None) -> pd.DataFrame:
-        """
-        Run the complete data pipeline: scraping, embedding generation, and vector storage
-        """
+
+    def run_pipeline(self, save_path: str, force_scrape: bool = False) -> pd.DataFrame:
+        """Run the data pipeline with option to use cached data"""
         try:
-            # Step 1: Scrape courses
-            logger.info("Starting course scraping...")
-            df = self.scraper.scrape_all_courses()
-            logger.info(f"Scraped {len(df)} courses successfully")
+            data_path = Path(save_path)
+
+            # Check if cached data exists
+            if not force_scrape and data_path.exists():
+                logger.info("Loading cached data...")
+                return pd.read_pickle(data_path)
 
-            # Step 2: Generate embeddings
-            logger.info("Generating embeddings...")
-            df = self.embedding_generator.add_embeddings_to_df(
-                df,
-                text_column='description'
-            )
-            logger.info("Embeddings generated successfully")
+            # If no cached data or force_scrape is True, scrape new data
+            logger.info("Scraping course data...")
+            df = self.scraper.scrape_all_courses()
 
-            # Step 3: Upload to FAISS
-            logger.info("Uploading to FAISS...")
-            self.vector_store.upsert_courses(df)
+            # Save the data
+            logger.info(f"Saving data to {save_path}")
+            df.to_pickle(save_path)
 
-            # Step 4: Save data if path provided
-            if save_path:
-                logger.info(f"Saving data to {save_path}")
-                df.to_pickle(save_path)
-
             return df
 
         except Exception as e:
-            logger.error(f"Error in pipeline: {str(e)}")
+            logger.error(f"Error in data pipeline: {str(e)}")
             raise
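
With embedding generation and the FAISS upload moved out of DataPipeline, the pickle path now doubles as a cache key: the first call scrapes and writes the file, and any later call with the same path short-circuits to pd.read_pickle without touching the network. A small usage sketch (the data/courses.pkl path is illustrative):

    from course_search.search_system.data_pipeline import DataPipeline

    pipeline = DataPipeline()

    # First run: no pickle on disk yet, so this scrapes and then saves
    df = pipeline.run_pipeline(save_path='data/courses.pkl')

    # Later runs: served straight from the pickle, no scraping
    df_cached = pipeline.run_pipeline(save_path='data/courses.pkl')

    # When the course catalogue has changed, bypass the cache explicitly
    df_fresh = pipeline.run_pipeline(save_path='data/courses.pkl', force_scrape=True)

Note that the pipeline's output is now the raw scraped DataFrame; turning it into embeddings and a FAISS index is the RAG system's job, as wired up in run.py above.
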
course_search/search_system/rag_system.py CHANGED
@@ -96,7 +96,7 @@ class RAGSystem:
             raise ValueError("FAISS index not initialized. Please load data first.")
 
         # Get query embedding
-        query_embedding = self.model.encode([query])[0]
+        query_embedding = self.model.encode([query], convert_to_numpy=True)
 
         # Get initial similarity scores
         D, I = self.index.search(query_embedding.reshape(1, -1), top_k * 2)
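
The one-line change keeps the query embedding as a 2-D NumPy batch instead of slicing off the first row: FAISS's search expects a 2-D float32 array, and convert_to_numpy=True guarantees a NumPy result even when the model is configured to return tensors, which makes the following reshape(1, -1) a safe no-op. A self-contained sketch of that shape contract (model name and documents are illustrative, not taken from this repo):

    import faiss
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim embeddings

    # Index two toy documents with an exact L2 index
    docs = ['intro to relational databases', 'machine learning basics']
    doc_emb = model.encode(docs, convert_to_numpy=True)  # shape (2, 384), float32
    index = faiss.IndexFlatL2(doc_emb.shape[1])
    index.add(doc_emb)

    # Query path mirrors the diff: keep the batch dimension, force NumPy
    query_emb = model.encode(['learn SQL'], convert_to_numpy=True)  # shape (1, 384)
    D, I = index.search(query_emb.reshape(1, -1), 2)  # distances and row indices
    print(I[0])  # nearest documents first, e.g. [0 1]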