Rohil Bansal committed
Commit 821284f · 1 Parent(s): 8778311

search improved

course_search/app/run.py CHANGED
@@ -10,6 +10,7 @@ if str(project_root) not in sys.path:
     sys.path.append(str(project_root))
 
 from course_search.search_system.data_pipeline import DataPipeline
+from course_search.search_system.rag_system import RAGSystem
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
@@ -32,10 +33,21 @@ def main():
     # Setup paths
     project_root, data_dir = setup_paths()
 
+    # Create cache directory
+    cache_dir = data_dir / 'cache'
+    cache_dir.mkdir(exist_ok=True)
+
     # Run data pipeline
     logger.info("Running data pipeline...")
     pipeline = DataPipeline()
-    pipeline.run_pipeline(save_path=str(data_dir / 'courses_with_embeddings.pkl'))
+    df = pipeline.run_pipeline(
+        save_path=str(data_dir / 'courses.pkl'),
+        force_scrape=False  # Set to True to force new scraping
+    )
+
+    # Initialize RAG system with caching
+    rag_system = RAGSystem()
+    rag_system.load_and_process_data(df, cache_dir=cache_dir)
 
     # Run Gradio app
     logger.info("Starting Gradio app...")
@@ -43,7 +55,7 @@ def main():
 
     if not gradio_path.exists():
         raise FileNotFoundError(f"Gradio app not found at: {gradio_path}")
-
+
     # Change to project root directory before running
     os.chdir(str(project_root))
 
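Taken together, the run.py changes turn startup into a three-step flow: load or scrape the course data, build the RAG index from it, then launch Gradio. Below is a minimal sketch of that flow; it assumes, as the diff does, that run_pipeline returns a DataFrame and that RAGSystem.load_and_process_data(df, cache_dir=...) exists as called above. The bootstrap wrapper name is hypothetical.

    from pathlib import Path

    from course_search.search_system.data_pipeline import DataPipeline
    from course_search.search_system.rag_system import RAGSystem

    def bootstrap(data_dir: Path) -> RAGSystem:
        # Hypothetical helper mirroring what main() now does before Gradio starts.
        # Reuse the pickled scrape when it exists; force_scrape=True refreshes it.
        df = DataPipeline().run_pipeline(
            save_path=str(data_dir / 'courses.pkl'),
            force_scrape=False,
        )

        # Derived artifacts (embeddings, index) live in a separate cache folder
        cache_dir = data_dir / 'cache'
        cache_dir.mkdir(exist_ok=True)

        rag = RAGSystem()
        rag.load_and_process_data(df, cache_dir=cache_dir)
        return rag
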
course_search/search_system/data_pipeline.py CHANGED
@@ -1,47 +1,35 @@
 import pandas as pd
-from typing import Optional
-from course_search.scraper.course_scraper import CourseScraper
-from course_search.search_system.embeddings import EmbeddingGenerator
-from course_search.search_system.vector_store import FAISSManager
+from pathlib import Path
 import logging
+from course_search.scraper.course_scraper import CourseScraper
 
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 class DataPipeline:
     def __init__(self):
         self.scraper = CourseScraper()
-        self.embedding_generator = EmbeddingGenerator()
-        self.vector_store = FAISSManager()
-
-    def run_pipeline(self, save_path: Optional[str] = None) -> pd.DataFrame:
-        """
-        Run the complete data pipeline: scraping, embedding generation, and vector storage
-        """
+
+    def run_pipeline(self, save_path: str, force_scrape: bool = False) -> pd.DataFrame:
+        """Run the data pipeline with option to use cached data"""
         try:
-            # Step 1: Scrape courses
-            logger.info("Starting course scraping...")
-            df = self.scraper.scrape_all_courses()
-            logger.info(f"Scraped {len(df)} courses successfully")
+            data_path = Path(save_path)
+
+            # Check if cached data exists
+            if not force_scrape and data_path.exists():
+                logger.info("Loading cached data...")
+                return pd.read_pickle(data_path)
 
-            # Step 2: Generate embeddings
-            logger.info("Generating embeddings...")
-            df = self.embedding_generator.add_embeddings_to_df(
-                df,
-                text_column='description'
-            )
-            logger.info("Embeddings generated successfully")
+            # If no cached data or force_scrape is True, scrape new data
+            logger.info("Scraping course data...")
+            df = self.scraper.scrape_all_courses()
 
-            # Step 3: Upload to FAISS
-            logger.info("Uploading to FAISS...")
-            self.vector_store.upsert_courses(df)
+            # Save the data
+            logger.info(f"Saving data to {save_path}")
+            df.to_pickle(save_path)
 
-            # Step 4: Save data if path provided
-            if save_path:
-                logger.info(f"Saving data to {save_path}")
-                df.to_pickle(save_path)
-
             return df
 
         except Exception as e:
-            logger.error(f"Error in pipeline: {str(e)}")
+            logger.error(f"Error in data pipeline: {str(e)}")
             raise
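
With embedding generation and the FAISS upload moved out of DataPipeline, the pickle path now doubles as a cache key: the first call scrapes and writes the file, and any later call with the same path short-circuits to pd.read_pickle without touching the network. A small usage sketch (the data/courses.pkl path is illustrative):

    from course_search.search_system.data_pipeline import DataPipeline

    pipeline = DataPipeline()

    # First run: no pickle on disk yet, so this scrapes and then saves
    df = pipeline.run_pipeline(save_path='data/courses.pkl')

    # Later runs: served straight from the pickle, no scraping
    df_cached = pipeline.run_pipeline(save_path='data/courses.pkl')

    # When the course catalogue has changed, bypass the cache explicitly
    df_fresh = pipeline.run_pipeline(save_path='data/courses.pkl', force_scrape=True)

Note that the pipeline's output is now the raw scraped DataFrame; turning it into embeddings and a FAISS index is the RAG system's job, as wired up in run.py above.
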
course_search/search_system/rag_system.py CHANGED
@@ -96,7 +96,7 @@ class RAGSystem:
             raise ValueError("FAISS index not initialized. Please load data first.")
 
         # Get query embedding
-        query_embedding = self.model.encode([query])[0]
+        query_embedding = self.model.encode([query], convert_to_numpy=True)
 
         # Get initial similarity scores
         D, I = self.index.search(query_embedding.reshape(1, -1), top_k * 2)
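
The one-line change keeps the query embedding as a 2-D NumPy batch instead of slicing off the first row: FAISS's search expects a 2-D float32 array, and convert_to_numpy=True guarantees a NumPy result even when the model is configured to return tensors, which makes the following reshape(1, -1) a safe no-op. A self-contained sketch of that shape contract (model name and documents are illustrative, not taken from this repo):

    import faiss
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim embeddings

    # Index two toy documents with an exact L2 index
    docs = ['intro to relational databases', 'machine learning basics']
    doc_emb = model.encode(docs, convert_to_numpy=True)  # shape (2, 384), float32
    index = faiss.IndexFlatL2(doc_emb.shape[1])
    index.add(doc_emb)

    # Query path mirrors the diff: keep the batch dimension, force NumPy
    query_emb = model.encode(['learn SQL'], convert_to_numpy=True)  # shape (1, 384)
    D, I = index.search(query_emb.reshape(1, -1), 2)  # distances and row indices
    print(I[0])  # nearest documents first, e.g. [0 1]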