import os
import json
import sys

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from transcription import run_whisper_transcription
from lc_utils import segments_to_documents
from logging_config import logger

# Sentence-transformers model used to embed transcript segments.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def build_index(media_path: str, out_dir: str = "data"):
    """Transcribe ``media_path`` and build a FAISS index in ``out_dir``.

    Args:
        media_path: Path to the audio/video file to transcribe.
        out_dir: Directory where the FAISS index and the raw
            ``segments.json`` are written. Created if it does not exist.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: If transcription produced no segments.
        Exception: Any other error from transcription, embedding, or
            indexing is logged with its traceback and re-raised.
    """
    try:
        logger.info(f"Starting transcription for {media_path}")

        # Ensure the output directory exists before writing any artifacts.
        os.makedirs(out_dir, exist_ok=True)

        # Run Whisper transcription.
        segments = run_whisper_transcription(media_path)
        if not segments:
            raise ValueError("No transcription segments were generated")
        logger.info(f"Transcription complete. Generated {len(segments)} segments.")

        # Convert transcription segments to LangChain documents.
        docs = segments_to_documents(segments, media_path)

        # Create embeddings and build the index.
        logger.info("Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
        logger.info("Building FAISS index...")
        store = FAISS.from_documents(docs, embeddings)

        # Save the index plus the raw segments alongside it.
        store.save_local(out_dir)
        segments_path = os.path.join(out_dir, "segments.json")
        # Explicit UTF-8 + ensure_ascii=False keep any non-ASCII transcript
        # text readable in the JSON file instead of \uXXXX escapes.
        with open(segments_path, "w", encoding="utf-8") as f:
            json.dump(segments, f, ensure_ascii=False)

        logger.info(f"Index successfully written to {out_dir}")
        return store
    except Exception as e:
        # logger.exception logs at ERROR level with the traceback attached —
        # the idiomatic replacement for logger.error(..., exc_info=True)
        # inside an except block.
        logger.exception(f"Error in build_index: {e}")
        raise


if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Fixed: the original usage string had lost its argument
        # placeholder ("Usage: python index_builder.py ").
        print("Usage: python index_builder.py <media_path>")
        sys.exit(1)
    build_index(sys.argv[1])