# ClipQuery — index_builder.py
# (initial commit 45b9636, author: maguid28)
import os, json
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from transcription import run_whisper_transcription
from lc_utils import segments_to_documents
from logging_config import logger
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
def build_index(media_path: str, out_dir: str = "data"):
    """Transcribe *media_path* and build a FAISS index in *out_dir*.

    Args:
        media_path: Path to the audio/video file to transcribe.
        out_dir: Directory where the FAISS index and ``segments.json``
            are written. Created if it does not exist.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: If transcription produced no segments.
        Exception: Any other failure is logged with a traceback and
            re-raised unchanged.
    """
    try:
        logger.info("Starting transcription for %s", media_path)
        # Ensure output directory exists before any writes.
        os.makedirs(out_dir, exist_ok=True)

        # Run Whisper transcription.
        segments = run_whisper_transcription(media_path)
        if not segments:
            raise ValueError("No transcription segments were generated")
        logger.info("Transcription complete. Generated %d segments.", len(segments))

        # Convert transcription segments into LangChain documents.
        docs = segments_to_documents(segments, media_path)

        # Create embeddings and build the index.
        logger.info("Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
        logger.info("Building FAISS index...")
        store = FAISS.from_documents(docs, embeddings)

        # Persist the index plus the raw segments alongside it.
        store.save_local(out_dir)
        segments_path = os.path.join(out_dir, "segments.json")
        # Explicit utf-8 + ensure_ascii=False keep non-ASCII transcript
        # text intact and human-readable on any platform default encoding.
        with open(segments_path, "w", encoding="utf-8") as f:
            json.dump(segments, f, ensure_ascii=False)
        logger.info("Index successfully written to %s", out_dir)
        return store
    except Exception as e:
        logger.error("Error in build_index: %s", e, exc_info=True)
        raise
if __name__ == "__main__":
    import sys

    # CLI entry point: require exactly one positional argument (media path).
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python index_builder.py <media_path>")
        sys.exit(1)
    build_index(cli_args[0])