import os
import json

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from transcription import run_whisper_transcription
from lc_utils import segments_to_documents
from logging_config import logger
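
# Embedding model for the index. all-MiniLM-L6-v2 is a small, fast
# sentence-transformers default; swap in a larger model here if retrieval
# quality matters more than indexing speed.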
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def build_index(media_path: str, out_dir: str = "data"):
    """Transcribe media_path and build a FAISS index in out_dir."""
    try:
        logger.info(f"Starting transcription for {media_path}")

        os.makedirs(out_dir, exist_ok=True)
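
        # Run Whisper over the media file. run_whisper_transcription is a
        # project helper; it is assumed to return a list of JSON-serializable
        # segment dicts (text plus start/end timestamps).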
        segments = run_whisper_transcription(media_path)
        if not segments:
            raise ValueError("No transcription segments were generated")

        logger.info(f"Transcription complete. Generated {len(segments)} segments.")
        docs = segments_to_documents(segments, media_path)
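
        # HuggingFaceEmbeddings downloads the sentence-transformers model on
        # first use, so the first run needs network access or a local cache.
        # (Newer LangChain releases move this class to langchain_huggingface.)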
        logger.info("Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

        logger.info("Building FAISS index...")
        store = FAISS.from_documents(docs, embeddings)
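
        # save_local writes index.faiss and index.pkl into out_dir; the raw
        # segments are saved alongside them so timestamps survive a reload.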
        store.save_local(out_dir)
        segments_path = os.path.join(out_dir, "segments.json")
        with open(segments_path, "w", encoding="utf-8") as f:
            json.dump(segments, f)

        logger.info(f"Index successfully written to {out_dir}")
        return store

    except Exception as e:
        logger.error(f"Error in build_index: {e}", exc_info=True)
        raise
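

# A minimal sketch of how a caller might reopen the index this script writes;
# load_index is not part of the original module. It assumes the same embedding
# model and that out_dir still holds FAISS's index.faiss/index.pkl files.
# Recent langchain_community releases require allow_dangerous_deserialization
# because index.pkl is unpickled; only load indexes you built yourself.
def load_index(out_dir: str = "data"):
    """Load a previously built FAISS index from out_dir."""
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    return FAISS.load_local(
        out_dir, embeddings, allow_dangerous_deserialization=True
    )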


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python index_builder.py <media_path>")
        sys.exit(1)
    build_index(sys.argv[1])