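"""Build a FAISS vector index from the Whisper transcription of a media file.

Transcribes the input with run_whisper_transcription, converts the segments to
documents, embeds them with a sentence-transformers model, and saves the FAISS
index plus the raw segments to the output directory.
"""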
import os, json

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from transcription import run_whisper_transcription
from lc_utils import segments_to_documents
from logging_config import logger

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def build_index(media_path: str, out_dir: str = "data"):
    """Transcribe media_path and build a FAISS index in out_dir."""
    try:
        logger.info(f"Starting transcription for {media_path}")

        # Ensure output directory exists
        os.makedirs(out_dir, exist_ok=True)

        # Run Whisper transcription
        segments = run_whisper_transcription(media_path)
        if not segments:
            raise ValueError("No transcription segments were generated")
        logger.info(f"Transcription complete. Generated {len(segments)} segments.")

        # Convert to documents
        docs = segments_to_documents(segments, media_path)

        # Create embeddings and build index
        logger.info("Creating embeddings...")
        embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

        logger.info("Building FAISS index...")
        store = FAISS.from_documents(docs, embeddings)

        # Save the index and segments
        store.save_local(out_dir)
        segments_path = os.path.join(out_dir, "segments.json")
        with open(segments_path, "w") as f:
            json.dump(segments, f)

        logger.info(f"Index successfully written to {out_dir}")
        return store

    except Exception as e:
        logger.error(f"Error in build_index: {str(e)}", exc_info=True)
        raise


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python index_builder.py <media_path>")
        sys.exit(1)

    build_index(sys.argv[1])