import logging
import random

from database_manager import get_transcriptions_for_language

# Module-level logger, named after this module so log records can be
# filtered/configured per module by the application.
logger = logging.getLogger(__name__)


class LazyTranscriptLoader:
    """Page through transcripts for one language, one batch at a time.

    Transcripts are fetched through ``get_transcriptions_for_language`` in
    batches of ``batch_size``. The IDs of every transcript fetched so far are
    passed back as ``exclude_ids`` so that each new batch contains only
    transcripts that have not been loaded yet.

    Only the most recently loaded batch is kept in memory, so backwards
    navigation (``move_prev``) is limited to the current batch.
    """

    def __init__(self, language, batch_size=50, randomize=False, domain=None, subdomain=None):
        """
        Initialize the lazy loader and eagerly load the first batch.

        Args:
            language (str): Language code for transcripts
            batch_size (int): Number of transcripts to load per batch
            randomize (bool): Whether to shuffle the transcripts; shuffling
                is applied within each batch, not across the whole set
            domain (str): Optional domain filter
            subdomain (str): Optional subdomain filter

        Any error raised while querying the total count is logged and leaves
        the loader empty (total count 0, no transcripts) instead of raising.
        """
        self.language = language
        self.batch_size = batch_size
        self.randomize = randomize
        self.domain = domain
        self.subdomain = subdomain

        self._loaded_transcripts = []  # the batch currently held in memory
        self._current_index = 0        # cursor position inside that batch
        self._total_count = 0          # total reported by the count query
        self._loaded_count = 0         # transcripts fetched across all batches
        self._exclude_ids = []         # IDs already fetched, excluded from later batches

        try:
            result = get_transcriptions_for_language(
                language_code=self.language,
                include_recorded=False,
                count_only=True,
                domain=self.domain,
                subdomain=self.subdomain,
            )
            self._total_count = result.get('count', 0)

            # Skip the first fetch entirely when there is nothing to load.
            if self._total_count > 0:
                self._load_next_batch()
        except Exception as e:
            logger.error(f"Error initializing LazyTranscriptLoader: {e}")
            self._total_count = 0
            self._loaded_transcripts = []

    def _load_next_batch(self):
        """Load the next batch of transcripts.

        Returns:
            bool: True when a non-empty batch was loaded and made current,
            False when no further transcripts were returned or the query
            failed. On False the loader's state is left untouched, so the
            previously loaded batch remains available to the caller.
        """
        try:
            transcripts = get_transcriptions_for_language(
                language_code=self.language,
                include_recorded=False,
                limit=self.batch_size,
                exclude_ids=self._exclude_ids,
                domain=self.domain,
                subdomain=self.subdomain,
            )

            # Nothing more to load: keep the current batch instead of
            # replacing it with an empty list, otherwise the caller could
            # neither stay on the last transcript nor step back from it.
            if not transcripts:
                return False

            # Collect the IDs before touching any state so that a malformed
            # row (missing 'id') cannot leave the counters half-updated.
            new_ids = [t['id'] for t in transcripts]

            if self.randomize:
                random.shuffle(transcripts)

            self._exclude_ids.extend(new_ids)
            self._loaded_count += len(transcripts)
            self._loaded_transcripts = transcripts
            self._current_index = 0
            return True
        except Exception as e:
            logger.error(f"Error loading transcript batch: {e}")
            return False

    def get_current(self):
        """Get the current transcript.

        Returns:
            The transcript at the cursor, or None when nothing is loaded.
            If the cursor has run past the end of the current batch, the
            next batch is loaded first; when that fails the cursor is
            clamped to the last transcript of the current batch.
        """
        if not self._loaded_transcripts:
            return None

        if self._current_index >= len(self._loaded_transcripts):
            if not self._load_next_batch():
                # No further data: fall back to the last loaded transcript.
                self._current_index = len(self._loaded_transcripts) - 1

        if 0 <= self._current_index < len(self._loaded_transcripts):
            return self._loaded_transcripts[self._current_index]

        return None

    def move_next(self):
        """Move to the next transcript and return it.

        Returns:
            The next transcript, loading a new batch when the current one is
            exhausted. Returns None when nothing is loaded or when there is
            no next transcript; in the latter case the cursor stays on the
            last transcript so ``get_current``/``move_prev`` keep working.
        """
        if not self._loaded_transcripts:
            return None

        self._current_index += 1

        if self._current_index >= len(self._loaded_transcripts):
            if not self._load_next_batch():
                # End of data (or load error): undo the advance and signal
                # "no next transcript" to the caller.
                self._current_index = len(self._loaded_transcripts) - 1
                return None

        return self.get_current()

    def move_prev(self):
        """Move to the previous transcript if possible.

        Returns:
            The previous transcript within the current batch, or None when
            nothing is loaded or the cursor is already at the first
            transcript of the batch (earlier batches are not retained).
        """
        if not self._loaded_transcripts:
            return None

        # Clamp a cursor that has run past the end before stepping back.
        if self._current_index >= len(self._loaded_transcripts):
            self._current_index = len(self._loaded_transcripts) - 1

        if self._current_index <= 0:
            return None

        self._current_index -= 1
        return self.get_current()

    def get_progress(self):
        """Get the current progress information.

        Returns:
            dict: ``'current'`` is the 1-based position of the cursor across
            all batches loaded so far (capped at the total, 0 when the total
            is 0), ``'total'`` is the count reported at construction time and
            ``'loaded'`` is the number of transcripts fetched so far.
        """
        total = self._total_count

        if total > 0:
            # Transcripts in earlier batches + 1-based offset in this batch.
            earlier = self._loaded_count - len(self._loaded_transcripts)
            current = min(earlier + self._current_index + 1, total)
        else:
            current = 0

        return {
            'current': current,
            'total': total,
            'loaded': self._loaded_count,
        }