from typing import List, Dict
import hashlib
import logging

# Configure logging
logger = logging.getLogger(__name__)

# Initialize a cache for tokenization results.
# Using a simple in-memory dictionary with a hash of the text as the key.
_tokenization_cache: Dict[str, List[str]] = {}

# Maximum cache size (number of entries)
MAX_CACHE_SIZE = 1000
try:
    from botok import WordTokenizer

    # Initialize the tokenizer once at the module level
    BOTOK_TOKENIZER = WordTokenizer()
except ImportError:
    # Handle the case where botok is not installed, even though it is a core
    # dependency for this app.
    BOTOK_TOKENIZER = None
    logger.error("botok library not found. Tokenization will fail.")
    # Optionally, raise an error here if botok is absolutely critical for the
    # app to even start:
    # raise ImportError("botok is required for tokenization. Please install it.")
def _get_text_hash(text: str) -> str:
    """
    Generate a hash for the input text to use as a cache key.

    Args:
        text: The input text to hash.

    Returns:
        The hexadecimal MD5 digest of the UTF-8 encoded input text.
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()
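
# A minimal sketch of the cache-key behaviour (illustration only, not executed
# by the module): MD5 is deterministic, and tokenize_texts below appends the
# mode to the text before hashing, so word- and syllable-level results for the
# same text never share a cache entry.
#
#     >>> _get_text_hash("བཀྲ་ཤིས་_word") == _get_text_hash("བཀྲ་ཤིས་_word")
#     True
#     >>> _get_text_hash("བཀྲ་ཤིས་_word") == _get_text_hash("བཀྲ་ཤིས་_syllable")
#     False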
def tokenize_texts(texts: List[str], mode: str = "syllable") -> List[List[str]]:
    """
    Tokenizes a list of raw Tibetan texts using botok, with caching for performance.

    This function maintains an in-memory cache of previously tokenized texts to
    avoid redundant processing of the same content. The cache uses MD5 hashes of
    the input texts (combined with the tokenization mode) as keys.

    Args:
        texts: List of raw text strings to tokenize.
        mode: Tokenization granularity, either "word" or "syllable". Invalid
            values fall back to "syllable".

    Returns:
        List of tokenized texts (each as a list of tokens).

    Raises:
        RuntimeError: If the botok tokenizer failed to initialize.
    """
    if BOTOK_TOKENIZER is None:
        # This case should ideally be handled more gracefully, perhaps by
        # preventing analysis if the tokenizer failed to load.
        raise RuntimeError(
            "Botok tokenizer failed to initialize. Cannot tokenize texts."
        )

    tokenized_texts_list = []
    if mode not in ["word", "syllable"]:
        logger.warning(f"Invalid tokenization mode: '{mode}'. Defaulting to 'syllable'.")
        mode = "syllable"
    # Process each text
    for text_content in texts:
        # Skip empty texts
        if not text_content.strip():
            tokenized_texts_list.append([])
            continue

        # Generate hash for cache lookup; include the mode so that word- and
        # syllable-level results for the same text are cached separately.
        cache_key_string = text_content + f"_{mode}"
        text_hash = _get_text_hash(cache_key_string)

        # Check if we have this text in cache
        if text_hash in _tokenization_cache:
            # Cache hit - use the cached tokens
            tokens = _tokenization_cache[text_hash]
            logger.debug(f"Cache hit for text hash {text_hash[:8]}... (mode: {mode})")
        else:
            # Cache miss - tokenize and store in cache
            try:
                current_tokens = []
                raw_botok_items = list(BOTOK_TOKENIZER.tokenize(text_content))
                if mode == "word":
                    for item_idx, w in enumerate(raw_botok_items):
                        if hasattr(w, 'text') and isinstance(w.text, str):
                            token_text = w.text.strip()
                            if token_text:  # Ensure the token is not empty or just whitespace
                                current_tokens.append(token_text)
                        # Items whose `text` attribute is missing or not a
                        # string are skipped; log them here if needed for debugging.
                    logger.debug(
                        f"WORD TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                        f"{current_tokens[:30]}"
                    )
                elif mode == "syllable":
                    # Original syllable extraction logic: prefer the `syls`
                    # attribute, falling back to `text` when it is absent.
                    for item_idx, w in enumerate(raw_botok_items):
                        if hasattr(w, 'syls') and w.syls:
                            for syl_idx, syl_item in enumerate(w.syls):
                                syllable_to_process = None
                                if isinstance(syl_item, str):
                                    syllable_to_process = syl_item
                                elif isinstance(syl_item, list):
                                    try:
                                        syllable_to_process = "".join(syl_item)
                                    except TypeError:
                                        logger.warning(
                                            f"Syllable item in w.syls was a list but could not be joined (non-string elements?): {syl_item} "
                                            f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                            f"for hash {text_hash[:8]}. Skipping this syllable."
                                        )
                                        continue
                                if syllable_to_process is not None:
                                    stripped_syl = syllable_to_process.strip()
                                    if stripped_syl:
                                        current_tokens.append(stripped_syl)
                                elif syl_item is not None:
                                    logger.warning(
                                        f"Unexpected type for syllable item (neither str nor list): {type(syl_item)} ('{str(syl_item)[:100]}') "
                                        f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                        f"for hash {text_hash[:8]}. Skipping this syllable."
                                    )
                        elif hasattr(w, 'text') and w.text:  # Fallback if no 'syls' but in syllable mode
                            if isinstance(w.text, str):
                                token_text = w.text.strip()
                                if token_text:
                                    current_tokens.append(token_text)  # Treat as a single syllable/token
                            elif w.text is not None:
                                logger.warning(
                                    f"Unexpected type for w.text (in syllable-mode fallback): {type(w.text)} ('{str(w.text)[:100]}') "
                                    f"for item {item_idx} (POS: {getattr(w, 'pos', 'N/A')}) "
                                    f"for hash {text_hash[:8]}. Skipping this token."
                                )
                    logger.debug(
                        f"SYLLABLE TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                        f"{current_tokens[:30]}"
                    )
                tokens = current_tokens
                # Store in cache if not empty
                if tokens:
                    # If the cache is full, evict the oldest entry (simple FIFO
                    # strategy: dicts preserve insertion order in Python 3.7+,
                    # so the first key is the least recently inserted).
                    if len(_tokenization_cache) >= MAX_CACHE_SIZE:
                        _tokenization_cache.pop(next(iter(_tokenization_cache)))
                    _tokenization_cache[text_hash] = tokens
                    logger.debug(f"Added tokens to cache with hash {text_hash[:8]}... (mode: {mode})")
            except Exception as e:
                logger.error(f"Error tokenizing text (mode: {mode}): {e}")
                tokens = []

        tokenized_texts_list.append(tokens)

    return tokenized_texts_list
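
if __name__ == "__main__":
    # Minimal usage sketch, assuming botok is installed; the sample string is
    # an arbitrary Tibetan greeting chosen purely for illustration.
    logging.basicConfig(level=logging.DEBUG)
    sample = ["བཀྲ་ཤིས་བདེ་ལེགས།"]
    print(tokenize_texts(sample, mode="syllable"))
    # A second call with the same text and mode is served from the cache.
    print(tokenize_texts(sample, mode="syllable"))
    # Word mode hashes to a different cache key, so the text is re-tokenized.
    print(tokenize_texts(sample, mode="word"))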