Spaces:

daniel-wojahn
/

ttm-webapp-hf

Sleeping

ttm-webapp-hf / pipeline /stopwords_lite_bo.py

Daniel Wojahn

feat(ui): Add preset-based analysis UI and Gradio 6 compatibility

75e8f38 13 days ago

2.01 kB

	# -*- coding: utf-8 -*-
	"""Module for reduced Tibetan stopwords.

	This file provides a less aggressive list of Tibetan stopwords for use in the Tibetan Text Metrics application.
	It contains only the most common particles and punctuation that are unlikely to carry significant meaning.
	"""

	# Initial set of stopwords with clear categories
	PARTICLES_INITIAL_LITE = [
	"ཏུ", "གི", "ཀྱི", "གིས", "ཀྱིས", "ཡིས", "ཀྱང", "སྟེ", "ཏེ", "ནོ", "ཏོ",
	"ཅིང", "ཅིག", "ཅེས", "ཞེས", "གྱིས", "ན",
	]

	MARKERS_AND_PUNCTUATION = ["༈", "།", "༎", "༑"]

	# Reduced list of particles and suffixes
	MORE_PARTICLES_SUFFIXES_LITE = [
	"འི་", "དུ་", "གིས་", "ཏེ", "གི་", "ཡི་", "ཀྱི་", "པས་", "ཀྱིས་", "ཡི", "ལ", "ནི་", "ར", "དུ",
	"ལས", "གྱིས་", "ས", "ཏེ་", "གྱི་", "དེ", "ཀ་", "སྟེ", "སྟེ་", "ངམ", "ཏོ", "དོ", "དམ་",
	"ན", "འམ་", "ལོ", "ཀྱིས", "བས་", "ཤིག", "གིས", "ཀི་", "ཡིས་", "གྱི", "གི"
	]

	# Combine all categorized lists
	_ALL_STOPWORDS_CATEGORIZED_LITE = (
	PARTICLES_INITIAL_LITE +
	MARKERS_AND_PUNCTUATION +
	MORE_PARTICLES_SUFFIXES_LITE
	)

	def _normalize_tibetan_token(token: str) -> str:
	"""
	Normalize a Tibetan token by removing trailing tsek (་).

	This ensures consistent matching regardless of whether the tokenizer
	preserves or strips the tsek.
	"""
	return token.rstrip('་')

	# Final flat list of unique stopwords, each with any trailing tsek removed.
	# dict.fromkeys drops duplicates while keeping the first-seen order of the
	# categorized lists; going through a set instead would make the list order
	# depend on per-process string hashing and differ between runs.
	TIBETAN_STOPWORDS_LITE = list(
	    dict.fromkeys(
	        _normalize_tibetan_token(sw) for sw in _ALL_STOPWORDS_CATEGORIZED_LITE
	    )
	)

	# Same stopwords as a set, for fast membership tests during Jaccard/LCS
	# filtering. Entries are tsek-normalized, so tokens should be normalized
	# the same way before lookup.
	TIBETAN_STOPWORDS_LITE_SET = set(TIBETAN_STOPWORDS_LITE)