import gradio as gr
from typing import Dict, List, Tuple

def get_stats(ids):
    """Count how often each adjacent pair of ids occurs.

    e.g. get_stats([1, 2, 3, 1, 2]) -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    """
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
    """Replace every occurrence of `pair` in `ids` with the new token `idx`."""
    newids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids
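
# Illustrative sanity checks for the two helpers above (added for clarity,
# not part of the training flow; safe to remove):
assert get_stats([1, 2, 3, 1, 2]) == {(1, 2): 2, (2, 3): 1, (3, 1): 1}
assert merge([1, 2, 3, 1, 2], (1, 2), 256) == [256, 3, 256]
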
# Read the Telugu text file and train BPE
def train_bpe(vocab_size: int = 350):
    """Learn BPE merges on the preprocessed Telugu corpus.

    Tokens 0-255 are the raw bytes; each merge adds one new token, so
    vocab_size - 256 merges are learned.
    """
    # Read the preprocessed Telugu text and start from its UTF-8 byte stream
    with open('telugu_preprocessed_file.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    ids = list(text.encode('utf-8'))
    # Train merges: repeatedly fuse the most frequent adjacent pair
    num_merges = vocab_size - 256
    merges = {}
    for i in range(num_merges):
        stats = get_stats(ids)
        if not stats:  # no more pairs to merge
            break
        pair = max(stats, key=stats.get)
        idx = 256 + i
        print(f"merging {pair} into a new token {idx}")  # optional: training progress
        ids = merge(ids, pair, idx)
        merges[pair] = idx
    return merges

# Train the tokenizer
merges = train_bpe()
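
# Note: training reruns on every startup. For larger corpora one could persist
# the learned merges and reload them instead (illustrative sketch; the file
# name 'telugu_merges.pkl' is an assumption, not part of this app):
#   import pickle
#   with open('telugu_merges.pkl', 'wb') as f:
#       pickle.dump(merges, f)
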
class OptimizedBPETokenizer:
    def __init__(self, merges: Dict[Tuple[int, int], int]):
        self.merges = merges
        # Reverse mapping: merged token id -> the pair of ids it replaced
        self.idx_to_pair = {idx: pair for pair, idx in merges.items()}

    def encode(self, text: str) -> List[int]:
        """Encode a string into token ids by applying the learned merges."""
        if not isinstance(text, str):
            return []
        ids = list(text.encode('utf-8'))
        # Apply merges in the order they were learned: always take the
        # mergeable pair with the lowest token id (earliest merge)
        while len(ids) >= 2:
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break  # no mergeable pair left
            ids = merge(ids, pair, self.merges[pair])
        return ids

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back into a string."""
        result = []
        for token in ids:
            result.extend(self._expand_token(token))
        # Invalid byte sequences (e.g. arbitrary token lists typed by the
        # user) produce replacement characters instead of raising
        return bytes(result).decode('utf-8', errors='replace')

    def _expand_token(self, token: int) -> List[int]:
        """Recursively expand a token into the raw bytes it represents."""
        if token < 256:
            return [token]
        first, second = self.idx_to_pair[token]
        return self._expand_token(first) + self._expand_token(second)

# Initialize tokenizer
tokenizer = OptimizedBPETokenizer(merges)
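
# Illustrative round-trip check: BPE is lossless, so decoding an encoding
# should reproduce the input exactly (safe to remove):
assert tokenizer.decode(tokenizer.encode("నమస్కారం")) == "నమస్కారం"
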
def encode_text(text: str) -> str:
    """Function to handle encoding"""
    if not text:
        return "Please enter text to encode"
    try:
        tokens = tokenizer.encode(text)
        return f"Encoded tokens: {tokens}\nToken count: {len(tokens)}"
    except Exception as e:
        return f"Encoding error: {str(e)}"

def decode_tokens(text: str) -> str:
    """Function to handle decoding"""
    if not text:
        return "Please enter tokens to decode"
    try:
        tokens = [int(x) for x in text.strip().strip('[]').split(',')]
        decoded_text = tokenizer.decode(tokens)
        return f"Decoded text: {decoded_text}"
    except Exception as e:
        return f"Error: Please provide comma-separated integer token ids. Details: {str(e)}"

# Create the Gradio interface
with gr.Blocks(title="Telugu BPE Tokenizer") as iface:
    gr.Markdown("# Telugu BPE Tokenizer")
    gr.Markdown("A byte-pair encoding tokenizer trained on Telugu text.")

    with gr.Row():
        # Encoding Section
        with gr.Column():
            gr.Markdown("### Encode Text")
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter Telugu text to encode..."
            )
            encode_button = gr.Button("Encode")
            encode_output = gr.Textbox(label="Encoding Result")

        # Decoding Section
        with gr.Column():
            gr.Markdown("### Decode Tokens")
            input_tokens = gr.Textbox(
                label="Input Tokens",
                placeholder="Enter comma-separated tokens (e.g., 256,257,258)"
            )
            decode_button = gr.Button("Decode")
            decode_output = gr.Textbox(label="Decoding Result")

    # Set up the button click events
    encode_button.click(
        fn=encode_text,
        inputs=input_text,
        outputs=encode_output
    )
    decode_button.click(
        fn=decode_tokens,
        inputs=input_tokens,
        outputs=decode_output
    )

    # Add examples
    with gr.Row():
        with gr.Column():
            gr.Examples(
                examples=[
                    ["నమస్కారం"],
                    ["తెలుగు భాష"],
                ],
                inputs=input_text,
                outputs=encode_output,
                fn=encode_text,
                label="Encoding Examples"
            )
        with gr.Column():
            gr.Examples(
                examples=[
                    ["256,257,258"],  # example tokens
                ],
                inputs=input_tokens,
                outputs=decode_output,
                fn=decode_tokens,
                label="Decoding Examples"
            )

if __name__ == "__main__":
    iface.launch()