import gradio as gr
from typing import Dict, List, Tuple

def get_stats(ids):
    """Count how often each adjacent pair of ids occurs.

    e.g. get_stats([1, 2, 3, 1, 2]) -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    """
    counts = {}
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
    """Replace every occurrence of `pair` in `ids` with the new token `idx`."""
    newids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids
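
# Illustrative sanity checks for the two helpers above (added for clarity,
# not part of the training flow; safe to remove):
assert get_stats([1, 2, 3, 1, 2]) == {(1, 2): 2, (2, 3): 1, (3, 1): 1}
assert merge([1, 2, 3, 1, 2], (1, 2), 256) == [256, 3, 256]
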
# Read the Telugu text file and train BPE
def train_bpe(vocab_size: int = 350):
    """Learn BPE merges on the preprocessed Telugu corpus.

    Tokens 0-255 are the raw bytes; each merge adds one new token, so
    vocab_size - 256 merges are learned.
    """
    # Read the preprocessed Telugu text and start from its UTF-8 byte stream
    with open('telugu_preprocessed_file.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    ids = list(text.encode('utf-8'))
    # Train merges: repeatedly fuse the most frequent adjacent pair
    num_merges = vocab_size - 256
    merges = {}
    for i in range(num_merges):
        stats = get_stats(ids)
        if not stats:  # no more pairs to merge
            break
        pair = max(stats, key=stats.get)
        idx = 256 + i
        print(f"merging {pair} into a new token {idx}")  # optional: training progress
        ids = merge(ids, pair, idx)
        merges[pair] = idx
    return merges

# Train the tokenizer
merges = train_bpe()
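
# Note: training reruns on every startup. For larger corpora one could persist
# the learned merges and reload them instead (illustrative sketch; the file
# name 'telugu_merges.pkl' is an assumption, not part of this app):
#   import pickle
#   with open('telugu_merges.pkl', 'wb') as f:
#       pickle.dump(merges, f)
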
class OptimizedBPETokenizer:
    def __init__(self, merges: Dict[Tuple[int, int], int]):
        self.merges = merges
        # Reverse mapping: merged token id -> the pair of ids it replaced
        self.idx_to_pair = {idx: pair for pair, idx in merges.items()}

    def encode(self, text: str) -> List[int]:
        """Encode a string into token ids by applying the learned merges."""
        if not isinstance(text, str):
            return []
        ids = list(text.encode('utf-8'))
        # Apply merges in the order they were learned: always take the
        # mergeable pair with the lowest token id (earliest merge)
        while len(ids) >= 2:
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float('inf')))
            if pair not in self.merges:
                break  # no mergeable pair left
            ids = merge(ids, pair, self.merges[pair])
        return ids

    def decode(self, ids: List[int]) -> str:
        """Decode token ids back into a string."""
        result = []
        for token in ids:
            result.extend(self._expand_token(token))
        # Invalid byte sequences (e.g. arbitrary token lists typed by the
        # user) produce replacement characters instead of raising
        return bytes(result).decode('utf-8', errors='replace')

    def _expand_token(self, token: int) -> List[int]:
        """Recursively expand a token into the raw bytes it represents."""
        if token < 256:
            return [token]
        first, second = self.idx_to_pair[token]
        return self._expand_token(first) + self._expand_token(second)

# Initialize tokenizer
tokenizer = OptimizedBPETokenizer(merges)
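
# Illustrative round-trip check: BPE is lossless, so decoding an encoding
# should reproduce the input exactly (safe to remove):
assert tokenizer.decode(tokenizer.encode("నమస్కారం")) == "నమస్కారం"
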
def encode_text(text: str) -> str:
    """Function to handle encoding"""
    if not text:
        return "Please enter text to encode"
    try:
        tokens = tokenizer.encode(text)
        return f"Encoded tokens: {tokens}\nToken count: {len(tokens)}"
    except Exception as e:
        return f"Encoding error: {str(e)}"

def decode_tokens(text: str) -> str:
    """Function to handle decoding"""
    if not text:
        return "Please enter tokens to decode"
    try:
        tokens = [int(x) for x in text.strip().strip('[]').split(',')]
        decoded_text = tokenizer.decode(tokens)
        return f"Decoded text: {decoded_text}"
    except Exception as e:
        return f"Error: Please provide comma-separated integer token ids. Details: {str(e)}"

# Create the Gradio interface
with gr.Blocks(title="Telugu BPE Tokenizer") as iface:
    gr.Markdown("# Telugu BPE Tokenizer")
    gr.Markdown("A byte-pair encoding tokenizer trained on Telugu text.")

    with gr.Row():
        # Encoding Section
        with gr.Column():
            gr.Markdown("### Encode Text")
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter Telugu text to encode..."
            )
            encode_button = gr.Button("Encode")
            encode_output = gr.Textbox(label="Encoding Result")

        # Decoding Section
        with gr.Column():
            gr.Markdown("### Decode Tokens")
            input_tokens = gr.Textbox(
                label="Input Tokens",
                placeholder="Enter comma-separated tokens (e.g., 256,257,258)"
            )
            decode_button = gr.Button("Decode")
            decode_output = gr.Textbox(label="Decoding Result")

    # Set up the button click events
    encode_button.click(
        fn=encode_text,
        inputs=input_text,
        outputs=encode_output
    )
    decode_button.click(
        fn=decode_tokens,
        inputs=input_tokens,
        outputs=decode_output
    )

    # Add examples
    with gr.Row():
        with gr.Column():
            gr.Examples(
                examples=[
                    ["నమస్కారం"],
                    ["తెలుగు భాష"],
                ],
                inputs=input_text,
                outputs=encode_output,
                fn=encode_text,
                label="Encoding Examples"
            )
        with gr.Column():
            gr.Examples(
                examples=[
                    ["256,257,258"],  # example tokens
                ],
                inputs=input_tokens,
                outputs=decode_output,
                fn=decode_tokens,
                label="Decoding Examples"
            )

if __name__ == "__main__":
    iface.launch()