import re
# Characters that the estimator classifies as punctuation.
_PUNCTUATION_CHARS = frozenset(',.;:!?()[]{}"\'`-_=+<>/@#$%^&*|\\')

# Fenced code blocks (non-greedy, may span lines) and http(s) URLs.
_CODE_BLOCK_RE = re.compile(r'```[\s\S]*?```')
_URL_RE = re.compile(r'https?://\S+')

# Heuristic weights, expressed in tokens.
_CHARS_PER_TOKEN = 4            # ordinary characters: 1 token per 4 chars
_SEPARATOR_WEIGHT = 0.25        # per whitespace/punctuation character
_DIGIT_WEIGHT = 0.5             # per digit character
_SHORT_WORD_WEIGHT = 0.5        # per word of at most _SHORT_WORD_MAX_LEN chars
_SHORT_WORD_MAX_LEN = 2
_CODE_BLOCK_BONUS = 5           # flat surcharge per fenced code block
_URL_BONUS = 4                  # flat surcharge per URL
_SAFETY_FACTOR = 1.1            # inflate the estimate by 10%


def simulate_token_count(text):
    """Return a rough, tokenizer-free estimate of the token count of *text*.

    The estimate is a weighted sum over simple character/word classes:

    * whitespace and punctuation characters (0.25 token each),
    * digit characters (0.5 token each),
    * whitespace-separated words of at most two characters (0.5 token each),
    * everything else (one token per four characters),
    * plus a flat surcharge of 5 per fenced code block and 4 per
      ``http://`` / ``https://`` URL.

    The sum is inflated by 10%, truncated to an integer and incremented by
    one, so any non-empty input yields at least 1.

    Args:
        text: The string to measure. Any falsy value (``None``, ``""``)
            is treated as empty.

    Returns:
        int: ``0`` for empty input, otherwise the estimated token count.
    """
    if not text:
        return 0

    # Pad every newline with spaces; each newline therefore contributes
    # three characters (all whitespace) to the measured length.
    text = text.replace('\n', ' \n ')

    separator_chars = sum(
        1 for ch in text if ch.isspace() or ch in _PUNCTUATION_CHARS
    )
    digit_chars = sum(1 for ch in text if ch.isdigit())
    short_words = sum(
        1 for word in text.split() if len(word) <= _SHORT_WORD_MAX_LEN
    )
    code_blocks = len(_CODE_BLOCK_RE.findall(text))
    urls = len(_URL_RE.findall(text))

    # Whatever is left after removing the specially weighted classes.
    # Note that ``short_words`` is a word count, not a character count.
    remaining = len(text) - separator_chars - digit_chars - short_words

    # NOTE: with _SEPARATOR_WEIGHT == 1 / _CHARS_PER_TOKEN, separator
    # characters currently cost the same as ordinary characters, so this
    # class has no net effect on the result; it is kept as a separate term
    # so that its weight can be tuned independently.
    estimate = (
        remaining / _CHARS_PER_TOKEN
        + separator_chars * _SEPARATOR_WEIGHT
        + digit_chars * _DIGIT_WEIGHT
        + short_words * _SHORT_WORD_WEIGHT
        + code_blocks * _CODE_BLOCK_BONUS
        + urls * _URL_BONUS
    )
    return int(estimate * _SAFETY_FACTOR) + 1