File size: 860 Bytes
92343a7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import re
def simulate_token_count(text):
    """
    Return a rough, heuristic estimate of the token count of ``text``.

    No real tokenizer is involved. The text is broken down into a few
    character/word categories, each category is given a fixed weight, and
    the weighted sum is padded by 10%, truncated to an int and increased
    by one.

    Weights:
        * whitespace or listed punctuation character -> 0.25
        * digit character                            -> 0.5
        * whitespace-separated word of length <= 2   -> 0.5
        * fenced block (three backticks ... three backticks) -> 5
        * http:// or https:// URL                    -> 4
        * every remaining character                  -> 0.25

    Falsy input (empty string, ``None``) yields 0.
    """
    if not text:
        return 0

    # Surround every newline with spaces. Splitting is unaffected, but each
    # newline now contributes three whitespace characters to the counts below.
    padded = text.replace('\n', ' \n ')

    punctuation = frozenset(',.;:!?()[]{}"\'`-_=+<>/@#$%^&*|\\')

    # One pass over the characters, tallying both character categories.
    separator_total = 0
    digit_total = 0
    for ch in padded:
        if ch.isspace() or ch in punctuation:
            separator_total += 1
        if ch.isdigit():
            digit_total += 1

    short_total = len([word for word in padded.split() if len(word) < 3])
    fence_total = sum(1 for _ in re.finditer(r'```[\s\S]*?```', padded))
    link_total = sum(1 for _ in re.finditer(r'https?://\S+', padded))

    # Note: the *number* of short words (not their character length) is
    # removed from the character total, so this remainder can be negative
    # for inputs made up mostly of very short words, digits or punctuation.
    remainder = len(padded) - separator_total - digit_total - short_total

    estimate = remainder / 4
    estimate += separator_total * 0.25
    estimate += digit_total * 0.5
    estimate += short_total * 0.5
    estimate += fence_total * 5
    estimate += link_total * 4

    # 10% safety margin, truncated, and always at least one token.
    return int(estimate * 1.1) + 1