#!/usr/bin/env python
# coding=utf-8
# Copyright 2025 The Footscray Coding Collective. All rights reserved.
"""
Text cleaning tool for smolagents.
Provides a Tool implementation that wraps the cleantext library for normalizing
text content with handling for various text transformation options.
"""
# Standard library imports
import logging
from typing import Any, Dict, Optional
# Third-party imports
from cleantext import clean
from smolagents import Tool
# Configure module logger
logger = logging.getLogger(__name__)


# pylint: disable=too-few-public-methods
class TextCleanerTool(Tool):
"""A simple text cleaner tool."""
name = "clean_text"
description = """This tool can be used to process messy user-generated content into
normalized text. It handles a variety of text transformation options,
such as fixing unicode errors, transliterating to closest ASCII,
lowercasing text, normalizing line breaks, removing punctuation,
replacing numbers with a token, and more."""
inputs = {
"text": {"type": "string", "description": "The input text to clean"},
"options": {
"type": "object",
"description": (
"Optional parameters for text cleaning. Available options: "
"fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
"no_emails, no_phone_numbers, no_numbers, no_digits, "
"no_currency_symbols, no_punct, no_emoji, lang"
),
"optional": True,
"nullable": True,
},
}
output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
"""
Clean text using the cleantext library with flexible options.
User-generated content on the Web and in social media is often dirty.
Preprocess your scraped data with `clean-text` to create a normalized
text representation. For instance, turn this corrupted input:
```
A bunch of \\u2018new\\u2019 references, including
[Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
»Yóù àré rïght <3!«
```
into this clean output:
```
A bunch of 'new' references, including [moana](<URL>).
"you are right <3!"
```
`clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
i.e., RegEx.
Usage of the cleantext API:
clean("some input",
fix_unicode=True, # fix various unicode errors
to_ascii=True, # transliterate to closest ASCII
lower=True, # lowercase text
no_line_breaks=False, # normalize line breaks
no_urls=False, # replace URLs with a token
no_emails=False, # replace email addresses with token
no_phone_numbers=False, # replace phone numbers with token
no_numbers=False, # replace all numbers with token
no_digits=False, # replace all digits with 0
no_currency_symbols=False, # replace currency symbols with token
no_punct=False, # remove punctuations
replace_with_punct="", # replacement for punctuation
replace_with_url="<URL>", # replacement for URLs
replace_with_email="<EMAIL>", # replacement for emails
replace_with_phone_number="<PHONE>", # replacement for phones
replace_with_number="<NUMBER>", # replacement for numbers
replace_with_digit="0", # replacement for digits
replace_with_currency_symbol="<CUR>", # currency replacement
lang="en" # language ('en' or 'de' supported)
        )

        Args:
            text: The input text to clean.
            options: Optional mapping of cleantext keyword arguments that
                override the defaults used by this tool, including the
                replace_with_* tokens.

        Returns:
            The cleaned text, or an error message string if cleaning fails.
        """
# Input validation
if not text:
return ""
if not isinstance(text, str):
try:
text = str(text)
except (ValueError, TypeError) as e:
logger.error("Failed to convert input to string: %s", e)
return f"Error: Could not process input of type {type(text)}"
# Default replacement tokens
replacements = {
"replace_with_url": "<URL>",
"replace_with_email": "<EMAIL>",
"replace_with_phone_number": "<PHONE>",
"replace_with_number": "<NUMBER>",
"replace_with_digit": "0",
"replace_with_currency_symbol": "<CUR>",
"replace_with_punct": "",
}
# Default options
default_options = {
"fix_unicode": True,
"to_ascii": True,
"lower": True,
"no_line_breaks": False,
"no_urls": False,
"no_emails": False,
"no_phone_numbers": False,
"no_numbers": False,
"no_digits": False,
"no_currency_symbols": False,
"no_punct": False,
"no_emoji": False,
"lang": "en",
}
        # Merge defaults with the fixed replacement tokens, then apply any
        # user-supplied options last so they take precedence (including the
        # replace_with_* tokens).
        params = {**default_options, **replacements}
        if options:
            params.update(options)
try:
# Apply cleantext with parameters
return clean(text, **params)
except (ValueError, TypeError, AttributeError) as e:
logger.error("Error cleaning text: %s", e)
return f"Error during text cleaning: {str(e)}"
__all__ = ["TextCleanerTool"]