OpenDeepResearch / scripts /text_cleaner_tool.py
Leonardo
Sync local Space with Hub
eaaf050 verified
#!/usr/bin/env python
# coding=utf-8
# Copyright 2025 The Footscray Coding Collective. All rights reserved.
"""
Text cleaning tool for smolagents.
Provides a Tool implementation that wraps the cleantext library for normalizing
text content with handling for various text transformation options.
"""
# Standard library imports
import logging
from typing import Any, Dict, Optional
# Third-party imports
from cleantext import clean
from smolagents import Tool
# Configure module logger
logger = logging.getLogger(__name__)
# pylint: disable=too-few-public-methods
class TextCleanerTool(Tool):
    """smolagents tool that normalizes messy text via ``cleantext.clean``.

    The tool exposes a single ``forward`` entry point taking the text to
    clean and an optional mapping of ``clean`` keyword overrides. Failures
    are reported as human-readable error strings rather than exceptions.
    """

    name = "clean_text"
    # Continuation lines are flush-left on purpose so that no source
    # indentation leaks into the runtime description string.
    description = """This tool can be used to process messy user-generated content into
normalized text. It handles a variety of text transformation options,
such as fixing unicode errors, transliterating to closest ASCII,
lowercasing text, normalizing line breaks, removing punctuation,
replacing numbers with a token, and more."""
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
                "no_emails, no_phone_numbers, no_numbers, no_digits, "
                "no_currency_symbols, no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Clean ``text`` with the cleantext library and return the result.

        For instance, this corrupted input:

        ```
        A bunch of \\u2018new\\u2019 references, including
        [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).
        »Yóù àré rïght <3!«
        ```

        becomes (with ``no_urls=True``):

        ```
        A bunch of 'new' references, including [moana](<URL>).
        "you are right <3!"
        ```

        Defaults passed to ``clean`` (each can be overridden via ``options``):

            fix_unicode=True                      # fix various unicode errors
            to_ascii=True                         # transliterate to closest ASCII
            lower=True                            # lowercase text
            no_line_breaks=False                  # normalize line breaks
            no_urls=False                         # replace URLs with a token
            no_emails=False                       # replace email addresses with a token
            no_phone_numbers=False                # replace phone numbers with a token
            no_numbers=False                      # replace all numbers with a token
            no_digits=False                       # replace all digits with 0
            no_currency_symbols=False             # replace currency symbols with a token
            no_punct=False                        # remove punctuation
            no_emoji=False
            lang="en"
            replace_with_punct=""                 # replacement for punctuation
            replace_with_url="<URL>"              # replacement for URLs
            replace_with_email="<EMAIL>"          # replacement for emails
            replace_with_phone_number="<PHONE>"   # replacement for phones
            replace_with_number="<NUMBER>"        # replacement for numbers
            replace_with_digit="0"                # replacement for digits
            replace_with_currency_symbol="<CUR>"  # replacement for currency

        Args:
            text: The text to clean. Falsy values yield ``""``; non-string
                values are converted with ``str()`` first.
            options: Optional mapping of ``clean`` keyword arguments that
                override the defaults above, including the
                ``replace_with_*`` tokens.

        Returns:
            The cleaned text, or a message starting with ``"Error"`` when
            the input, the options, or the cleaning call itself fails.
        """
        # Nothing to clean: empty string, None, and other falsy inputs.
        if not text:
            return ""

        if not isinstance(text, str):
            try:
                text = str(text)
            except (ValueError, TypeError) as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Single defaults table: behaviour flags plus replacement tokens.
        params: Dict[str, Any] = {
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        # User options are applied LAST so that caller-supplied values,
        # including replace_with_* tokens, win over the defaults.
        if options:
            try:
                params.update(options)
            except (ValueError, TypeError) as e:
                # e.g. a plain string or list was passed instead of a mapping;
                # report it the same way as every other failure in this tool.
                logger.error("Invalid options for text cleaning: %s", e)
                return (
                    "Error: options must be a mapping of option names to "
                    f"values, got {type(options).__name__}"
                )

        try:
            return clean(text, **params)
        except (ValueError, TypeError, AttributeError) as e:
            # Unknown option names surface here as a TypeError from clean().
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"
# Explicit public API: the tool class is the only name exported by
# `from <module> import *`.
__all__ = ["TextCleanerTool"]