#!/usr/bin/env python
# coding=utf-8
# Copyright 2025 The Footscray Coding Collective. All rights reserved.
"""
Text cleaning tool for smolagents.

Provides a Tool implementation that wraps the cleantext library for
normalizing text content with handling for various text transformation
options.
"""

# Standard library imports
import logging
from typing import Any, Dict, Optional

# Third-party imports
from cleantext import clean
from smolagents import Tool

# Configure module logger
logger = logging.getLogger(__name__)


# pylint: disable=too-few-public-methods
class TextCleanerTool(Tool):
    """Tool that normalizes messy text by delegating to ``cleantext.clean``.

    The class attributes below (``name``, ``description``, ``inputs``,
    ``output_type``) describe the tool; ``forward`` does the actual work.
    """

    name = "clean_text"
    description = (
        "This tool can be used to process messy user-generated content "
        "into normalized text. It handles a variety of text transformation "
        "options, such as fixing unicode errors, transliterating to closest "
        "ASCII, lowercasing text, normalizing line breaks, removing "
        "punctuation, replacing numbers with a token, and more."
    )
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
                "no_emails, no_phone_numbers, no_numbers, no_digits, "
                "no_currency_symbols, no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Clean ``text`` with ``cleantext.clean`` and return the result.

        Every keyword below is forwarded to ``clean``. The defaults used by
        this tool are:

        ============================  =======  ================================
        option                        default  effect
        ============================  =======  ================================
        fix_unicode                   True     fix various unicode errors
        to_ascii                      True     transliterate to closest ASCII
        lower                         True     lowercase text
        no_line_breaks                False    normalize line breaks
        no_urls                       False    replace URLs with a token
        no_emails                     False    replace email addresses
        no_phone_numbers              False    replace phone numbers
        no_numbers                    False    replace all numbers
        no_digits                     False    replace all digits
        no_currency_symbols           False    replace currency symbols
        no_punct                      False    remove punctuation
        no_emoji                      False    (forwarded to ``clean`` as-is)
        lang                          "en"     language ('en' or 'de')
        replace_with_url              ""       replacement for URLs
        replace_with_email            ""       replacement for emails
        replace_with_phone_number     ""       replacement for phone numbers
        replace_with_number           ""       replacement for numbers
        replace_with_digit            "0"      replacement for digits
        replace_with_currency_symbol  ""       replacement for currency
        replace_with_punct            ""       replacement for punctuation
        ============================  =======  ================================

        Args:
            text: The text to clean. Non-string values are converted with
                ``str()``; any falsy value yields an empty string.
            options: Optional overrides for the defaults above, as a mapping
                (or anything ``dict()`` accepts) of option name to value.
                Values given here take precedence over every default,
                including the ``replace_with_*`` tokens.

        Returns:
            The cleaned text. On failure this method does not raise; it
            returns a human-readable message starting with ``"Error"``.
        """
        # Falsy input (None, "", 0, empty containers) has nothing to clean.
        if not text:
            return ""

        if not isinstance(text, str):
            try:
                text = str(text)
            except (ValueError, TypeError) as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Tool defaults: cleaning switches first, then replacement tokens.
        params: Dict[str, Any] = {
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
            "replace_with_url": "",
            "replace_with_email": "",
            "replace_with_phone_number": "",
            "replace_with_number": "",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "",
            "replace_with_punct": "",
        }

        if options:
            # Coerce inside a try so a malformed ``options`` value (a plain
            # string, a number, ...) is reported like every other failure
            # instead of escaping as an uncaught exception.
            try:
                overrides = dict(options)
            except (ValueError, TypeError) as e:
                logger.error("Invalid cleaning options: %s", e)
                return (
                    "Error: options must be a mapping of option names to "
                    f"values, got {type(options)}"
                )
            # User overrides are applied last so caller-supplied
            # ``replace_with_*`` tokens are not clobbered by the defaults.
            params.update(overrides)

        try:
            # Unknown option names surface here as TypeError from ``clean``.
            return clean(text, **params)
        except (ValueError, TypeError, AttributeError) as e:
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"


__all__ = ["TextCleanerTool"]