# OpenDeepResearch

# Runtime error

# File size: 5,555 Bytes

#!/usr/bin/env python
# coding=utf-8
# Copyright 2025 The Footscray Coding Collective. All rights reserved.
"""
Text cleaning tool for smolagents.

Provides a Tool implementation that wraps the cleantext library for normalizing
text content with handling for various text transformation options.
"""

# Standard library imports
import logging
from typing import Any, Dict, Optional

# Third-party imports
from cleantext import clean
from smolagents import Tool

# Module-level logger, named after this module; TextCleanerTool.forward uses it
# to report input-conversion and cleaning failures.
logger = logging.getLogger(__name__)


# pylint: disable=too-few-public-methods
class TextCleanerTool(Tool):
    """Tool that normalizes messy text by delegating to ``cleantext.clean``.

    The class attributes below declare the tool's name, description, input
    schema (``text`` plus an optional ``options`` object) and output type.
    ``forward`` reports failures by returning an ``"Error..."`` string instead
    of raising, so a caller always receives a string.
    """

    name = "clean_text"
    description = """This tool can be used to process messy user-generated content into
    normalized text. It handles a variety of text transformation options,
    such as fixing unicode errors, transliterating to closest ASCII,
    lowercasing text, normalizing line breaks, removing punctuation,
    replacing numbers with a token, and more."""
    # Input schema: ``text`` is required; ``options`` may be omitted or null.
    inputs = {
        "text": {"type": "string", "description": "The input text to clean"},
        "options": {
            "type": "object",
            "description": (
                "Optional parameters for text cleaning. Available options: "
                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
                "no_emails, no_phone_numbers, no_numbers, no_digits, "
                "no_currency_symbols, no_punct, no_emoji, lang"
            ),
            "optional": True,
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """
        Clean text using the cleantext library with flexible options.

        User-generated content on the Web and in social media is often dirty.
        Preprocess your scraped data with `clean-text` to create a normalized
        text representation. For instance, turn this corrupted input:

        ```
        A bunch of \\u2018new\\u2019 references, including
        [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29).


        »Yóù àré     rïght &lt;3!«
        ```

        into this clean output:

        ```
        A bunch of 'new' references, including [moana](<URL>).

        "you are right <3!"
        ```

        `clean-text` uses ftfy, unidecode and numerous hand-crafted rules,
        i.e., RegEx.

        Defaults passed to ``clean`` (each can be overridden via ``options``):
            fix_unicode=True           # fix various unicode errors
            to_ascii=True              # transliterate to closest ASCII
            lower=True                 # lowercase text
            no_line_breaks=False       # normalize line breaks
            no_urls=False              # replace URLs with a token
            no_emails=False            # replace email addresses with token
            no_phone_numbers=False     # replace phone numbers with token
            no_numbers=False           # replace all numbers with token
            no_digits=False            # replace all digits with 0
            no_currency_symbols=False  # replace currency symbols with token
            no_punct=False             # remove punctuations
            no_emoji=False
            replace_with_punct=""
            replace_with_url="<URL>"
            replace_with_email="<EMAIL>"
            replace_with_phone_number="<PHONE>"
            replace_with_number="<NUMBER>"
            replace_with_digit="0"
            replace_with_currency_symbol="<CUR>"
            lang="en"                  # language ('en' or 'de' supported)

        Args:
            text: The text to clean. A falsy value yields ``""``; a non-string
                value is converted with ``str()`` before cleaning.
            options: Optional overrides for the defaults listed above. Any
                key given here, including the ``replace_with_*`` tokens, takes
                precedence over the default. A falsy value means "no
                overrides".

        Returns:
            The cleaned text, or a message starting with ``"Error"`` when the
            input cannot be converted, ``options`` is not a mapping, or
            ``clean`` rejects the arguments.
        """
        # Input validation
        if not text:
            return ""

        if not isinstance(text, str):
            try:
                text = str(text)
            except (ValueError, TypeError) as e:
                logger.error("Failed to convert input to string: %s", e)
                return f"Error: Could not process input of type {type(text)}"

        # Normalize caller overrides up front so a malformed ``options`` value
        # (e.g. a plain string) produces an error message like every other
        # failure path here, instead of an uncaught exception.
        overrides: Dict[str, Any] = {}
        if options:
            try:
                overrides = dict(options)
            except (ValueError, TypeError) as e:
                logger.error("Invalid options of type %s: %s", type(options), e)
                return (
                    "Error: options must be a mapping of option names to "
                    f"values, got {type(options).__name__}"
                )

        # Defaults for every keyword forwarded to ``clean``: behavior flags
        # first, then the replacement tokens.
        defaults: Dict[str, Any] = {
            "fix_unicode": True,
            "to_ascii": True,
            "lower": True,
            "no_line_breaks": False,
            "no_urls": False,
            "no_emails": False,
            "no_phone_numbers": False,
            "no_numbers": False,
            "no_digits": False,
            "no_currency_symbols": False,
            "no_punct": False,
            "no_emoji": False,
            "lang": "en",
            "replace_with_url": "<URL>",
            "replace_with_email": "<EMAIL>",
            "replace_with_phone_number": "<PHONE>",
            "replace_with_number": "<NUMBER>",
            "replace_with_digit": "0",
            "replace_with_currency_symbol": "<CUR>",
            "replace_with_punct": "",
        }

        # Caller overrides are merged last so they win over every default,
        # replacement tokens included.
        params = {**defaults, **overrides}

        try:
            # Apply cleantext with parameters; unknown or non-string option
            # names surface here as TypeError and are reported to the caller.
            return clean(text, **params)
        except (ValueError, TypeError, AttributeError) as e:
            logger.error("Error cleaning text: %s", e)
            return f"Error during text cleaning: {str(e)}"


# Public API of this module: only the tool class is exported via ``import *``.
__all__ = ["TextCleanerTool"]