OpenDeepResearch

Runtime error

OpenDeepResearch / scripts / text_cleaner_tool.py

Leonardo

Sync local Space with Hub

eaaf050 verified 9 months ago

5.56 kB

	#!/usr/bin/env python
	# coding=utf-8
	# Copyright 2025 The Footscray Coding Collective. All rights reserved.
	"""
	Text cleaning tool for smolagents.

	Provides a Tool implementation that wraps the cleantext library for normalizing
	text content with handling for various text transformation options.
	"""

	# Standard library imports
	import logging
	from typing import Any, Dict, Optional

	# Third-party imports
	from cleantext import clean
	from smolagents import Tool

	# Module-level logger named after this module (``__name__``); no handlers or
	# levels are configured here.
	logger = logging.getLogger(__name__)


	# pylint: disable=too-few-public-methods
	class TextCleanerTool(Tool):
	    """Tool that normalizes messy text by delegating to ``cleantext.clean``.

	    The class attributes below (``name``, ``description``, ``inputs``,
	    ``output_type``) declare the tool's interface; ``forward`` does the work.
	    """

	    name = "clean_text"
	    description = """This tool can be used to process messy user-generated content into
	normalized text. It handles a variety of text transformation options,
	such as fixing unicode errors, transliterating to closest ASCII,
	lowercasing text, normalizing line breaks, removing punctuation,
	replacing numbers with a token, and more."""
	    inputs = {
	        "text": {"type": "string", "description": "The input text to clean"},
	        "options": {
	            "type": "object",
	            "description": (
	                "Optional parameters for text cleaning. Available options: "
	                "fix_unicode, to_ascii, lower, no_line_breaks, no_urls, "
	                "no_emails, no_phone_numbers, no_numbers, no_digits, "
	                "no_currency_symbols, no_punct, no_emoji, lang"
	            ),
	            "optional": True,
	            "nullable": True,
	        },
	    }
	    output_type = "string"

	    def forward(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
	        """
	        Clean ``text`` with the cleantext library and return the result.

	        The call made is ``clean(text, **params)`` where ``params`` is built
	        from the defaults listed below, overridden by any keys in ``options``.

	        Boolean switches and their defaults:
	            fix_unicode=True            fix various unicode errors
	            to_ascii=True               transliterate to closest ASCII
	            lower=True                  lowercase text
	            no_line_breaks=False        normalize line breaks
	            no_urls=False               replace URLs with a token
	            no_emails=False             replace email addresses with a token
	            no_phone_numbers=False      replace phone numbers with a token
	            no_numbers=False            replace all numbers with a token
	            no_digits=False             replace all digits with 0
	            no_currency_symbols=False   replace currency symbols with a token
	            no_punct=False              remove punctuation
	            no_emoji=False
	            lang="en"                   language ('en' or 'de' supported)

	        Replacement tokens and their defaults:
	            replace_with_url="<URL>"
	            replace_with_email="<EMAIL>"
	            replace_with_phone_number="<PHONE>"
	            replace_with_number="<NUMBER>"
	            replace_with_digit="0"
	            replace_with_currency_symbol="<CUR>"
	            replace_with_punct=""

	        Args:
	            text: The text to clean. Falsy input yields ``""``; a non-string
	                value is converted with ``str()`` first.
	            options: Optional dict of keyword arguments for ``clean``. Any
	                key given here takes precedence over the defaults above,
	                including the ``replace_with_*`` tokens.

	        Returns:
	            The cleaned text, ``""`` for falsy input, or a human-readable
	            ``"Error ..."`` string when the input or options cannot be
	            processed.
	        """
	        # Nothing to clean: falsy input short-circuits to an empty string.
	        if not text:
	            return ""

	        # Coerce non-string input instead of failing outright.
	        if not isinstance(text, str):
	            try:
	                text = str(text)
	            except (ValueError, TypeError) as e:
	                logger.error("Failed to convert input to string: %s", e)
	                return f"Error: Could not process input of type {type(text)}"

	        # A truthy non-dict ``options`` cannot be merged as keyword arguments;
	        # report it the same way other failures are reported.
	        if options and not isinstance(options, dict):
	            logger.error("Invalid options of type %s; expected a dict", type(options))
	            return f"Error: options must be an object/dict, got {type(options)}"

	        # Default replacement tokens
	        replacements = {
	            "replace_with_url": "<URL>",
	            "replace_with_email": "<EMAIL>",
	            "replace_with_phone_number": "<PHONE>",
	            "replace_with_number": "<NUMBER>",
	            "replace_with_digit": "0",
	            "replace_with_currency_symbol": "<CUR>",
	            "replace_with_punct": "",
	        }

	        # Default options
	        default_options = {
	            "fix_unicode": True,
	            "to_ascii": True,
	            "lower": True,
	            "no_line_breaks": False,
	            "no_urls": False,
	            "no_emails": False,
	            "no_phone_numbers": False,
	            "no_numbers": False,
	            "no_digits": False,
	            "no_currency_symbols": False,
	            "no_punct": False,
	            "no_emoji": False,
	            "lang": "en",
	        }

	        # Merge into one keyword dict. This must be a dict merge (``**``), not
	        # a set literal of two dicts, which raises "unhashable type: 'dict'".
	        params: Dict[str, Any] = {**default_options, **replacements}

	        # User options are applied last so they override every default,
	        # including the replacement tokens.
	        if options:
	            params.update(options)

	        try:
	            # Apply cleantext with parameters; an unknown option name surfaces
	            # as a TypeError from the call and is reported below.
	            return clean(text, **params)
	        except (ValueError, TypeError, AttributeError) as e:
	            logger.error("Error cleaning text: %s", e)
	            return f"Error during text cleaning: {str(e)}"


	# Public API of this module: only the tool class is exported on ``import *``.
	__all__ = ["TextCleanerTool"]