|
|
|
# Startup banner; it is emitted before the imports below are executed.
print("translate100 start ...")
|
|
|
from flask import Flask, request, jsonify, send_from_directory |
|
from flask_cors import CORS |
|
from transformers import M2M100ForConditionalGeneration |
|
from tokenization_small100 import SMALL100Tokenizer |
|
import os |
|
import torch |
|
import json |
|
import logging |
|
import time |
|
import sys |
|
import dataclasses |
|
|
|
# Expose a ``__version__`` attribute on the stdlib ``dataclasses`` module.
# NOTE(review): presumably a dependency reads ``dataclasses.__version__``
# (the old PyPI backport provided one) -- confirm which one needs it.
setattr(dataclasses, '__version__', '0.8')

# Only surface errors from the "transformers" logger.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

# ``os.cpu_count()`` may return None, hence the fallback to a single core.
cpu_count = os.cpu_count() or 1
os.environ["OMP_NUM_THREADS"] = str(cpu_count)
os.environ["MKL_NUM_THREADS"] = str(cpu_count)
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Reuse the None-safe value computed above: handing ``os.cpu_count()`` to
# torch directly fails when the core count cannot be determined.
torch.set_num_threads(cpu_count)
torch.set_num_interop_threads(1)
|
|
|
app = Flask(__name__)

# Only surface errors from the "werkzeug" logger.
werkzeugLog = logging.getLogger('werkzeug')
werkzeugLog.setLevel(logging.ERROR)

# Emit non-ASCII text (e.g. Chinese) verbatim instead of \uXXXX escapes.
# The config key is only honoured by older Flask releases; newer releases
# read the setting from the JSON provider object, so set both.
app.config['JSON_AS_ASCII'] = False
if hasattr(app, 'json'):
    app.json.ensure_ascii = False

CORS(app)
|
|
|
|
|
def _env_flag(name, default):
    """Return True when env var *name* is 'true', '1' or 'yes' (any case)."""
    return os.environ.get(name, default).lower() in ('true', '1', 'yes')


# Runtime switches, all taken from the environment.
quick = _env_flag('TRANSLATE100_QUICK', 'False')
useGPU = _env_flag('TRANSLATE100_USE_GPU', 'True')
port = os.environ.get('TRANSLATE100_PORT', '80')

print("TRANSLATE100_USE_GPU: "+str(useGPU))
print("TRANSLATE100_QUICK: "+str(quick))
print("TRANSLATE100_PORT: "+str(port))

# GPU use stays enabled only when CUDA is actually usable; otherwise fall
# back to the CPU and tell the operator.
cuda_available = hasattr(torch, 'cuda') and torch.cuda.is_available()
if useGPU and not cuda_available:
    print("TRANSLATE100_USE_GPU is true, but current cuda is not support, use CPU instead , set TRANSLATE100_USE_GPU = false")
    useGPU = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Language table served by /language.json and used to resolve the "to"
# parameter of /translate.json. Each entry holds:
#   "id"        - language id sent by the client (matched in translatejsToM2m)
#   "name"      - display name, written in Chinese
#   "serviceId" - language code handed to the tokenizer as the target language
language_dict_translatejs = [
    {"id": "afrikaans", "name": "南非荷兰语", "serviceId": "af"},
    {"id": "amharic", "name": "阿姆哈拉语", "serviceId": "am"},
    {"id": "arabic", "name": "阿拉伯语", "serviceId": "ar"},
    {"id": "asturian", "name": "阿斯图里亚斯语", "serviceId": "ast"},
    {"id": "azerbaijani", "name": "阿塞拜疆语", "serviceId": "az"},
    {"id": "bashkir", "name": "巴什基尔语", "serviceId": "ba"},
    {"id": "belarusian", "name": "白俄罗斯语", "serviceId": "be"},
    {"id": "bulgarian", "name": "保加利亚语", "serviceId": "bg"},
    {"id": "bengali", "name": "孟加拉语", "serviceId": "bn"},
    {"id": "breton", "name": "布列塔尼语", "serviceId": "br"},
    {"id": "bosnian", "name": "波斯尼亚语", "serviceId": "bs"},
    {"id": "cebuano", "name": "宿务语", "serviceId": "ceb"},
    {"id": "czech", "name": "捷克语", "serviceId": "cs"},
    {"id": "welsh", "name": "威尔士语", "serviceId": "cy"},
    {"id": "danish", "name": "丹麦语", "serviceId": "da"},
    {"id": "deutsch", "name": "德语", "serviceId": "de"},
    {"id": "greek", "name": "希腊语", "serviceId": "el"},
    {"id": "english", "name": "英语", "serviceId": "en"},
    {"id": "spanish", "name": "西班牙语", "serviceId": "es"},
    {"id": "estonian", "name": "爱沙尼亚语", "serviceId": "et"},
    {"id": "persian", "name": "波斯语", "serviceId": "fa"},
    {"id": "nigerian_fulfulde", "name": "富拉语", "serviceId": "ff"},
    {"id": "finnish", "name": "芬兰语", "serviceId": "fi"},
    {"id": "french", "name": "法语", "serviceId": "fr"},
    {"id": "irish", "name": "爱尔兰语", "serviceId": "ga"},
    {"id": "scottish_gaelic", "name": "苏格兰盖尔语", "serviceId": "gd"},
    {"id": "galician", "name": "加利西亚语", "serviceId": "gl"},
    {"id": "gujarati", "name": "古吉拉特语", "serviceId": "gu"},
    {"id": "hausa", "name": "豪萨语", "serviceId": "ha"},
    {"id": "hebrew", "name": "希伯来语", "serviceId": "he"},
    {"id": "hindi", "name": "印地语", "serviceId": "hi"},
    {"id": "croatian", "name": "克罗地亚语", "serviceId": "hr"},
    {"id": "haitian_creole", "name": "海地克里奥尔语", "serviceId": "ht"},
    {"id": "hungarian", "name": "匈牙利语", "serviceId": "hu"},
    {"id": "armenian", "name": "亚美尼亚语", "serviceId": "hy"},
    {"id": "indonesian", "name": "印尼语", "serviceId": "id"},
    {"id": "igbo", "name": "伊博语", "serviceId": "ig"},
    {"id": "ilocano", "name": "伊洛卡语", "serviceId": "ilo"},
    {"id": "icelandic", "name": "冰岛语", "serviceId": "is"},
    {"id": "italian", "name": "意大利语", "serviceId": "it"},
    {"id": "japanese", "name": "日语", "serviceId": "ja"},
    {"id": "javanese", "name": "爪哇语", "serviceId": "jv"},
    {"id": "georgian", "name": "格鲁吉亚语", "serviceId": "ka"},
    {"id": "kazakh", "name": "哈萨克语", "serviceId": "kk"},
    {"id": "khmer", "name": "中部高棉语", "serviceId": "km"},
    {"id": "kannada", "name": "卡纳达语", "serviceId": "kn"},
    {"id": "korean", "name": "韩语", "serviceId": "ko"},
    {"id": "luxembourgish", "name": "卢森堡语", "serviceId": "lb"},

    {"id": "lingala", "name": "林加拉语", "serviceId": "ln"},
    {"id": "lao", "name": "老挝语", "serviceId": "lo"},
    {"id": "lithuanian", "name": "立陶宛语", "serviceId": "lt"},
    {"id": "latvian", "name": "拉脱维亚语", "serviceId": "lv"},
    {"id": "macedonian", "name": "马其顿语", "serviceId": "mk"},
    {"id": "malayalam", "name": "马拉雅拉姆语", "serviceId": "ml"},
    {"id": "mongolian", "name": "蒙古语", "serviceId": "mn"},
    {"id": "marathi", "name": "马拉地语", "serviceId": "mr"},
    {"id": "malay", "name": "马来语", "serviceId": "ms"},
    {"id": "burmese", "name": "缅甸语", "serviceId": "my"},
    {"id": "nepali", "name": "尼泊尔语", "serviceId": "ne"},
    {"id": "norwegian", "name": "挪威语", "serviceId": "no"},
    {"id": "occitan", "name": "奥克语(1500 年后)", "serviceId": "oc"},
    {"id": "punjabi", "name": "旁遮普语", "serviceId": "pa"},
    {"id": "polish", "name": "波兰语", "serviceId": "pl"},
    {"id": "pashto", "name": "普什图语", "serviceId": "ps"},
    {"id": "portuguese", "name": "葡萄牙语", "serviceId": "pt"},
    {"id": "russian", "name": "俄语", "serviceId": "ru"},
    {"id": "sindhi", "name": "信德语", "serviceId": "sd"},
    {"id": "singapore", "name": "僧伽罗语", "serviceId": "si"},
    {"id": "slovak", "name": "斯洛伐克语", "serviceId": "sk"},
    {"id": "slovene", "name": "斯洛文尼亚语", "serviceId": "sl"},
    {"id": "somali", "name": "索马里语", "serviceId": "so"},
    {"id": "albanian", "name": "阿尔巴尼亚语", "serviceId": "sq"},
    {"id": "serbian", "name": "塞尔维亚语", "serviceId": "sr"},
    {"id": "sundanese", "name": "巽他语", "serviceId": "su"},
    {"id": "swedish", "name": "瑞典语", "serviceId": "sv"},
    {"id": "congo_swahili", "name": "斯瓦希里语", "serviceId": "sw"},
    {"id": "tamil", "name": "泰米尔语", "serviceId": "ta"},
    {"id": "thai", "name": "泰语", "serviceId": "th"},
    {"id": "tagalog", "name": "他加禄语", "serviceId": "tl"},
    {"id": "tswana", "name": "茨瓦纳语", "serviceId": "tn"},
    {"id": "turkish", "name": "土耳其语", "serviceId": "tr"},
    {"id": "ukrainian", "name": "乌克兰语", "serviceId": "uk"},
    {"id": "urdu", "name": "乌尔都语", "serviceId": "ur"},
    {"id": "uzbek", "name": "乌兹别克语", "serviceId": "uz"},
    {"id": "vietnamese", "name": "越南语", "serviceId": "vi"},
    {"id": "wolof", "name": "沃洛夫语", "serviceId": "wo"},
    {"id": "afrikaans_xhosa", "name": "科萨语", "serviceId": "xh"},
    {"id": "yiddish", "name": "意第绪语", "serviceId": "yi"},
    {"id": "yoruba", "name": "约鲁巴语", "serviceId": "yo"},
    {"id": "chinese_simplified", "name": "简体中文", "serviceId": "zh"},
    {"id": "south_african_zulu", "name": "祖鲁语", "serviceId": "zu"},
    {"id": "catalan", "name": "加泰罗尼亚语", "serviceId": "ca"},
    {"id": "frisian", "name": "弗里西语", "serviceId": "fy"},
    {"id": "malagasy", "name": "马达加斯加语", "serviceId": "mg"},
    {"id": "dutch", "name": "荷兰语", "serviceId": "nl"},
    {"id": "northern_sotho", "name": "北索托语", "serviceId": "ns"},
    {"id": "oriya", "name": "奥里亚语", "serviceId": "or"},
    {"id": "romanian", "name": "罗马尼亚语", "serviceId": "ro"},
    {"id": "swati", "name": "斯威士语", "serviceId": "ss"}
]
|
|
|
|
|
def translatejsToM2m(language):
    """Map a client language id to the model's language code.

    Returns the ``serviceId`` of the first entry in
    ``language_dict_translatejs`` whose ``id`` equals *language*, or an
    empty string when no entry matches.
    """
    matching_codes = (
        entry["serviceId"]
        for entry in language_dict_translatejs
        if entry["id"] == language
    )
    return next(matching_codes, "")
|
|
|
|
|
# Directory holding the model files: ``sys._MEIPASS`` when that attribute
# exists (it is set when running from a PyInstaller bundle), otherwise the
# current working directory.
local_model_path = getattr(sys, '_MEIPASS', os.getcwd())
|
|
|
|
|
print("Loading model and tokenizer ..")

# Half precision when running on the GPU, full precision on the CPU.
model_dtype = torch.float16 if useGPU else torch.float32
model = M2M100ForConditionalGeneration.from_pretrained(local_model_path, torch_dtype=model_dtype)
|
|
|
|
|
|
|
|
|
|
|
def is_cpu_support_avx2():
    """Report whether the CPU flags mention AVX2.

    Relies on the optional ``cpuinfo`` package. When that package is not
    installed, or the lookup fails for any other reason, a notice is printed
    and True is returned so that quantization stays enabled by default.
    """
    try:
        import cpuinfo

        cpu_flags = cpuinfo.get_cpu_info().get('flags', [])
        # A list of flags is joined into one string; anything else goes
        # through str() before the substring search.
        flag_text = ' '.join(cpu_flags) if isinstance(cpu_flags, list) else str(cpu_flags)
        return 'avx2' in flag_text.lower()
    except ImportError:
        print("警告: cpuinfo库未安装,默认启用量化加速")
        return True
    except Exception as e:
        print("CPU检测出错: %s, 默认启用量化加速" % str(e))
        return True
|
|
|
# Device placement and optional acceleration of the loaded model.
if useGPU:
    model = model.to('cuda')

    if quick:
        print("Using GPU computing with quantization acceleration")
        # Cast the weights to float16.
        model = model.half()
    else:
        print("Using GPU computing")

    try:
        # torch.compile is available from PyTorch 2.0 onwards, so the major
        # version alone decides (the minor version is irrelevant).
        major_version = int(torch.__version__.split('.')[0])
        if major_version >= 2:
            print(f"PyTorch version {torch.__version__} support torch.compile, Compiling model ...")
            model = torch.compile(model, mode='max-autotune')
            print("Model compilation completed, acceleration enabled")
        else:
            print(f"PyTorch version {torch.__version__} not support torch.compile,ignore ...")
    except Exception as e:
        # Best effort: a failed compile leaves the uncompiled model in use.
        print(f"Error checking PyTorch version or compiling model: {str(e)}")

elif quick and is_cpu_support_avx2():
    print("Using CPU computing to perform int8 quantization acceleration")
    # Dynamic int8 quantization of every Linear layer.
    model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )

    if torch.backends.mkldnn.enabled:
        try:
            # Probe channels_last on a throwaway tensor before converting
            # the model itself; a RuntimeError disables the optimization.
            test_tensor = torch.randn(1, 3, 224, 224)
            test_tensor = test_tensor.to(memory_format=torch.channels_last)
            model = model.to(memory_format=torch.channels_last)
            print("Channels_last memory format optimization enabled")
        except RuntimeError:
            print("CPU supports AVX2 but does not support channels_last, disabled")

elif quick:
    # Quick mode was requested but the AVX2 check failed.
    print("Using CPU computation (quantization not enabled, old architecture does not support AVX2 instruction set)")

else:
    # Quick mode is off: say so instead of blaming a missing AVX2 feature.
    print("Using CPU computation (quantization not enabled)")

# Inference only: switch the model to evaluation mode.
model.eval()
|
|
|
|
|
|
|
# The SentencePiece model file sits next to the other model files.
sp_model_file = os.path.join(local_model_path, "sentencepiece.bpe.model")
tokenizer = SMALL100Tokenizer.from_pretrained(local_model_path, sp_model_path=sp_model_file)
print("Model and tokenizer loading completed!")

# Language codes known to the tokenizer.
supported_langs = set(tokenizer.lang_code_to_id)

# Language ids listed in language_dict_translatejs.
translate_support_langs = {entry["id"] for entry in language_dict_translatejs}
|
|
|
|
|
|
|
def translate_single(text, target_lang):
    """Translate one text and count its input tokens.

    Args:
        text: Source text; the tokenizer truncates it to 512 tokens.
        target_lang: Target language code, assigned to ``tokenizer.tgt_lang``.

    Returns:
        A ``(translations, token_count)`` tuple. ``translations`` is always
        a list of strings: the decoded model output on success, or a single
        "翻译失败" message when anything goes wrong (``token_count`` is then
        0). ``token_count`` is the number of input tokens that are not
        special tokens.
    """
    try:
        tokenizer.tgt_lang = target_lang
        encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        if useGPU:
            encoded = {k: v.to('cuda') for k, v in encoded.items()}

        # Options common to every decoding mode.
        generate_options = {
            "max_length": 512,
            "repetition_penalty": 1.5,
            "use_cache": True,
        }
        if quick:
            # Quick mode: single-beam greedy decoding on either device.
            generate_options.update(num_beams=1, do_sample=False, early_stopping=False)
        elif useGPU:
            # NOTE(review): only this mode passes forced_bos_token_id and the
            # sampling parameters -- confirm the asymmetry is intentional.
            generate_options.update(
                num_beams=3,
                do_sample=True,
                early_stopping=True,
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                forced_bos_token_id=tokenizer.lang_code_to_id.get(target_lang, None),
            )
        else:
            generate_options.update(num_beams=2, do_sample=True, early_stopping=True)

        with torch.inference_mode():
            if useGPU:
                with torch.amp.autocast('cuda'):
                    generated_tokens = model.generate(**encoded, **generate_options)
            else:
                generated_tokens = model.generate(**encoded, **generate_options)

        translations = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        # Count input tokens, leaving out the tokenizer's special tokens.
        special_token_ids = set(tokenizer.all_special_ids)
        total_tokens = sum(
            1
            for row in encoded["input_ids"].tolist()
            for token_id in row
            if token_id not in special_token_ids
        )

        return translations, total_tokens

    except Exception as e:
        # Wrap the message in a list so the result has the same shape as on
        # success; callers that take element [0] then get the whole message
        # rather than its first character.
        return ["翻译失败:%s" % str(e)], 0
|
|
|
|
|
def translate_batch(text_list, target_lang):
    """Translate every text in *text_list*.

    Args:
        text_list: Iterable of texts. Items that are not strings are not
            translated; a "无效输入" notice takes their place in the results.
        target_lang: Target language code forwarded to ``translate_single``.

    Returns:
        ``(results, total_tokens)``: one result string per input item, in
        order, and the summed token count of the translated items.
    """
    results = []
    total_tokens = 0
    for text in text_list:
        if not isinstance(text, str):
            # The 1-tuple keeps %-formatting safe when the item is a tuple.
            results.append("无效输入:%s(必须是字符串)" % (text,))
            continue

        translation, tokens = translate_single(text, target_lang)

        if isinstance(translation, str):
            # A bare message string: keep it whole. Indexing it with [0]
            # would retain only its first character.
            results.append(translation)
        else:
            # A list of decoded strings: one input yields one output.
            results.append(translation[0] if translation else "")
        total_tokens += tokens

    return results, total_tokens
|
|
|
|
|
@app.route('/translate.json', methods=['POST'])
def translate():
    """Handle POST /translate.json.

    Parameters are read from a JSON body, else from form fields, else from a
    raw ``k=v&k=v`` body:
        text: a string, a JSON-encoded array of strings, or an array.
        to:   target language id (an ``id`` from language_dict_translatejs).
        from: optional; echoed back in the response when present.

    Returns a JSON object. On success ``result`` is 1 and ``text`` holds the
    list of translations together with ``to``, ``tokens`` and ``time``
    (milliseconds). On failure ``result`` is 0 and ``info`` explains why;
    the HTTP status is 400 for bad input and 500 for processing errors.
    """
    start_time = time.perf_counter()

    def elapsed_ms():
        return int((time.perf_counter() - start_time) * 1000)

    def fail(info, status):
        # Every error response carries the same three fields.
        return jsonify({"result": 0, "info": info, "time": elapsed_ms()}), status

    try:
        data = request.get_json(force=True)
    except Exception:
        # Body is not JSON: use the form fields, then a raw "k=v&k=v" body.
        data = request.form.to_dict()
        if not data:
            raw_data = request.data.decode('utf-8', errors='replace').strip()
            # NOTE(review): values in this raw fallback are not URL-decoded;
            # confirm whether clients rely on that.
            for part in raw_data.split('&'):
                if '=' in part:
                    k, v = part.split('=', 1)
                    data[k] = v

    # A JSON body may legally be a list, string or number; only an object
    # can carry the parameters.
    if not isinstance(data, dict) or "text" not in data or "to" not in data:
        return fail("缺少参数!请传入 'text'(待翻译内容,支持单文本或数组)和 'to'(目标语言代码)", 400)

    text_input = data["text"]
    to_value = data["to"]
    if not isinstance(to_value, str):
        # e.g. a number or null in the JSON body; it cannot name a language.
        return fail("语言 "+str(to_value)+" 不支持", 400)

    translatejs_to_lang = to_value.lower()
    target_lang = translatejsToM2m(translatejs_to_lang)
    if target_lang == "":
        return fail("语言 "+translatejs_to_lang+" 不支持", 400)

    original_from = data.get("from")

    if target_lang not in supported_langs:
        return fail("不支持的语言!支持的代码:%s" % sorted(supported_langs), 400)

    try:
        if isinstance(text_input, str):
            # A string may itself be a JSON-encoded array of texts; anything
            # else (including invalid JSON) is treated as one plain text.
            try:
                text_list = json.loads(text_input)
            except (ValueError, RecursionError):
                text_list = [text_input]
            else:
                if not isinstance(text_list, list):
                    text_list = [text_input]
        elif isinstance(text_input, list):
            text_list = text_input
        else:
            return fail("text参数必须是字符串或数组", 400)

        translated_results, total_tokens = translate_batch(text_list, target_lang)

        response_data = {
            "result": 1,
            "text": translated_results,
            "to": translatejs_to_lang,
            "tokens": total_tokens,
            "time": elapsed_ms()
        }
        if original_from:
            response_data["from"] = original_from
        return jsonify(response_data)

    except Exception as e:
        return fail("处理失败:%s" % str(e), 500)
|
|
|
|
|
|
|
@app.route('/language.json',methods=['POST','GET'])
def get_supported_languages():
    """Return the language table (language_dict_translatejs) as JSON."""
    payload = {
        "list": language_dict_translatejs,
        "result": 1,
        "info":"success"
    }
    return jsonify(payload)
|
|
|
|
|
@app.route('/')
def index():
    """Return the landing page: a short welcome text plus a script that loads
    /translate.js and points its API host at this server."""
    # Plain (non-f) string: the markup contains no placeholders.
    return """<span>Welcome use</span> <span class="ignore"> translate100 </span><span>, its original intention is to provide</span> <span><a href='https://github.com/xnx3/translate' class="ignore"> translate.js </a><span>with translation switching support between 100 languages.</span>
<br><span>my email</span>: <span class="ignore">[email protected]</span>
<br>
<script src='/translate.js'></script>
<script>
translate.request.api.host=window.location.origin+'/';
translate.request.api.ip = '';
translate.request.api.connectTest = '';
translate.request.api.init = '';
translate.whole.enableAll();
translate.setAutoDiscriminateLocalLanguage();
translate.progress.api.startUITip();
translate.nomenclature.append('english','chinese_simplified',`
with translation switching support between 100 languages.=支持100种语言之间的翻译切换。
its original intention is to provide=其初衷是提供
`);
translate.execute();
</script>"""
|
|
|
|
|
@app.route('/translate.js')
def serve_translate_js():
    """Serve the file ``translate.js`` from the ``resources`` directory."""
    script_dir, script_name = 'resources', 'translate.js'
    return send_from_directory(script_dir, script_name)
|
|
|
|
|
if __name__ == '__main__':
    print(f"The system is running and you can use it normally now\nAccess port number: {port}")

    # The server listens on every interface, so the interactive debugger
    # must not be switched on unconditionally: it allows code execution
    # from the browser. Debug mode is now opt-in via TRANSLATE100_DEBUG.
    debug_mode = os.environ.get('TRANSLATE100_DEBUG', 'False').lower() in ('true', '1', 'yes')
    app.run(host='0.0.0.0', port=port, debug=debug_mode, use_reloader=False)
|
|
|
|