import re
import gradio as gr
import spaces
import torch
from omegaconf import OmegaConf
from transformers import pipeline
# Run on GPU with half precision when available, otherwise CPU with full precision.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
def load_pipe(model_id: str):
    """Build a chunked ASR pipeline for the given Hugging Face model id."""
    return pipeline(
        "automatic-speech-recognition",
        model=model_id,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=8,
        torch_dtype=torch_dtype,
        device=device,
    )
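# A minimal usage sketch (the checkpoint and file names below are only
# illustrative placeholders, not models this Space actually ships with):
#
#     pipe = load_pipe("openai/whisper-large-v3")
#     print(pipe("sample.wav")["text"])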
# Expose load_pipe as an OmegaConf resolver so configs/models.yaml can
# instantiate pipelines inline via ${load_pipe:...}.
OmegaConf.register_new_resolver("load_pipe", load_pipe)
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
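# The code below assumes each entry in configs/models.yaml has roughly this
# shape (an illustrative sketch; the real file is not shown here):
#
#     some-model-id:
#       model: ${load_pipe:org/checkpoint-name}
#       dialect_mapping:      # display name -> prompt token, or null if unused
#         "Amis": "ami"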
@spaces.GPU
def automatic_speech_recognition(model_id: str, dialect_id: str, audio_file: str):
    model = models_config[model_id]["model"]
    generate_kwargs = {
        "task": "transcribe",
        "language": "id",
        "num_beams": 5,
    }
    # For multi-dialect models, prepend the dialect token as a decoder prompt.
    if models_config[model_id]["dialect_mapping"] is not None:
        generate_kwargs["prompt_ids"] = torch.from_numpy(
            model.tokenizer.get_prompt_ids(dialect_id)
        ).to(device)
    # The prompt token is echoed in the transcription, so strip it out.
    result = model(audio_file, generate_kwargs=generate_kwargs)["text"].replace(
        f" {dialect_id}", ""
    )
    # Light post-processing: ensure terminal punctuation, then capitalize the
    # first letter of each sentence. The lookbehind split keeps the sentence
    # punctuation that a plain split on "[.!?] " would consume.
    if result and result[-1] not in ".!?":
        result = result + "."
    sentences = re.split(r"(?<=[.!?]) ", result)
    for i in range(len(sentences)):
        if sentences[i]:
            sentences[i] = sentences[i][0].upper() + sentences[i][1:]
    return " ".join(sentences)
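# A hypothetical invocation, assuming a model id and dialect token taken from
# configs/models.yaml (all names here are placeholders):
#
#     text = automatic_speech_recognition("some-model-id", "ami", "recording.wav")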
def when_model_selected(model_id: str):
    """Refresh the dialect radio buttons when a different model is chosen."""
    model_config = models_config[model_id]
    if model_config["dialect_mapping"] is not None:
        dialect_drop_down_choices = [
            (k, v) for k, v in model_config["dialect_mapping"].items()
        ]
        return gr.update(
            choices=dialect_drop_down_choices,
            value=dialect_drop_down_choices[0][1],
        )
    else:
        return gr.update(visible=False)
def get_title():
    # The page title is the first heading of DEMO.md, minus the "# " prefix.
    with open("DEMO.md") as tong:
        return tong.readline().strip("# \n")
demo = gr.Blocks(
    title=get_title(),
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)
with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",  # "Model"
    )
    # Hidden placeholder; when_model_selected fills in the real choices once a
    # multi-dialect model is picked.
    dialect_drop_down = gr.Radio(
        choices=["test"],
        label="族別",  # "Indigenous group"
        visible=False,
    )
    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[dialect_drop_down],
    )
    with open("DEMO.md") as tong:
        gr.Markdown(tong.read())
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            dialect_drop_down,
            gr.Audio(
                label="上傳或錄音",  # "Upload or record"
                type="filepath",
                waveform_options=gr.WaveformOptions(
                    sample_rate=16000,
                ),
            ),
        ],
        outputs=[
            gr.Text(interactive=False, label="辨識結果"),  # "Transcription result"
        ],
        allow_flagging="auto",
    )
demo.launch()