Swahili-ASR

Running on Zero

App Files Files Community

Swahili-ASR / app.py

badrex

Update app.py

d982d70 verified 21 days ago

raw

history blame contribute delete

4.8 kB

	import os
	import torchaudio
	import gradio as gr
	import spaces
	import torch
	from transformers import AutoProcessor, AutoModelForCTC

	# Run on the GPU when torch can see one; otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"Using device: {device}")

	# Collect the bundled example clips, one single-element row per audio file.
	# The listing is sorted so the examples always appear in a stable order
	# (os.listdir returns entries in arbitrary order), the extension check is
	# case-insensitive so files such as "clip.WAV" are picked up, and isdir()
	# avoids calling listdir() on a plain file that happens to be named "examples".
	examples = []
	examples_dir = "examples"
	if os.path.isdir(examples_dir):
	    examples = [
	        [os.path.join(examples_dir, filename)]
	        for filename in sorted(os.listdir(examples_dir))
	        if filename.lower().endswith((".wav", ".mp3", ".ogg"))
	    ]

	# Load the processor and the CTC model identified by MODEL_PATH
	MODEL_PATH = "badrex/w2v-bert-2.0-swahili-asr"
	processor = AutoProcessor.from_pretrained(MODEL_PATH)

	# Only the model is placed on the selected device; the processor is used as loaded.
	model = AutoModelForCTC.from_pretrained(MODEL_PATH).to(device)

	@spaces.GPU()
	def process_audio(audio_path):
	    """Transcribe an audio file and return the recognized text.

	    Args:
	        audio_path: Path to the audio file to be transcribed.

	    Returns:
	        String containing the transcribed text from the audio file, or an
	        error message if the audio file is missing.
	    """
	    if not audio_path:
	        return "Please upload an audio file."

	    # Sampling rate handed to the processor; audio at any other rate is resampled.
	    target_sample_rate = 16000

	    # torchaudio.load yields a (channels, samples) tensor and the file's sample rate.
	    waveform, sample_rate = torchaudio.load(audio_path)

	    # Downmix multi-channel recordings to mono so every channel contributes
	    # to the transcription, instead of depending on how the processor
	    # treats a multi-channel tensor.
	    if waveform.dim() > 1:
	        waveform = waveform.mean(dim=0)

	    if sample_rate != target_sample_rate:
	        waveform = torchaudio.functional.resample(
	            waveform, orig_freq=sample_rate, new_freq=target_sample_rate
	        )

	    inputs = processor(
	        waveform.numpy(), sampling_rate=target_sample_rate, return_tensors="pt"
	    )
	    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

	    # Inference only: gradients are not needed.
	    with torch.no_grad():
	        logits = model(**inputs).logits

	    # Greedy decoding: pick the highest-scoring token id at every frame and
	    # let the processor turn the id sequence into text.
	    predicted_ids = torch.argmax(logits, dim=-1)
	    transcripts = processor.batch_decode(predicted_ids, skip_special_tokens=True)

	    return transcripts[0].strip()


	# Define Gradio interface
	# The page title names this demo (it previously carried the title of an
	# unrelated "Voxtral" demo).
	with gr.Blocks(title="Swahili-ASR Demo") as demo:
	    gr.Markdown("# Swahili-ASR 🎙️ Speech Recognition for Swahili Language 🥥")
	    gr.Markdown(
	        'Developed with <span style="color:red;">❤</span> by <a href="https://badrex.github.io/">Badr al-Absi</a>'
	    )
	    gr.Markdown(
	        """### Hi there 👋🏼

	This is a demo for [badrex/w2v-bert-2.0-swahili-asr](https://huggingface.co/badrex/w2v-bert-2.0-swahili-asr),
	a robust Transformer-based automatic speech recognition (ASR) system for Swahili language that was trained on 400+ hours of human-transcribed speech.
	"""
	    )

	    gr.Markdown("Simply upload an audio file 📤 or record yourself speaking 🎙️⏺️ to try out the model!")

	    # Left column: audio input and trigger button; right column: transcription.
	    with gr.Row():
	        with gr.Column():
	            audio_input = gr.Audio(type="filepath", label="Upload Audio")
	            submit_btn = gr.Button("Transcribe Audio", variant="primary")

	        with gr.Column():
	            output_text = gr.Textbox(label="Text Transcription", lines=10)

	    submit_btn.click(
	        fn=process_audio,
	        inputs=[audio_input],
	        outputs=output_text,
	    )

	    # gr.Examples needs an actual list of examples (None is not accepted),
	    # so the examples section is only rendered when example files were found.
	    if examples:
	        gr.Examples(
	            examples=examples,
	            inputs=[audio_input],
	        )

	# Start the app only when this file is executed as a script
	if __name__ == "__main__":
	    # Other launch() options noted for reference: share=False, ssr_mode=False, mcp_server=True
	    queued_demo = demo.queue()
	    queued_demo.launch()


	# NOTE: Alternative gr.Interface-based definition of the demo, left commented out (not executed).
	# demo = gr.Interface(
	# fn=transcribe,
	# inputs=gr.Audio(),
	# outputs="text",
	# title="<div></div>",
	# description="""
	# <div class="centered-content">
	# <div>
	# <p>
	# Developed with ❤ by <a href="https://badrex.github.io/" style="color: #2563eb;">Badr al-Absi</a> ☕
	# </p>
	# <br>
	# <p style="font-size: 15px; line-height: 1.8;">
	# Hi there 👋🏼
	# <br>
	# <br>
	# This is a demo for <a href="https://huggingface.co/badrex/w2v-bert-2.0-swahili-asr" style="color: #2563eb;"> badrex/w2v-bert-2.0-swahili-asr</a>, a robust Transformer-based automatic speech recognition (ASR) system for Swahili language.
	# The underlying ASR model was trained on more than 400 hours of transcribed speech.
	# <br>
	# <p style="font-size: 15px; line-height: 1.8;">
	# Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
	# </p>
	# </div>
	# </div>
	# """,
	# examples=examples if examples else None,
	# cache_examples=False,
	# flagging_mode=None,
	# )

	# if __name__ == "__main__":
	# demo.launch()