badrex committed
Commit df23ecf · verified · 1 Parent(s): 2a3e1bb

Update app.py

Files changed (1): app.py (+41 -26)
app.py CHANGED
@@ -12,42 +12,57 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 if HF_TOKEN:
     login(token=HF_TOKEN)
 
-MODEL_ID = "badrex/w2v-bert-2.0-swahili-asr"
-transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)
+#MODEL_ID = "badrex/w2v-bert-2.0-swahili-asr"
+#transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)
 
 
-@spaces.GPU
-def transcribe(audio):
-    sr, y = audio
-
-    # convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-
-    # ensure it's float32
-    y = y.astype(np.float32)
-
-    # normalize audio
-    if np.max(np.abs(y)) > 0:
-        y /= np.max(np.abs(y))
-
-    # convert to tensor for torchaudio
-    y_tensor = torch.from_numpy(y)
-
-    # add batch dimension if missing
-    if y_tensor.ndim == 1:
-        y_tensor = y_tensor.unsqueeze(0)
-
-    # resample to 16kHz if needed
-    if sr != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-        y_tensor = resampler(y_tensor)
-        sr = 16000
-
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-
-    return transcriber({"sampling_rate": sr, "raw": y})["text"]
+# Load model and processor
+MODEL_PATH = "badrex/w2v-bert-2.0-swahili-asr"
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+model = AutoModelForCTC.from_pretrained(MODEL_PATH)
+
+# move model to device
+model = model.to(device)
+#processor = processor.to(device)
+
+@spaces.GPU()
+def transcribe(audio_path):
+    """Process audio and return the generated transcription.
+
+    Args:
+        audio_path: Path to the audio file to be transcribed.
+    Returns:
+        String containing the transcribed text from the audio file,
+        or an error message if the audio file is missing.
+    """
+    if not audio_path:
+        return "Please upload an audio file."
+
+    # load audio as a (channels, samples) tensor
+    audio_array, sample_rate = torchaudio.load(audio_path)
+
+    # resample to 16 kHz if needed
+    if sample_rate != 16000:
+        audio_array = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio_array)
+
+    #audio_array = audio_array.to(device)
+
+    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    #inputs = inputs.to(device, dtype=torch.bfloat16)
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    # greedy CTC decoding: most likely token per frame
+    outputs = torch.argmax(logits, dim=-1)
+
+    decoded_outputs = processor.batch_decode(
+        outputs,
+        skip_special_tokens=True
+    )
+
+    return decoded_outputs[0].strip()
 
 examples = []
 examples_dir = "examples"
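
For context, a minimal standalone sketch of the inference path this commit switches to: explicit AutoProcessor/AutoModelForCTC loading with greedy CTC decoding, replacing the transformers pipeline wrapper. This is an illustrative sketch, not the Space's code: the device fallback, the mono downmix (which the old pipeline-based version handled with y.mean(axis=1) and the new code does not), and the transcribe_file name are assumptions.

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC

MODEL_PATH = "badrex/w2v-bert-2.0-swahili-asr"
device = "cuda" if torch.cuda.is_available() else "cpu"  # assumed; the Space defines device elsewhere

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForCTC.from_pretrained(MODEL_PATH).to(device)
model.eval()

def transcribe_file(audio_path: str) -> str:  # hypothetical helper name
    # torchaudio.load returns a (channels, samples) float tensor
    waveform, sample_rate = torchaudio.load(audio_path)

    # downmix to mono; the commit's new code skips this step
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # the model expects 16 kHz input
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    # feature extraction; the processor expects one 1-D audio array per example
    inputs = processor(waveform.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # greedy CTC decoding: pick the most likely token per frame,
    # then let the tokenizer collapse repeats and blanks
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()

Compared with the one-line pipeline call, this path exposes the logits and the decoding step directly, at the cost of handling resampling and channel layout manually.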