Spaces:

RSHVR
/

Command_RTC

Sleeping

App Files Files Community

RSHVR committed on Mar 30

Commit

8d98b9d

verified ·

1 Parent(s): eb62218

Update stt.py

Browse files

Files changed (1) hide show

stt.py +71 -50

stt.py CHANGED Viewed

@@ -1,8 +1,13 @@
 import os
 import torch
 import torchaudio
 import spaces
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 # Create directories
 os.makedirs("transcriptions", exist_ok=True)
@@ -20,63 +25,79 @@ WHISPER_MODEL_SIZES = {
     'large': 'openai/whisper-large-v3',
 }
-# Synchronous function with GPU decorator
-@spaces.GPU
-def _transcribe_audio_gpu(audio_file_path, model_size="base", language="en"):
-    global whisper_model, whisper_processor
-    try:
         # Get model identifier
-        model_id = WHISPER_MODEL_SIZES.get(model_size.lower(), WHISPER_MODEL_SIZES['base'])
-        # Load model and processor on first use or if model size changes
         if whisper_model is None or whisper_processor is None or (whisper_model and whisper_model.config._name_or_path != model_id):
-            print(f"Loading Whisper {model_size} model...")
             whisper_processor = WhisperProcessor.from_pretrained(model_id)
             whisper_model = WhisperForConditionalGeneration.from_pretrained(model_id)
             print(f"Model loaded on device: {whisper_model.device}")
-        # Process audio
-        speech_array, sample_rate = torchaudio.load(audio_file_path)
-        # Convert to mono if needed
-        if speech_array.shape[0] > 1:
-            speech_array = torch.mean(speech_array, dim=0, keepdim=True)
-        # Resample to 16kHz if needed
-        if sample_rate != 16000:
-            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-            speech_array = resampler(speech_array)
-        # Prepare inputs for the model
-        input_features = whisper_processor(
-            speech_array.squeeze().numpy(),
-            sampling_rate=16000,
-            return_tensors="pt"
-        ).input_features
-        # Generate transcription
-        generation_kwargs = {}
-        if language:
-            forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=language, task="transcribe")
-            generation_kwargs["forced_decoder_ids"] = forced_decoder_ids
-        # Run the model
-        with torch.no_grad():
-            predicted_ids = whisper_model.generate(input_features, **generation_kwargs)
-        # Decode the output
-        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
-        # Return the transcribed text
-        return transcription[0]
-    except Exception as e:
-        print(f"Error during transcription: {str(e)}")
-        return ""
-# Async wrapper that calls the GPU function
 async def transcribe_audio(audio_file_path, model_size="base", language="en"):
-    # Call the GPU-decorated function
-    return _transcribe_audio_gpu(audio_file_path, model_size, language)

+# stt.py
 import os
 import torch
 import torchaudio
 import spaces
+import numpy as np
+from typing import Tuple
+from numpy.typing import NDArray
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import tempfile
 # Create directories
 os.makedirs("transcriptions", exist_ok=True)
     'large': 'openai/whisper-large-v3',
 }
+class WhisperSTTModel:
+    def __init__(self, model_size="base", language="en"):
+        self.model_size = model_size
+        self.language = language
+        self._initialize_model()
+    @spaces.GPU
+    def _initialize_model(self):
+        global whisper_model, whisper_processor
         # Get model identifier
+        model_id = WHISPER_MODEL_SIZES.get(self.model_size.lower(), WHISPER_MODEL_SIZES['base'])
+        # Load model and processor if not already loaded
         if whisper_model is None or whisper_processor is None or (whisper_model and whisper_model.config._name_or_path != model_id):
+            print(f"Loading Whisper {self.model_size} model...")
             whisper_processor = WhisperProcessor.from_pretrained(model_id)
             whisper_model = WhisperForConditionalGeneration.from_pretrained(model_id)
             print(f"Model loaded on device: {whisper_model.device}")
+    @spaces.GPU
+    def stt(self, audio: Tuple[int, NDArray[np.float32]]) -> str:
+        """Transcribe audio to text following the STTModel protocol"""
+        sample_rate, audio_array = audio
+        try:
+            # Convert to mono if needed
+            if len(audio_array.shape) > 1 and audio_array.shape[0] > 1:
+                audio_array = np.mean(audio_array, axis=0)
+            # Convert numpy array to torch tensor
+            speech_array = torch.tensor(audio_array).unsqueeze(0)
+            # Resample to 16kHz if needed
+            if sample_rate != 16000:
+                resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+                speech_array = resampler(speech_array)
+            # Prepare inputs for the model
+            input_features = whisper_processor(
+                speech_array.squeeze().numpy(),
+                sampling_rate=16000,
+                return_tensors="pt"
+            ).input_features
+            # Generate transcription
+            generation_kwargs = {}
+            if self.language:
+                forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=self.language, task="transcribe")
+                generation_kwargs["forced_decoder_ids"] = forced_decoder_ids
+            # Run the model
+            with torch.no_grad():
+                predicted_ids = whisper_model.generate(input_features, **generation_kwargs)
+            # Decode the output
+            transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
+            # Return the transcribed text
+            return transcription[0]
+        except Exception as e:
+            print(f"Error during transcription: {str(e)}")
+            return ""
+# Create a singleton instance for easy import
+whisper_stt = WhisperSTTModel(model_size="base", language="en")
+# Legacy function for backward compatibility
 async def transcribe_audio(audio_file_path, model_size="base", language="en"):
+    """For compatibility with older code"""
+    # Load audio from file
+    speech_array, sample_rate = torchaudio.load(audio_file_path)
+    # Use the new model to transcribe
+    return whisper_stt.stt((sample_rate, speech_array.squeeze().numpy()))