import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
from queue import Queue
import threading
import numpy as np

# Select GPU if available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model setup: a Wav2Vec2 classifier fine-tuned for emotion detection
model_name = "Hatman/audio-emotion-detection"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name).to(device)

# Preprocessing: resample a 1-D waveform tensor to the 16 kHz rate the model expects
def preprocess_audio_chunk(audio_chunk, sampling_rate):
    resampled_waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(audio_chunk)
    return {"speech": resampled_waveform.numpy().flatten(), "sampling_rate": 16000}

# Run a single chunk through the model and return the predicted emotion label
def inference_chunk(audio_chunk, sampling_rate):
    example = preprocess_audio_chunk(audio_chunk, sampling_rate)
    inputs = feature_extractor(example["speech"], sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    emotion = model.config.id2label[predicted_ids.item()]
    return emotion

# Queues connecting the Gradio callback (producer) to the inference worker (consumer)
audio_queue = Queue()
results_queue = Queue()

# Background worker that processes queued audio chunks; block on get() instead of busy-waiting
def audio_processing_thread():
    while True:
        audio_chunk, sampling_rate = audio_queue.get()
        emotion = inference_chunk(audio_chunk, sampling_rate)
        results_queue.put(emotion)
        audio_queue.task_done()

processing_thread = threading.Thread(target=audio_processing_thread, daemon=True)
processing_thread.start()

# Gradio callback for real-time streaming
def real_time_inference_live(microphone_audio):
    # gr.Audio with type="numpy" delivers a (sampling_rate, array) tuple, not a dict
    sampling_rate, audio_array = microphone_audio

    # Convert to float32; average stereo channels to mono and scale int16 PCM to [-1, 1]
    waveform = torch.tensor(audio_array, dtype=torch.float32)
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=1)
    if audio_array.dtype == np.int16:
        waveform = waveform / 32768.0

    # Chunk size in samples (5-second chunks)
    chunk_size = int(5 * sampling_rate)

    # Enqueue each chunk for the worker thread
    for start in range(0, len(waveform), chunk_size):
        end = min(start + chunk_size, len(waveform))
        audio_chunk = waveform[start:end]
        if audio_chunk.size(0) > 0:
            audio_queue.put((audio_chunk, sampling_rate))

    # Wait until the worker has finished every queued chunk, then collect the results
    audio_queue.join()
    emotions = []
    while not results_queue.empty():
        emotions.append(results_queue.get())

    return "\n".join(emotions)

with gr.Blocks() as demo:
    gr.Markdown("# Live Emotion Detection from Audio")
    audio_input = gr.Audio(streaming=True, label="Real-Time Audio Input", type="numpy")
    emotion_output = gr.Textbox(label="Detected Emotions", lines=10)

    def stream_audio_live(audio):
        return real_time_inference_live(audio)

    # The stream event needs the audio component passed explicitly as an input
    audio_input.stream(stream_audio_live, inputs=audio_input, outputs=emotion_output)

    gr.Markdown("This application processes audio in 5-second chunks and detects emotions in real time.")

demo.launch(share=True)
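
# --- Optional: quick offline sanity check of inference_chunk (a minimal sketch) ---
# Illustrative only: "sample.wav" is a hypothetical local file path, and the snippet is
# commented out because demo.launch() above blocks. Run it in place of the launch call
# (or in a separate session) to classify a file without going through the Gradio UI.
#
# test_waveform, test_sr = torchaudio.load("sample.wav")  # (channels, samples) tensor plus sample rate
# test_mono = test_waveform.mean(dim=0)                   # collapse channels to a 1-D mono waveform
# print(inference_chunk(test_mono, test_sr))              # prints the predicted emotion label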