import gradio as gr
import torch
import torchaudio
from fastapi import FastAPI
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Speech emotion recognition pipeline (wav2vec2 fine-tuned for emotion recognition via SUPERB)
speech_classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")

# Text emotion recognition model
text_tokenizer = AutoTokenizer.from_pretrained("tae898/emoberta-base")
text_model = AutoModelForSequenceClassification.from_pretrained("tae898/emoberta-base")
text_model.eval()  # inference only, so no gradient checkpointing or training-mode layers

def predict_emotion(audio, text):
    results = {}

    if audio is not None:
        # Load the uploaded/recorded file and pass the raw samples plus their
        # sampling rate; the pipeline resamples to the model's expected rate.
        waveform, sr = torchaudio.load(audio)
        preds = speech_classifier(
            {"raw": waveform.squeeze().numpy(), "sampling_rate": sr},
            top_k=3,
        )
        results["audio_emotion"] = preds[0]["label"]

    if text is not None and text.strip() != "":
        inputs = text_tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = text_model(**inputs)
        # argmax returns a tensor, so convert to a plain int before indexing id2label
        emotion = text_model.config.id2label[torch.argmax(outputs.logits).item()]
        results["text_emotion"] = emotion

    return results

# Building the UI
gradio_ui = gr.Interface(
    fn=predict_emotion,
    inputs=[
        gr.Audio(label="🎤 Upload or Record Speech", sources=["microphone", "upload"], type="filepath"),
        gr.Textbox(label="💬 Enter Text", placeholder="Type something..."),
    ],
    outputs="json",
    title="🎭 Multimodal Emotion Recognizer",
    description="Use either speech or text; the model detects the emotion automatically!",
)

# Mount Gradio at /gradio on a FastAPI app
app = FastAPI()
app = gr.mount_gradio_app(app, gradio_ui, path="/gradio")

# Or start the standalone Gradio server with a public share link
gradio_ui.launch(share=True)
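If you want to serve only the FastAPI mount rather than the standalone `launch()` server, the mounted app can be run with uvicorn. A minimal sketch, assuming the code above is saved as `app.py` (the filename and port are assumptions, not part of the original):

# run_server.py -- hypothetical helper that serves the mounted app with uvicorn
import uvicorn

if __name__ == "__main__":
    # "app:app" points at the `app` object in app.py (assumed filename);
    # the Gradio UI is then reachable at http://localhost:8000/gradio
    uvicorn.run("app:app", host="0.0.0.0", port=8000)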