import gradio as gr
import torch
import numpy as np
from transformers import (
    AutoModel, AutoProcessor, AutoFeatureExtractor,
    AutoTokenizer, pipeline
)
import warnings

warnings.filterwarnings("ignore")
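
# Assumed environment (not pinned in this file): the Space needs gradio, torch, numpy and
# transformers installed, e.g. via a requirements.txt next to this script. No GPU is
# required; the test inference below runs under torch.no_grad() on dummy audio.
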
def test_single_model(model_name):
    """Test compatibility of a single model"""
    if not model_name.strip():
        return "Please enter a model name"

    result_text = f"🔍 Testing Model: {model_name}\n"
    result_text += "=" * 60 + "\n\n"

    try:
        # 1. Load model
        result_text += "1️⃣ Loading Model...\n"
        model = AutoModel.from_pretrained(model_name)
        result_text += "   ✅ Model loaded successfully\n"
        result_text += f"   📋 Model type: {model.config.model_type}\n"
        result_text += f"   🏗️ Model class: {model.__class__.__name__}\n\n"

        # 2. Check model architecture
        result_text += "2️⃣ Checking Model Architecture...\n"
        if hasattr(model.config, 'hidden_size'):
            result_text += f"   🔢 Hidden size: {model.config.hidden_size}\n"
        if hasattr(model.config, 'num_hidden_layers'):
            result_text += f"   📚 Number of layers: {model.config.num_hidden_layers}\n"
        if hasattr(model.config, 'vocab_size'):
            result_text += f"   📖 Vocabulary size: {model.config.vocab_size}\n"
        result_text += "\n"

        # 3. Try to load processor
        result_text += "3️⃣ Loading Processor...\n"
        processor = None
        supports_audio = False
        try:
            processor = AutoProcessor.from_pretrained(model_name)
            result_text += f"   ✅ Processor loaded successfully: {processor.__class__.__name__}\n"
            supports_audio = True
        except Exception:
            try:
                processor = AutoFeatureExtractor.from_pretrained(model_name)
                result_text += f"   ✅ Feature extractor loaded successfully: {processor.__class__.__name__}\n"
                supports_audio = True
            except Exception:
                result_text += "   ❌ Cannot load audio processor\n"
                supports_audio = False
        result_text += "\n"

        # 4. Check input requirements
        result_text += "4️⃣ Checking Input Requirements...\n"
        sampling_rate = 16000  # Default value
        if processor and supports_audio:
            if hasattr(processor, 'sampling_rate'):
                sampling_rate = processor.sampling_rate
                result_text += f"   🎵 Sampling rate: {sampling_rate} Hz\n"
            if hasattr(processor, 'feature_size'):
                result_text += f"   📐 Feature dimension: {processor.feature_size}\n"
            if hasattr(processor, 'return_attention_mask'):
                result_text += f"   🎭 Supports attention mask: {processor.return_attention_mask}\n"
        result_text += "\n"

        # 5. Test inference
        result_text += "5️⃣ Testing Inference...\n"
        if supports_audio:
            try:
                # Create dummy audio data (2 seconds)
                dummy_audio = np.random.randn(sampling_rate * 2).astype(np.float32)
                # Process audio
                inputs = processor(dummy_audio, sampling_rate=sampling_rate, return_tensors="pt")
                # Model inference
                with torch.no_grad():
                    outputs = model(**inputs)
                # Check output
                if hasattr(outputs, 'last_hidden_state'):
                    shape = outputs.last_hidden_state.shape
                    result_text += f"   ✅ Inference successful! Hidden state shape: {shape}\n"
                elif hasattr(outputs, 'logits'):
                    shape = outputs.logits.shape
                    result_text += f"   ✅ Inference successful! Logits shape: {shape}\n"
                else:
                    result_text += f"   ✅ Inference successful! Output type: {type(outputs)}\n"
            except Exception as e:
                result_text += f"   ❌ Inference failed: {str(e)}\n"
        else:
            result_text += "   ⚠️ Audio input not supported, skipping inference test\n"
        result_text += "\n"

        # 6. Multilingual support check
        result_text += "6️⃣ Multilingual Support Check...\n"
        multilingual = False
        if hasattr(model.config, 'vocab_size') and model.config.vocab_size > 50000:
            result_text += f"   ✅ Likely supports multiple languages (large vocabulary: {model.config.vocab_size})\n"
            multilingual = True
        elif any(keyword in model_name.lower() for keyword in ['xlsr', 'multilingual', 'cross-lingual']):
            result_text += "   ✅ Supports multiple languages based on model name\n"
            multilingual = True
        else:
            result_text += "   ❓ Multilingual support unclear\n"
        result_text += "\n"

        # 7. Depression detection suitability scoring
        result_text += "7️⃣ Depression Detection Suitability Assessment...\n"
        score = 0
        max_score = 15

        # Most important: specifically for depression/mental health detection (6 points)
        depression_keywords = ['depression', 'mental-health', 'psychological', 'mood', 'phq']
        emotion_keywords = ['emotion', 'sentiment', 'affective', 'feeling']
        if any(keyword in model_name.lower() for keyword in depression_keywords):
            score += 6
            result_text += "   🎯 Specifically for depression/mental health detection (+6 points)\n"
        elif any(keyword in model_name.lower() for keyword in emotion_keywords):
            score += 3
            result_text += "   😊 For emotion recognition, potentially applicable (+3 points)\n"

        # Basic requirement: audio input support (2 points)
        if supports_audio:
            score += 2
            result_text += "   🎵 Supports audio input (+2 points)\n"
        else:
            result_text += "   ❌ Does not support audio input (0 points)\n"

        # Multilingual support (2 points)
        if multilingual:
            score += 2
            result_text += "   🌍 Supports multiple languages (+2 points)\n"

        # Architecture suitability (2 points)
        if model.config.model_type in ['wav2vec2', 'hubert', 'wavlm']:
            score += 2
            result_text += "   🏗️ Excellent speech representation learning architecture (+2 points)\n"
        elif model.config.model_type == 'whisper':
            score += 1
            result_text += "   ⚠️ Whisper architecture needs modification for classification (+1 point)\n"

        # Check if configured for classification
        # Note: transformers configs expose `num_labels` (default 2) even for models that were
        # never fine-tuned for classification, so treat this as a weak signal only.
        if hasattr(model.config, 'num_labels'):
            if model.config.num_labels == 2:
                score += 1
                result_text += "   ✅ Binary classification task configuration (likely depression detection) (+1 point)\n"
            else:
                score += 0.5
                result_text += f"   ⚠️ Multi-class task ({model.config.num_labels} classes) (+0.5 points)\n"

        # Check for training dataset clues
        daic_keywords = ['daic', 'wizard-of-oz', 'depression-detection', 'clinical']
        if any(keyword in model_name.lower() for keyword in daic_keywords):
            score += 2
            result_text += "   📊 Possibly trained on clinical depression datasets (+2 points)\n"

        result_text += f"\n🎯 Depression Detection Suitability Score: {score}/{max_score}\n"

        # 8. Recommendations
        result_text += "\n8️⃣ Usage Recommendations...\n"
        if score >= 12:
            result_text += "   🌟 Highly recommended! Specifically for depression detection, very suitable\n"
        elif score >= 8:
            result_text += "   👍 Recommended, may need some fine-tuning\n"
        elif score >= 5:
            result_text += "   ⚠️ Use with caution, may need significant modification\n"
        else:
            result_text += "   ❌ Not recommended, suggest finding a specialized depression detection model\n"

        # 9. Further inspection suggestions
        result_text += "\n9️⃣ Further Inspection Suggestions...\n"
        result_text += "   🔍 Check the model card for a training data description\n"
        result_text += "   📋 Check whether DAIC-WOZ or other depression datasets are mentioned\n"
        result_text += "   📄 Check papers or documentation for a task description\n"
        result_text += "   🧪 Test with small samples to see if the model output matches depression detection expectations\n"

        return result_text

    except Exception as e:
        error_msg = f"❌ Model test failed: {str(e)}\n"
        error_msg += "\nPossible causes:\n"
        error_msg += "• Incorrect model name\n"
        error_msg += "• Model requires special permissions\n"
        error_msg += "• Network connection issues\n"
        error_msg += "• Model architecture incompatibility\n"
        return error_msg
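
# A minimal sketch of calling the checker outside the Gradio UI (an assumption -- this file
# only wires it into the interface below), e.g. from a notebook or Python shell; the model
# name is one of the candidates listed further down:
#
#     report = test_single_model("facebook/wav2vec2-large-xlsr-53")
#     print(report)
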
def test_recommended_models():
    """Test the recommended model list"""
    recommended_models = [
        "facebook/wav2vec2-large-xlsr-53",
        "microsoft/wavlm-large",
        "harshit345/xlsr-wav2vec-speech-emotion-recognition",
        "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim",
        "speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
    ]

    result_text = "🚀 Batch Testing Recommended Models\n"
    result_text += "=" * 60 + "\n\n"

    results = []
    for i, model_name in enumerate(recommended_models, 1):
        result_text += f"🔍 Testing {i}/{len(recommended_models)}: {model_name}\n"
        result_text += "-" * 50 + "\n"
        try:
            # Simplified quick test
            model = AutoModel.from_pretrained(model_name)

            # Check audio support
            supports_audio = False
            try:
                processor = AutoProcessor.from_pretrained(model_name)
                supports_audio = True
            except Exception:
                try:
                    processor = AutoFeatureExtractor.from_pretrained(model_name)
                    supports_audio = True
                except Exception:
                    pass

            # Check multilingual support
            multilingual = False
            if hasattr(model.config, 'vocab_size') and model.config.vocab_size > 50000:
                multilingual = True
            elif any(keyword in model_name.lower() for keyword in ['xlsr', 'multilingual']):
                multilingual = True

            # Calculate simplified score
            score = 0
            if supports_audio:
                score += 3
            if multilingual:
                score += 2
            if model.config.model_type in ['wav2vec2', 'hubert', 'wavlm']:
                score += 3

            results.append({
                'name': model_name,
                'score': score,
                'audio': supports_audio,
                'multilingual': multilingual,
                'type': model.config.model_type
            })

            result_text += f"✅ Loaded successfully | Audio: {'✅' if supports_audio else '❌'} | Multilingual: {'✅' if multilingual else '❌'} | Score: {score}/8\n\n"

        except Exception as e:
            result_text += f"❌ Loading failed: {str(e)}\n\n"

    # Sort and recommend
    results.sort(key=lambda x: x['score'], reverse=True)
    result_text += "🏆 Recommendation Rankings:\n"
    result_text += "=" * 40 + "\n"
    for i, model in enumerate(results, 1):
        result_text += f"{i}. {model['name']}\n"
        result_text += f"   Score: {model['score']}/8 | Type: {model['type']}\n\n"

    return result_text
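
# Note: the batch test downloads every checkpoint via from_pretrained, so on a cold runtime
# it can take several minutes and a few GB of cache (an assumption based on typical large
# wav2vec2/WavLM checkpoints, not measured here).
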
# Create Gradio interface
with gr.Blocks(title="🎤 Depression Detection Model Compatibility Test") as app:
    gr.Markdown("""
    # 🎤 Depression Detection Model Compatibility Test Tool

    This tool helps you quickly test whether Hugging Face models are suitable for depression detection tasks.

    ## Features:
    - ✅ Check model loading compatibility
    - 🎵 Verify audio input support
    - 🌍 Assess multilingual capabilities
    - 📊 Suitability scoring (0-15 points)
    - 💡 Usage recommendations
    """)

    with gr.Tab("Single Model Test"):
        with gr.Row():
            model_input = gr.Textbox(
                placeholder="Enter a model name, e.g. facebook/wav2vec2-large-xlsr-53",
                label="📝 Model Name",
                value="ireneminhee/speech-to-depression"
            )
            test_btn = gr.Button("🚀 Start Test", variant="primary")

        result_output = gr.Textbox(
            label="📊 Test Results",
            lines=25,
            max_lines=50
        )

        test_btn.click(
            fn=test_single_model,
            inputs=[model_input],
            outputs=[result_output]
        )
with gr.Tab("Recommended Models Batch Test"): | |
gr.Markdown(""" | |
### π Recommended Depression Detection Candidate Models | |
These models perform well in speech emotion recognition and multilingual support: | |
- `facebook/wav2vec2-large-xlsr-53` - Multilingual speech representation learning | |
- `microsoft/wavlm-large` - Speech understanding specialized model | |
- `harshit345/xlsr-wav2vec-speech-emotion-recognition` - Emotion recognition | |
- `audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim` - Emotion dimension recognition | |
- `speechbrain/emotion-recognition-wav2vec2-IEMOCAP` - Emotion classification | |
""") | |
batch_test_btn = gr.Button("π Batch Test Recommended Models", variant="primary") | |
batch_result_output = gr.Textbox( | |
label="π Batch Test Results", | |
lines=20, | |
max_lines=50 | |
) | |
batch_test_btn.click( | |
fn=test_recommended_models, | |
inputs=[], | |
outputs=[batch_result_output] | |
) | |
with gr.Tab("Usage Instructions"): | |
gr.Markdown(""" | |
## π Usage Instructions | |
### Scoring Criteria (Redesigned): | |
- **Depression-specific model** (+6 points): Specifically for depression/mental health detection | |
- **Emotion recognition model** (+3 points): For emotion recognition, potentially applicable | |
- **Audio support** (+2 points): Whether the model can process audio input | |
- **Multilingual support** (+2 points): Support for Chinese, English, German, Russian | |
- **Architecture suitability** (+2 points): Whether model architecture is suitable for speech classification | |
- **Classification configuration** (+1 point): Whether configured for classification tasks | |
- **Clinical datasets** (+2 points): Whether trained on clinical depression datasets | |
### Score Interpretation: | |
- **12-15 points**: π Highly recommended, specialized depression detection model | |
- **8-11 points**: π Recommended, may need fine-tuning | |
- **5-7 points**: β οΈ Use with caution, needs modification | |
- **0-4 points**: β Not recommended | |
### Next Steps: | |
1. Select the top 2-3 models with highest scores | |
2. Conduct in-depth testing in Google Colab | |
3. Fine-tune using DAIC-WOZ dataset | |
4. Final evaluation with your multilingual data | |
""") | |

# Launch application
if __name__ == "__main__":
    app.launch()