import gradio as gr
import torch
import numpy as np
from transformers import (
    AutoModel, AutoProcessor, AutoFeatureExtractor,
    AutoTokenizer, pipeline
)
import warnings

warnings.filterwarnings("ignore")
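
# Assumed environment (not pinned in this file): the Space needs gradio, torch, numpy and
# transformers installed, e.g. via a requirements.txt next to this script. No GPU is
# required; the test inference below runs under torch.no_grad() on dummy audio.
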
def test_single_model(model_name):
    """Test compatibility of a single model"""
    if not model_name.strip():
        return "Please enter a model name"

    result_text = f"🔍 Testing Model: {model_name}\n"
    result_text += "=" * 60 + "\n\n"

    try:
        # 1. Load model
        result_text += "1️⃣ Loading Model...\n"
        model = AutoModel.from_pretrained(model_name)
        result_text += "   ✅ Model loaded successfully\n"
        result_text += f"   📋 Model type: {model.config.model_type}\n"
        result_text += f"   🏗️ Model class: {model.__class__.__name__}\n\n"

        # 2. Check model architecture
        result_text += "2️⃣ Checking Model Architecture...\n"
        if hasattr(model.config, 'hidden_size'):
            result_text += f"   🔢 Hidden size: {model.config.hidden_size}\n"
        if hasattr(model.config, 'num_hidden_layers'):
            result_text += f"   📚 Number of layers: {model.config.num_hidden_layers}\n"
        if hasattr(model.config, 'vocab_size'):
            result_text += f"   📖 Vocabulary size: {model.config.vocab_size}\n"
        result_text += "\n"

        # 3. Try to load processor
        result_text += "3️⃣ Loading Processor...\n"
        processor = None
        supports_audio = False
        try:
            processor = AutoProcessor.from_pretrained(model_name)
            result_text += f"   ✅ Processor loaded successfully: {processor.__class__.__name__}\n"
            supports_audio = True
        except Exception:
            try:
                processor = AutoFeatureExtractor.from_pretrained(model_name)
                result_text += f"   ✅ Feature extractor loaded successfully: {processor.__class__.__name__}\n"
                supports_audio = True
            except Exception:
                result_text += "   ❌ Cannot load audio processor\n"
                supports_audio = False
        result_text += "\n"

        # 4. Check input requirements
        result_text += "4️⃣ Checking Input Requirements...\n"
        sampling_rate = 16000  # Default value
        if processor and supports_audio:
            if hasattr(processor, 'sampling_rate'):
                sampling_rate = processor.sampling_rate
                result_text += f"   🎵 Sampling rate: {sampling_rate} Hz\n"
            if hasattr(processor, 'feature_size'):
                result_text += f"   📐 Feature dimension: {processor.feature_size}\n"
            if hasattr(processor, 'return_attention_mask'):
                result_text += f"   🎭 Supports attention mask: {processor.return_attention_mask}\n"
        result_text += "\n"

        # 5. Test inference
        result_text += "5️⃣ Testing Inference...\n"
        if supports_audio:
            try:
                # Create dummy audio data (2 seconds)
                dummy_audio = np.random.randn(sampling_rate * 2).astype(np.float32)
                # Process audio
                inputs = processor(dummy_audio, sampling_rate=sampling_rate, return_tensors="pt")
                # Model inference
                with torch.no_grad():
                    outputs = model(**inputs)
                # Check output
                if hasattr(outputs, 'last_hidden_state'):
                    shape = outputs.last_hidden_state.shape
                    result_text += f"   ✅ Inference successful! Hidden state shape: {shape}\n"
                elif hasattr(outputs, 'logits'):
                    shape = outputs.logits.shape
                    result_text += f"   ✅ Inference successful! Logits shape: {shape}\n"
                else:
                    result_text += f"   ✅ Inference successful! Output type: {type(outputs)}\n"
            except Exception as e:
                result_text += f"   ❌ Inference failed: {str(e)}\n"
        else:
            result_text += "   ⚠️ Audio input not supported, skipping inference test\n"
        result_text += "\n"

        # 6. Multilingual support check
        result_text += "6️⃣ Multilingual Support Check...\n"
        multilingual = False
        if hasattr(model.config, 'vocab_size') and model.config.vocab_size > 50000:
            result_text += f"   ✅ Likely supports multiple languages (large vocabulary: {model.config.vocab_size})\n"
            multilingual = True
        elif any(keyword in model_name.lower() for keyword in ['xlsr', 'multilingual', 'cross-lingual']):
            result_text += "   ✅ Supports multiple languages based on model name\n"
            multilingual = True
        else:
            result_text += "   ❓ Multilingual support unclear\n"
        result_text += "\n"

        # 7. Depression detection suitability scoring
        result_text += "7️⃣ Depression Detection Suitability Assessment...\n"
        score = 0
        max_score = 15

        # Most important: specifically for depression/mental health detection (6 points)
        depression_keywords = ['depression', 'mental-health', 'psychological', 'mood', 'phq']
        emotion_keywords = ['emotion', 'sentiment', 'affective', 'feeling']
        if any(keyword in model_name.lower() for keyword in depression_keywords):
            score += 6
            result_text += "   🎯 Specifically for depression/mental health detection (+6 points)\n"
        elif any(keyword in model_name.lower() for keyword in emotion_keywords):
            score += 3
            result_text += "   😊 For emotion recognition, potentially applicable (+3 points)\n"

        # Basic requirement: audio input support (2 points)
        if supports_audio:
            score += 2
            result_text += "   🎵 Supports audio input (+2 points)\n"
        else:
            result_text += "   ❌ Does not support audio input (0 points)\n"

        # Multilingual support (2 points)
        if multilingual:
            score += 2
            result_text += "   🌍 Supports multiple languages (+2 points)\n"

        # Architecture suitability (2 points)
        if model.config.model_type in ['wav2vec2', 'hubert', 'wavlm']:
            score += 2
            result_text += "   🏗️ Excellent speech representation learning architecture (+2 points)\n"
        elif model.config.model_type == 'whisper':
            score += 1
            result_text += "   ⚠️ Whisper architecture needs modification for classification (+1 point)\n"

        # Check if configured for classification
        # Note: transformers configs expose `num_labels` (default 2) even for models that were
        # never fine-tuned for classification, so treat this as a weak signal only.
        if hasattr(model.config, 'num_labels'):
            if model.config.num_labels == 2:
                score += 1
                result_text += "   ✅ Binary classification task configuration (likely depression detection) (+1 point)\n"
            else:
                score += 0.5
                result_text += f"   ⚠️ Multi-class task ({model.config.num_labels} classes) (+0.5 points)\n"

        # Check for training dataset clues
        daic_keywords = ['daic', 'wizard-of-oz', 'depression-detection', 'clinical']
        if any(keyword in model_name.lower() for keyword in daic_keywords):
            score += 2
            result_text += "   📊 Possibly trained on clinical depression datasets (+2 points)\n"

        result_text += f"\n🎯 Depression Detection Suitability Score: {score}/{max_score}\n"

        # 8. Recommendations
        result_text += "\n8️⃣ Usage Recommendations...\n"
        if score >= 12:
            result_text += "   🌟 Highly recommended! Specifically for depression detection, very suitable\n"
        elif score >= 8:
            result_text += "   👍 Recommended, may need some fine-tuning\n"
        elif score >= 5:
            result_text += "   ⚠️ Use with caution, may need significant modification\n"
        else:
            result_text += "   ❌ Not recommended, suggest finding a specialized depression detection model\n"

        # 9. Further inspection suggestions
        result_text += "\n9️⃣ Further Inspection Suggestions...\n"
        result_text += "   🔍 Check the model card for a training data description\n"
        result_text += "   📋 Check whether DAIC-WOZ or other depression datasets are mentioned\n"
        result_text += "   📄 Check papers or documentation for a task description\n"
        result_text += "   🧪 Test with small samples to see if the model output matches depression detection expectations\n"

        return result_text

    except Exception as e:
        error_msg = f"❌ Model test failed: {str(e)}\n"
        error_msg += "\nPossible causes:\n"
        error_msg += "• Incorrect model name\n"
        error_msg += "• Model requires special permissions\n"
        error_msg += "• Network connection issues\n"
        error_msg += "• Model architecture incompatibility\n"
        return error_msg
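
# A minimal sketch of calling the checker outside the Gradio UI (an assumption -- this file
# only wires it into the interface below), e.g. from a notebook or Python shell; the model
# name is one of the candidates listed further down:
#
#     report = test_single_model("facebook/wav2vec2-large-xlsr-53")
#     print(report)
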
def test_recommended_models():
    """Test the recommended model list"""
    recommended_models = [
        "facebook/wav2vec2-large-xlsr-53",
        "microsoft/wavlm-large",
        "harshit345/xlsr-wav2vec-speech-emotion-recognition",
        "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim",
        "speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
    ]

    result_text = "🚀 Batch Testing Recommended Models\n"
    result_text += "=" * 60 + "\n\n"

    results = []
    for i, model_name in enumerate(recommended_models, 1):
        result_text += f"🔍 Testing {i}/{len(recommended_models)}: {model_name}\n"
        result_text += "-" * 50 + "\n"
        try:
            # Simplified quick test
            model = AutoModel.from_pretrained(model_name)

            # Check audio support
            supports_audio = False
            try:
                processor = AutoProcessor.from_pretrained(model_name)
                supports_audio = True
            except Exception:
                try:
                    processor = AutoFeatureExtractor.from_pretrained(model_name)
                    supports_audio = True
                except Exception:
                    pass

            # Check multilingual support
            multilingual = False
            if hasattr(model.config, 'vocab_size') and model.config.vocab_size > 50000:
                multilingual = True
            elif any(keyword in model_name.lower() for keyword in ['xlsr', 'multilingual']):
                multilingual = True

            # Calculate simplified score
            score = 0
            if supports_audio:
                score += 3
            if multilingual:
                score += 2
            if model.config.model_type in ['wav2vec2', 'hubert', 'wavlm']:
                score += 3

            results.append({
                'name': model_name,
                'score': score,
                'audio': supports_audio,
                'multilingual': multilingual,
                'type': model.config.model_type
            })

            result_text += f"✅ Loaded successfully | Audio: {'✅' if supports_audio else '❌'} | Multilingual: {'✅' if multilingual else '❌'} | Score: {score}/8\n\n"

        except Exception as e:
            result_text += f"❌ Loading failed: {str(e)}\n\n"

    # Sort and recommend
    results.sort(key=lambda x: x['score'], reverse=True)
    result_text += "🏆 Recommendation Rankings:\n"
    result_text += "=" * 40 + "\n"
    for i, model in enumerate(results, 1):
        result_text += f"{i}. {model['name']}\n"
        result_text += f"   Score: {model['score']}/8 | Type: {model['type']}\n\n"

    return result_text
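
# Note: the batch test downloads every checkpoint via from_pretrained, so on a cold runtime
# it can take several minutes and a few GB of cache (an assumption based on typical large
# wav2vec2/WavLM checkpoints, not measured here).
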
# Create Gradio interface
with gr.Blocks(title="🎤 Depression Detection Model Compatibility Test") as app:
    gr.Markdown("""
    # 🎤 Depression Detection Model Compatibility Test Tool

    This tool helps you quickly test whether Hugging Face models are suitable for depression detection tasks.

    ## Features:
    - ✅ Check model loading compatibility
    - 🎵 Verify audio input support
    - 🌍 Assess multilingual capabilities
    - 📊 Suitability scoring (0-15 points)
    - 💡 Usage recommendations
    """)

    with gr.Tab("Single Model Test"):
        with gr.Row():
            model_input = gr.Textbox(
                placeholder="Enter a model name, e.g. facebook/wav2vec2-large-xlsr-53",
                label="📝 Model Name",
                value="ireneminhee/speech-to-depression"
            )
            test_btn = gr.Button("🚀 Start Test", variant="primary")

        result_output = gr.Textbox(
            label="📊 Test Results",
            lines=25,
            max_lines=50
        )

        test_btn.click(
            fn=test_single_model,
            inputs=[model_input],
            outputs=[result_output]
        )
with gr.Tab("Recommended Models Batch Test"): | |
gr.Markdown(""" | |
### π Recommended Depression Detection Candidate Models | |
These models perform well in speech emotion recognition and multilingual support: | |
- `facebook/wav2vec2-large-xlsr-53` - Multilingual speech representation learning | |
- `microsoft/wavlm-large` - Speech understanding specialized model | |
- `harshit345/xlsr-wav2vec-speech-emotion-recognition` - Emotion recognition | |
- `audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim` - Emotion dimension recognition | |
- `speechbrain/emotion-recognition-wav2vec2-IEMOCAP` - Emotion classification | |
""") | |
batch_test_btn = gr.Button("π Batch Test Recommended Models", variant="primary") | |
batch_result_output = gr.Textbox( | |
label="π Batch Test Results", | |
lines=20, | |
max_lines=50 | |
) | |
batch_test_btn.click( | |
fn=test_recommended_models, | |
inputs=[], | |
outputs=[batch_result_output] | |
) | |
with gr.Tab("Usage Instructions"): | |
gr.Markdown(""" | |
## π Usage Instructions | |
### Scoring Criteria (Redesigned): | |
- **Depression-specific model** (+6 points): Specifically for depression/mental health detection | |
- **Emotion recognition model** (+3 points): For emotion recognition, potentially applicable | |
- **Audio support** (+2 points): Whether the model can process audio input | |
- **Multilingual support** (+2 points): Support for Chinese, English, German, Russian | |
- **Architecture suitability** (+2 points): Whether model architecture is suitable for speech classification | |
- **Classification configuration** (+1 point): Whether configured for classification tasks | |
- **Clinical datasets** (+2 points): Whether trained on clinical depression datasets | |
### Score Interpretation: | |
- **12-15 points**: π Highly recommended, specialized depression detection model | |
- **8-11 points**: π Recommended, may need fine-tuning | |
- **5-7 points**: β οΈ Use with caution, needs modification | |
- **0-4 points**: β Not recommended | |
### Next Steps: | |
1. Select the top 2-3 models with highest scores | |
2. Conduct in-depth testing in Google Colab | |
3. Fine-tune using DAIC-WOZ dataset | |
4. Final evaluation with your multilingual data | |
""") | |

# Launch application
if __name__ == "__main__":
    app.launch()