Spaces:

tugrulkaya
/

audio-reasoning-explorer

Running

App Files Files Community

audio-reasoning-explorer / app.py

tugrulkaya

Update app.py

d700258 verified 9 days ago

raw

history blame

30.3 kB

	"""
	🎧 Audio Reasoning & Step-Audio-R1 Explorer
	Interactive Hugging Face Space for exploring audio reasoning concepts

	Author: Mehmet Tuğrul Kaya
	"""

	import gradio as gr

	# ============================================
	# CONTENT DATA (İçerik Verileri)
	# ============================================

	# Markdown shown on the "Introduction" tab.
	# Table rows use plain `|` delimiters: a backslash-escaped pipe is rendered as
	# a literal character (so no table is produced) and `\|` is also an invalid
	# escape sequence in a regular Python string literal.
	INTRO_CONTENT = """
	# 🎧 Audio Reasoning & Step-Audio-R1

	## Teaching AI to Think About Sound

	Step-Audio-R1 is the first audio language model to successfully unlock reasoning capabilities in the audio domain.
	This space explores the groundbreaking concepts behind audio reasoning and the innovative MGRD framework.

	### 🎯 Key Achievement
	> "Can audio intelligence truly benefit from deliberate thinking?" — YES!

	Step-Audio-R1 proves that reasoning is a transferable capability across modalities when properly grounded in acoustic features.

	---

	### 📊 Quick Stats

	| Metric | Value |
	|--------|-------|
	| Model Size | 32B parameters (Qwen2.5 LLM) |
	| Audio Encoder | Qwen2 (25 Hz, frozen) |
	| Performance | Surpasses Gemini 2.5 Pro |
	| Innovation | First successful audio reasoning model |

	---

	Navigate through the tabs to explore different aspects of audio reasoning!
	"""

	# Catalogue of audio reasoning categories, keyed by display name.
	# Every entry carries the same six string fields: emoji, description,
	# example_question, example_audio, what_model_does and challenge.
	REASONING_TYPES = {
	    "Factual Reasoning": {
	        "emoji": "📋",
	        "description": "Extracting concrete information from audio",
	        "example_question": "What date is mentioned in this conversation?",
	        "example_audio": "A business call discussing a meeting scheduled for March 15th",
	        "what_model_does": "Identifies specific facts, numbers, names, dates from speech content",
	        "challenge": "Requires accurate speech recognition + information extraction",
	    },
	    "Procedural Reasoning": {
	        "emoji": "📝",
	        "description": "Understanding step-by-step processes and sequences",
	        "example_question": "What is the third step in this instruction set?",
	        "example_audio": "A cooking tutorial explaining how to make pasta",
	        "what_model_does": "Tracks sequential information, understands ordering and dependencies",
	        "challenge": "Must maintain context across long audio segments",
	    },
	    "Normative Reasoning": {
	        "emoji": "⚖️",
	        "description": "Evaluating social, ethical, or behavioral norms",
	        "example_question": "Is the speaker behaving appropriately in this dialogue?",
	        "example_audio": "A customer service call with an upset customer",
	        "what_model_does": "Assesses tone, politeness, social appropriateness based on context",
	        "challenge": "Requires understanding of social norms + prosodic analysis",
	    },
	    "Contextual Reasoning": {
	        "emoji": "🌍",
	        "description": "Inferring environmental and situational context",
	        "example_question": "Where might this sound have been recorded?",
	        "example_audio": "Background noise with birds, wind, and distant traffic",
	        "what_model_does": "Analyzes ambient sounds to determine location/situation",
	        "challenge": "Must process non-speech audio elements",
	    },
	    "Causal Reasoning": {
	        "emoji": "🔗",
	        "description": "Establishing cause-effect relationships",
	        "example_question": "Why might this sound event have occurred?",
	        "example_audio": "A loud crash followed by glass breaking",
	        "what_model_does": "Infers causality from sound sequences and patterns",
	        "challenge": "Requires world knowledge + temporal understanding",
	    },
	}

	# Markdown shown on the "The Problem" tab (inverted scaling / textual
	# surrogate reasoning). The comparison table uses plain `|` delimiters; the
	# backslash-escaped form is not a table delimiter and is an invalid Python
	# string escape.
	PROBLEM_CONTENT = """
	## 🚫 The Inverted Scaling Anomaly

	### The Paradox
	Traditional audio language models showed a strange behavior: they performed WORSE when reasoning longer!

	This is the opposite of what happens in text models (like GPT-4, Claude) where more thinking = better answers.

	### Root Cause: Textual Surrogate Reasoning

	```
	🔊 Audio Input
	↓
	📝 Model converts to text (transcript)
	↓
	🧠 Reasons over TEXT, not SOUND
	↓
	❌ Acoustic features IGNORED
	↓
	💀 Performance degrades with longer reasoning
	```

	### Why Does This Happen?

	1. Text-based initialization: Models are fine-tuned from text LLMs
	2. Inherited patterns: They learn to reason like text models
	3. Modality mismatch: Audio is treated as "text with extra steps"
	4. Lost information: Tone, emotion, prosody, ambient sounds are ignored

	### Real Example

	Audio: Person says "Sure, I'll do it" in a sarcastic, annoyed tone

	| Approach | Interpretation |
	|----------|---------------|
	| Textual Surrogate ❌ | "Person agrees to do the task" |
	| Acoustic-Grounded ✅ | "Person is reluctant/annoyed, may not follow through" |

	The acoustic-grounded approach captures the TRUE meaning!
	"""

	# Markdown shown on the "MGRD Solution" tab. The training-stage table uses
	# plain `|` delimiters; the backslash-escaped form is not a table delimiter
	# and is an invalid Python string escape.
	MGRD_CONTENT = """
	## 🔬 MGRD: Modality-Grounded Reasoning Distillation

	MGRD is the key innovation that makes Step-Audio-R1 work. It's an iterative training framework that teaches the model to reason over actual acoustic features instead of text surrogates.

	### The MGRD Pipeline

	```
	┌─────────────────────────────────────────────────┐
	│ MGRD ITERATIVE PROCESS │
	├─────────────────────────────────────────────────┤
	│ │
	│ START: Text-based reasoning (inherited) │
	│ ↓ │
	│ ITERATION 1: Generate reasoning chains │
	│ ↓ │
	│ FILTER: Remove textual surrogate chains │
	│ ↓ │
	│ SELECT: Keep acoustically-grounded chains │
	│ ↓ │
	│ RETRAIN: Update model with filtered data │
	│ ↓ │
	│ REPEAT until "Native Audio Think" emerges │
	│ ↓ │
	│ RESULT: Model reasons over acoustic features! │
	│ │
	└─────────────────────────────────────────────────┘
	```

	### Three Training Stages

	| Stage | Name | What Happens |
	|-------|------|--------------|
	| 1 | Cold-Start | SFT + RLVR to establish basic audio understanding |
	| 2 | Iterative Distillation | Filter and refine reasoning chains |
	| 3 | Native Audio Think | Model develops true acoustic reasoning |

	### What Makes a "Good" Reasoning Chain?

	❌ Bad (Textual Surrogate):
	> "The speaker says 'I'm fine' so they must be feeling okay."

	✅ Good (Acoustically-Grounded):
	> "The speaker's voice shows elevated pitch (+15%), faster tempo, and slight tremor, indicating stress despite saying 'I'm fine'. The background noise suggests a busy environment which may be contributing to their tension."

	The good chain references actual acoustic features!
	"""

	# Markdown shown on the "Architecture" tab. The component table uses plain
	# `|` delimiters; the backslash-escaped form is not a table delimiter and is
	# an invalid Python string escape.
	ARCHITECTURE_CONTENT = """
	## 🏗️ Step-Audio-R1 Architecture

	Step-Audio-R1 builds on Step-Audio 2 with three main components:

	```
	┌─────────────────────────────────────────────────────────────┐
	│ STEP-AUDIO-R1 ARCHITECTURE │
	├─────────────────────────────────────────────────────────────┤
	│ │
	│ 🎤 AUDIO INPUT (waveform) │
	│ │ │
	│ ▼ │
	│ ┌─────────────────────────────────┐ │
	│ │ AUDIO ENCODER │ │
	│ │ • Qwen2 Audio Encoder │ │
	│ │ • 25 Hz frame rate │ │
	│ │ • FROZEN during training │ │
	│ └─────────────────────────────────┘ │
	│ │ │
	│ ▼ │
	│ ┌─────────────────────────────────┐ │
	│ │ AUDIO ADAPTOR │ │
	│ │ • 2x downsampling │ │
	│ │ • 12.5 Hz output │ │
	│ │ • Bridge to LLM │ │
	│ └─────────────────────────────────┘ │
	│ │ │
	│ ▼ │
	│ ┌─────────────────────────────────┐ │
	│ │ LLM DECODER │ │
	│ │ • Qwen2.5 32B │ │
	│ │ • Core reasoning engine │ │
	│ │ • Outputs: Think → Response │ │
	│ └─────────────────────────────────┘ │
	│ │ │
	│ ▼ │
	│ 📝 TEXT OUTPUT │
	│ <thinking>...</thinking> │
	│ <response>...</response> │
	│ │
	└─────────────────────────────────────────────────────────────┘
	```

	### Component Details

	| Component | Model | Frame Rate | Status |
	|-----------|-------|------------|--------|
	| Audio Encoder | Qwen2 Audio | 25 Hz | Frozen |
	| Audio Adaptor | Custom MLP | 12.5 Hz (2x down) | Trainable |
	| LLM Decoder | Qwen2.5 32B | N/A | Trainable |

	### Output Format

	The model produces structured reasoning:

	```xml
	<thinking>
	1. Acoustic Analysis: [describes sound properties]
	2. Pattern Recognition: [identifies key features]
	3. Inference: [draws conclusions from audio]
	</thinking>

	<response>
	[Final answer based on acoustic reasoning]
	</response>
	```
	"""

	# Markdown shown on the "Benchmarks" tab. Both tables use plain `|`
	# delimiters; the backslash-escaped form is not a table delimiter and is an
	# invalid Python string escape.
	BENCHMARK_DATA = """
	## 📊 Benchmark Results

	Step-Audio-R1 was evaluated on comprehensive audio understanding benchmarks:

	### MMAU (Massive Multi-Task Audio Understanding)
	- 10,000 audio clips with human-annotated Q&A
	- 27 distinct skills tested
	- Covers: Speech, Environmental Sounds, Music

	### Performance Comparison

	| Model | MMAU Avg | vs Gemini 2.5 Pro |
	|-------|----------|-------------------|
	| Step-Audio-R1 | ~78% | +12% ✅ |
	| Gemini 3 Pro | ~77% | +11% |
	| Gemini 2.5 Pro | ~66% | baseline |
	| GPT-4o Audio | ~55% | -11% |
	| Qwen2.5-Omni | ~52% | -14% |

	### The Breakthrough: Test-Time Compute Scaling

	```
	BEFORE Step-Audio-R1:
	More thinking → ❌ Worse performance (inverted scaling)

	AFTER Step-Audio-R1:
	More thinking → ✅ Better performance (normal scaling)
	```

	This is the first time test-time compute scaling works for audio!

	### Domain Performance

	| Domain | Step-Audio-R1 | Previous SOTA |
	|--------|---------------|---------------|
	| Speech | 🟢 High | Medium |
	| Sound | 🟢 High | Medium |
	| Music | 🟢 High | Low |
	"""

	# Markdown shown on the "Applications" tab: five application areas followed
	# by a fenced, illustrative meeting-analysis transcript in the
	# <thinking>/<response> output format.
	APPLICATIONS_CONTENT = """
	## 🚀 Practical Applications

	Audio reasoning enables many new AI capabilities:

	### 1. 🎙️ Advanced Voice Assistants
	- Understand complex multi-step instructions
	- Detect user emotion and adjust responses
	- Handle ambiguous requests intelligently

	### 2. 📞 Call Center Analytics
	- Analyze customer sentiment in real-time
	- Detect escalation patterns before they happen
	- Extract action items from conversations

	### 3. ♿ Accessibility Tools
	- Rich audio descriptions for hearing impaired
	- Environmental sound narration
	- Music content analysis and description

	### 4. 🔒 Security & Monitoring
	- Anomalous sound event detection
	- Contextual threat assessment
	- Multi-source audio analysis

	### 5. 🎓 Education & Learning
	- Pronunciation analysis for language learning
	- Music performance evaluation
	- Lecture comprehension and Q&A

	### Example: Meeting Analysis

	```
	📥 Input: [30-minute team meeting recording]

	🤔 Step-Audio-R1 Analysis:

	<thinking>
	1. Speaker identification: 4 distinct voices detected
	2. Topic tracking: Budget discussion (0-10min),
	Project timeline (10-20min), Action items (20-30min)
	3. Sentiment analysis:
	- Speaker A: Confident, leading discussion
	- Speaker B: Concerned (elevated pitch during budget section)
	- Speaker C: Disengaged (low energy, minimal contributions)
	- Speaker D: Supportive, mediating tensions
	4. Key moments: Tension spike at 8:42 (disagreement on budget)
	</thinking>

	<response>
	Meeting Summary:
	- Main topics: Q3 budget allocation, Project Alpha timeline
	- Key decision: Budget approved with 10% reduction
	- Action items: 3 identified (assigned to Speakers A, B, D)
	- Team dynamics: Some tension around budget, resolved by end
	- Follow-up recommended: 1-on-1 with Speaker C (low engagement)
	</response>
	```
	"""

	# Markdown shown on the "Resources" tab: paper links, code/model links, a
	# glossary, a BibTeX citation and author credits. All three tables use plain
	# `|` delimiters; the backslash-escaped form is not a table delimiter and is
	# an invalid Python string escape.
	RESOURCES_CONTENT = """
	## 📚 Resources & Links

	### 📄 Papers
	| Paper | Link |
	|-------|------|
	| Step-Audio-R1 Technical Report | [arXiv:2511.15848](https://arxiv.org/abs/2511.15848) |
	| MMAU Benchmark | [arXiv:2410.19168](https://arxiv.org/abs/2410.19168) |
	| Audio-Reasoner | [arXiv:2503.02318](https://arxiv.org/abs/2503.02318) |
	| SpeechR Benchmark | [arXiv:2508.02018](https://arxiv.org/abs/2508.02018) |

	### 💻 Code & Models
	| Resource | Link |
	|----------|------|
	| Step-Audio-R1 GitHub | [github.com/stepfun-ai/Step-Audio-R1](https://github.com/stepfun-ai/Step-Audio-R1) |
	| Step-Audio-R1 Demo | [stepaudiollm.github.io/step-audio-r1](https://stepaudiollm.github.io/step-audio-r1/) |
	| HuggingFace Collection | [huggingface.co/collections/stepfun-ai/step-audio-r1](https://huggingface.co/collections/stepfun-ai/step-audio-r1) |
	| AudioBench | [github.com/AudioLLMs/AudioBench](https://github.com/AudioLLMs/AudioBench) |

	### 📖 Key Concepts Glossary

	| Term | Full Name | Description |
	|------|-----------|-------------|
	| LALM | Large Audio Language Model | AI model that understands and reasons over audio |
	| CoT | Chain-of-Thought | Step-by-step reasoning approach |
	| MGRD | Modality-Grounded Reasoning Distillation | Training framework for acoustic reasoning |
	| TSR | Textual Surrogate Reasoning | Problem where model reasons over text instead of audio |
	| RLVR | Reinforcement Learning with Verified Rewards | Training with binary correctness rewards |
	| SFT | Supervised Fine-Tuning | Standard fine-tuning on labeled data |

	### 📝 Citation

	```bibtex
	@article{stepaudioR1,
	title={Step-Audio-R1 Technical Report},
	author={Tian, Fei and others},
	journal={arXiv preprint arXiv:2511.15848},
	year={2025}
	}
	```

	---

	### 👤 About This Space

	Created by Mehmet Tuğrul Kaya
	- 🐙 GitHub: [@mtkaya](https://github.com/mtkaya)
	- 🤗 HuggingFace: [tugrulkaya](https://huggingface.co/tugrulkaya)

	This educational space explores the concepts behind Step-Audio-R1 and audio reasoning.
	"""

	# ============================================
	# HELPER FUNCTIONS (Yardımcı Fonksiyonlar)
	# ============================================

	def get_reasoning_type_info(reasoning_type):
	    """Build the Markdown detail page for one audio reasoning type.

	    Args:
	        reasoning_type: Display name of the reasoning type; expected to be
	            one of the keys of ``REASONING_TYPES``.

	    Returns:
	        A Markdown string describing the selected type (description, example
	        question, example audio scenario, model behaviour and key challenge),
	        or the prompt ``"Please select a reasoning type"`` when the name is
	        not a known key.
	    """
	    # Guard clause: unknown names (including None) get the prompt text.
	    if reasoning_type not in REASONING_TYPES:
	        return "Please select a reasoning type"

	    info = REASONING_TYPES[reasoning_type]

	    output = f"""
	## {info['emoji']} {reasoning_type}

	### Description
	{info['description']}

	### Example Question
	> "{info['example_question']}"

	### Example Audio Scenario
	🎧 {info['example_audio']}

	### What the Model Does
	{info['what_model_does']}

	### Key Challenge
	⚠️ {info['challenge']}

	---

	### How Step-Audio-R1 Handles This

	Unlike traditional models that would convert this to text first, Step-Audio-R1:

	1. Analyzes acoustic features directly from the audio waveform
	2. Generates reasoning chains grounded in sound properties
	3. Produces answers that account for non-verbal information

	This is what makes it the first true audio reasoning model!
	"""
	    return output


	def create_comparison_chart():
	    """Return the MMAU model-ranking table as a Markdown string.

	    The content is static. Table rows use plain ``|`` delimiters so the text
	    is parsed as a Markdown table; a backslash-escaped pipe would be rendered
	    literally and is an invalid escape sequence in a Python string literal.

	    Returns:
	        Markdown containing a heading, a six-row ranking table and a one-line
	        key insight.
	    """
	    return """
	### 📊 Model Comparison on MMAU Benchmark

	| Rank | Model | Score | Type |
	|------|-------|-------|------|
	| 🥇 | Step-Audio-R1 | ~78% | Open |
	| 🥈 | Gemini 3 Pro | ~77% | Proprietary |
	| 🥉 | Gemini 2.5 Pro | ~66% | Proprietary |
	| 4 | Audio Flamingo 3 | ~60% | Open |
	| 5 | GPT-4o Audio | ~55% | Proprietary |
	| 6 | Qwen2.5-Omni | ~52% | Open |

	Key Insight: Step-Audio-R1 is the first open model to match proprietary SOTA!
	"""


	def generate_demo_reasoning(scenario):
	    """Return the canned (simulated) reasoning write-up for a demo scenario.

	    No audio is processed; each scenario maps to a fixed Markdown document
	    with a scenario description and a fenced <thinking>/<response> example.

	    Args:
	        scenario: Scenario name. Known names are "Customer Service Call",
	            "Meeting Recording", "Podcast Episode" and "Music Analysis".

	    Returns:
	        The Markdown string for the scenario, or
	        ``"Please select a scenario"`` when the name is not recognised.
	    """
	    # Scenario name -> pre-written Markdown.
	    scenarios = {
	        "Customer Service Call": """
	## 🎧 Demo: Customer Service Call Analysis

	Scenario: A customer calls about a billing issue

	### Simulated Audio Description
	- Customer voice: Female, middle-aged, American English
	- Tone: Initially frustrated, becomes calmer
	- Agent voice: Male, professional, empathetic tone
	- Background: Quiet call center environment
	- Duration: ~3 minutes

	### Step-Audio-R1 Reasoning (Simulated)

	```xml
	<thinking>
	1. SPEAKER ANALYSIS:
	- Customer: F0 mean 220Hz (elevated), speech rate 4.2 syl/sec (fast)
	- Initial segment shows tension markers: pitch variation +40%
	- Gradual calming: pitch stabilizes by minute 2

	2. AGENT ANALYSIS:
	- Consistent calm prosody throughout
	- Strategic pauses after customer complaints
	- Mirroring technique detected (matching customer's pace)

	3. EMOTIONAL ARC:
	- 0:00-0:45: Customer frustrated (anger markers)
	- 0:45-1:30: Tension peak, interruption detected
	- 1:30-2:30: De-escalation successful
	- 2:30-3:00: Resolution, positive closing

	4. ACOUSTIC CONTEXT:
	- Low background noise suggests professional environment
	- No hold music interruptions
	- Clear audio quality on both sides
	</thinking>

	<response>
	This customer service interaction shows successful de-escalation.
	The customer initially displayed frustration (elevated pitch, fast
	speech) but the agent's calm, empathetic approach led to resolution.
	Key success factor: Agent's strategic use of pauses and mirroring.
	Customer satisfaction likely: HIGH (based on closing tone).
	</response>
	```
	""",
	        "Meeting Recording": """
	## 🎧 Demo: Meeting Recording Analysis

	Scenario: Team standup meeting (15 minutes)

	### Simulated Audio Description
	- 4 speakers in conference room
	- Mixed engagement levels
	- Some background office noise
	- One speaker joins via video call

	### Step-Audio-R1 Reasoning (Simulated)

	```xml
	<thinking>
	1. SPEAKER IDENTIFICATION:
	- Speaker A (Lead): Clear, confident, 65% of speaking time
	- Speaker B (Remote): Slight compression artifacts, engaged
	- Speaker C (Junior): Hesitant prosody, brief contributions
	- Speaker D (Senior): Minimal input, possible disengagement

	2. TOPIC TRACKING:
	- 0:00-5:00: Sprint review (Speaker A leading)
	- 5:00-10:00: Blockers discussion (B raises concern)
	- 10:00-15:00: Planning next steps

	3. ENGAGEMENT ANALYSIS:
	- Speaker C shows uncertainty (rising intonation on statements)
	- Speaker D energy levels low (F0 variance minimal)
	- Remote Speaker B most engaged despite audio quality

	4. ACOUSTIC ENVIRONMENT:
	- Room reverb suggests medium conference room
	- HVAC noise consistent throughout
	- Remote connection stable (no dropouts)

	5. KEY MOMENTS:
	- 6:23: Tension spike (overlapping speech, raised voices)
	- 12:45: Agreement reached (synchronized acknowledgments)
	</thinking>

	<response>
	Meeting Analysis Summary:
	- Duration: 15 minutes, 4 participants
	- Key discussion: Sprint blockers and resolution
	- Action items identified: 3 (assigned to Speakers A, B, C)
	- Team dynamics note: Speaker D showed low engagement -
	recommend follow-up. Speaker C may need support (uncertainty
	markers detected in their updates).
	- Overall meeting effectiveness: MODERATE
	</response>
	```
	""",
	        "Podcast Episode": """
	## 🎧 Demo: Podcast Episode Analysis

	Scenario: Tech podcast interview (45 minutes)

	### Simulated Audio Description
	- Host and guest conversation
	- Professional studio recording
	- Music intro/outro
	- Natural conversational flow

	### Step-Audio-R1 Reasoning (Simulated)

	```xml
	<thinking>
	1. AUDIO QUALITY ASSESSMENT:
	- Studio-quality recording (low noise floor)
	- Two distinct microphones detected
	- Music segments: Intro (0:00-0:30), Outro (44:30-45:00)
	- No significant audio issues

	2. SPEAKER DYNAMICS:
	- Host: Experienced interviewer, good pacing
	- Guest: Expert on topic, occasional filler words
	- Turn-taking: Natural, minimal interruptions
	- Rapport: Strong (laughter at 12:34, 23:45, 38:12)

	3. CONTENT STRUCTURE:
	- Introduction: 0:30-3:00
	- Main discussion: 3:00-40:00
	- Audience Q&A: 40:00-44:30
	- Closing: 44:30-45:00

	4. ENGAGEMENT INDICATORS:
	- Energy peaks: 15:00 (exciting announcement)
	- Deep technical discussion: 25:00-32:00
	- Emotional moment: 35:00 (personal story shared)

	5. PROSODIC ANALYSIS:
	- Guest becomes more animated on specific topics
	- Host maintains consistent energy throughout
	- Pace accelerates during exciting segments
	</thinking>

	<response>
	Podcast Episode Analysis:
	- Format: Interview style, 45 minutes
	- Quality: Professional studio recording
	- Key topics: [Inferred from discussion patterns]
	- Highlight moments: 15:00 (announcement), 35:00 (personal story)
	- Listener engagement prediction: HIGH
	- Recommended clips for promotion: 15:00-16:30, 35:00-36:45
	- Overall quality: EXCELLENT (clear audio, good rapport,
	well-structured conversation)
	</response>
	```
	""",
	        "Music Analysis": """
	## 🎧 Demo: Music Analysis

	Scenario: Unknown music track analysis

	### Simulated Audio Description
	- Instrumental track
	- ~4 minutes duration
	- Multiple instruments
	- Studio production

	### Step-Audio-R1 Reasoning (Simulated)

	```xml
	<thinking>
	1. ACOUSTIC ANALYSIS:
	- Tempo: ~120 BPM (moderate, danceable)
	- Key: A minor (melancholic but energetic)
	- Time signature: 4/4
	- Dynamic range: 12 dB (well-mastered)

	2. INSTRUMENTATION:
	- Drums: Electronic, four-on-the-floor pattern
	- Bass: Synthesizer, prominent low-end
	- Lead: Analog-style synth, saw wave
	- Pads: Ambient, reverb-heavy
	- No vocals detected

	3. STRUCTURE:
	- Intro: 0:00-0:30 (buildup)
	- Verse 1: 0:30-1:30 (main groove)
	- Build: 1:30-2:00 (tension)
	- Drop: 2:00-2:30 (energy peak)
	- Verse 2: 2:30-3:30 (variation)
	- Outro: 3:30-4:00 (fadeout)

	4. PRODUCTION ANALYSIS:
	- Sidechain compression detected on pads
	- Stereo width: Wide (good separation)
	- Reference similar to: Melodic techno genre
	- Production quality: Professional

	5. EMOTIONAL CHARACTER:
	- Overall mood: Driving but melancholic
	- Energy arc: Building → Peak → Sustain → Release
	</thinking>

	<response>
	Music Analysis Summary:
	- Genre: Melodic Techno / Progressive House
	- Tempo: 120 BPM
	- Key: A minor
	- Duration: ~4 minutes
	- Mood: Energetic yet melancholic
	- Production: Professional quality, well-mastered
	- Use cases: DJ sets, workout playlists, focus music
	- Similar artists: [Based on production style]
	- Standout elements: Strong bass design, effective buildup
	</response>
	```
	"""
	    }

	    # Unknown names (including None) fall back to the prompt text.
	    return scenarios.get(scenario, "Please select a scenario")


	# ============================================
	# GRADIO INTERFACE (Arayüz)
	# ============================================

	# Custom CSS passed to gr.Blocks(css=...): caps the container width, enlarges
	# tab buttons, recolors .prose h1/h2 headings and defines a gradient
	# .highlight-box helper class.
	custom_css = """
	.gradio-container {
	max-width: 1200px !important;
	}
	.tab-nav button {
	font-size: 16px !important;
	}
	.prose h1 {
	color: #FF6B35 !important;
	}
	.prose h2 {
	color: #4ECDC4 !important;
	border-bottom: 2px solid #4ECDC4;
	padding-bottom: 5px;
	}
	.highlight-box {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	padding: 20px;
	border-radius: 10px;
	color: white;
	}
	"""

	# Build the interface.
	# Layout: an HTML header, nine tabs (static Markdown pages plus two
	# dropdown-driven views) and an HTML footer, all inside one gr.Blocks app
	# bound to `demo`. Nesting of the `with` contexts determines where each
	# component is placed, so the indentation below is significant.
	with gr.Blocks(css=custom_css, title="🎧 Audio Reasoning Explorer", theme=gr.themes.Soft()) as demo:

	    # Header
	    gr.Markdown("""
	<div style="text-align: center; padding: 20px;">
	<h1>🎧 Audio Reasoning & Step-Audio-R1 Explorer</h1>
	<p style="font-size: 18px; color: #666;">
	Interactive guide to understanding how AI learns to think about sound
	</p>
	</div>
	""")

	    # Main tabs
	    with gr.Tabs():

	        # Tab 1: Introduction
	        with gr.TabItem("🏠 Introduction", id=0):
	            gr.Markdown(INTRO_CONTENT)

	        # Tab 2: Audio Reasoning Types
	        with gr.TabItem("🧠 Reasoning Types", id=1):
	            gr.Markdown("## 🧠 Types of Audio Reasoning\n\nSelect a reasoning type to learn more:")

	            with gr.Row():
	                # Left column: selector plus a one-line summary per type.
	                with gr.Column(scale=1):
	                    reasoning_dropdown = gr.Dropdown(
	                        choices=list(REASONING_TYPES.keys()),
	                        label="Select Reasoning Type",
	                        value="Factual Reasoning"
	                    )

	                    gr.Markdown("### Quick Overview")
	                    for rtype, info in REASONING_TYPES.items():
	                        gr.Markdown(f"{info['emoji']} {rtype}: {info['description']}")

	                # Right column: detail page, pre-filled for the default choice.
	                with gr.Column(scale=2):
	                    reasoning_output = gr.Markdown(
	                        value=get_reasoning_type_info("Factual Reasoning")
	                    )

	            # Re-render the detail page whenever the selection changes.
	            reasoning_dropdown.change(
	                fn=get_reasoning_type_info,
	                inputs=[reasoning_dropdown],
	                outputs=[reasoning_output]
	            )

	        # Tab 3: The Problem
	        with gr.TabItem("🚫 The Problem", id=2):
	            gr.Markdown(PROBLEM_CONTENT)

	        # Tab 4: MGRD Solution
	        with gr.TabItem("🔬 MGRD Solution", id=3):
	            gr.Markdown(MGRD_CONTENT)

	        # Tab 5: Architecture
	        with gr.TabItem("🏗️ Architecture", id=4):
	            gr.Markdown(ARCHITECTURE_CONTENT)

	        # Tab 6: Benchmarks
	        with gr.TabItem("📊 Benchmarks", id=5):
	            gr.Markdown(BENCHMARK_DATA)
	            gr.Markdown(create_comparison_chart())

	        # Tab 7: Interactive Demo
	        with gr.TabItem("🎮 Interactive Demo", id=6):
	            gr.Markdown("""
	## 🎮 Interactive Audio Reasoning Demo

	See how Step-Audio-R1 would analyze different audio scenarios!

	*Note: This is a simulation showing the reasoning process.
	The actual model processes real audio input.*
	""")

	            with gr.Row():
	                # Left column: scenario selector, trigger button, explainer.
	                with gr.Column(scale=1):
	                    scenario_dropdown = gr.Dropdown(
	                        choices=[
	                            "Customer Service Call",
	                            "Meeting Recording",
	                            "Podcast Episode",
	                            "Music Analysis"
	                        ],
	                        label="Select Audio Scenario",
	                        value="Customer Service Call"
	                    )

	                    analyze_btn = gr.Button("🔍 Analyze Scenario", variant="primary")

	                    gr.Markdown("""
	### What This Shows

	Each scenario demonstrates:
	1. Acoustic analysis - What the model "hears"
	2. Reasoning process - Step-by-step thinking
	3. Final output - Actionable insights

	This is the power of audio reasoning!
	""")

	                # Right column: canned write-up, pre-filled for the default.
	                with gr.Column(scale=2):
	                    demo_output = gr.Markdown(
	                        value=generate_demo_reasoning("Customer Service Call")
	                    )

	            # The write-up is refreshed only on button click, not on
	            # dropdown change.
	            analyze_btn.click(
	                fn=generate_demo_reasoning,
	                inputs=[scenario_dropdown],
	                outputs=[demo_output]
	            )

	        # Tab 8: Applications
	        with gr.TabItem("🚀 Applications", id=7):
	            gr.Markdown(APPLICATIONS_CONTENT)

	        # Tab 9: Resources
	        with gr.TabItem("📚 Resources", id=8):
	            gr.Markdown(RESOURCES_CONTENT)

	    # Footer (plain `|` separators: inside raw HTML a backslash would be
	    # displayed literally, and `\|` is an invalid Python string escape).
	    gr.Markdown("""
	---
	<div style="text-align: center; padding: 20px; color: #666;">
	<p>Created by <strong>Mehmet Tuğrul Kaya</strong> |
	<a href="https://github.com/mtkaya">GitHub</a> |
	<a href="https://huggingface.co/tugrulkaya">HuggingFace</a></p>
	<p>🎧 Sound Speaks, AI Listens and Thinks 🧠</p>
	</div>
	""")

	# Launch the app only when this file is executed as a script; the call is
	# indented so it is the body of the guard rather than a syntax error.
	if __name__ == "__main__":
	    demo.launch()