"""
๐ง Audio Reasoning & Step-Audio-R1 Explorer
Interactive Hugging Face Space for exploring audio reasoning concepts
Author: Mehmet Tuฤrul Kaya
"""
import gradio as gr
# ============================================
# CONTENT DATA
# ============================================
INTRO_CONTENT = """
# 🎧 Audio Reasoning & Step-Audio-R1
## Teaching AI to Think About Sound
**Step-Audio-R1** is the first audio language model to successfully unlock reasoning capabilities in the audio domain.
This space explores the groundbreaking concepts behind audio reasoning and the innovative MGRD framework.
### 🎯 Key Achievement
> *"Can audio intelligence truly benefit from deliberate thinking?"* → **YES!**
Step-Audio-R1 proves that reasoning is a **transferable capability across modalities** when properly grounded in acoustic features.
---
### 📊 Quick Stats
| Metric | Value |
|--------|-------|
| **Model Size** | 32B parameters (Qwen2.5 LLM) |
| **Audio Encoder** | Qwen2 (25 Hz, frozen) |
| **Performance** | Surpasses Gemini 2.5 Pro |
| **Innovation** | First successful audio reasoning model |
---
*Navigate through the tabs to explore different aspects of audio reasoning!*
"""
# Audio Reasoning Types Data
REASONING_TYPES = {
"Factual Reasoning": {
"emoji": "๐",
"description": "Extracting concrete information from audio",
"example_question": "What date is mentioned in this conversation?",
"example_audio": "A business call discussing a meeting scheduled for March 15th",
"what_model_does": "Identifies specific facts, numbers, names, dates from speech content",
"challenge": "Requires accurate speech recognition + information extraction"
},
"Procedural Reasoning": {
"emoji": "๐",
"description": "Understanding step-by-step processes and sequences",
"example_question": "What is the third step in this instruction set?",
"example_audio": "A cooking tutorial explaining how to make pasta",
"what_model_does": "Tracks sequential information, understands ordering and dependencies",
"challenge": "Must maintain context across long audio segments"
},
"Normative Reasoning": {
"emoji": "โ๏ธ",
"description": "Evaluating social, ethical, or behavioral norms",
"example_question": "Is the speaker behaving appropriately in this dialogue?",
"example_audio": "A customer service call with an upset customer",
"what_model_does": "Assesses tone, politeness, social appropriateness based on context",
"challenge": "Requires understanding of social norms + prosodic analysis"
},
"Contextual Reasoning": {
"emoji": "๐",
"description": "Inferring environmental and situational context",
"example_question": "Where might this sound have been recorded?",
"example_audio": "Background noise with birds, wind, and distant traffic",
"what_model_does": "Analyzes ambient sounds to determine location/situation",
"challenge": "Must process non-speech audio elements"
},
"Causal Reasoning": {
"emoji": "๐",
"description": "Establishing cause-effect relationships",
"example_question": "Why might this sound event have occurred?",
"example_audio": "A loud crash followed by glass breaking",
"what_model_does": "Infers causality from sound sequences and patterns",
"challenge": "Requires world knowledge + temporal understanding"
}
}
# The Problem Content
PROBLEM_CONTENT = """
## 🚫 The Inverted Scaling Anomaly
### The Paradox
Traditional audio language models showed a **strange behavior**: the longer they reasoned, the **WORSE** they performed!
This is the opposite of what happens in text models (like GPT-4, Claude) where more thinking = better answers.
### Root Cause: Textual Surrogate Reasoning
```
🔊 Audio Input
      ↓
📝 Model converts to text (transcript)
      ↓
🧠 Reasons over TEXT, not SOUND
      ↓
❌ Acoustic features IGNORED
      ↓
📉 Performance degrades with longer reasoning
```
### Why Does This Happen?
1. **Text-based initialization**: Models are fine-tuned from text LLMs
2. **Inherited patterns**: They learn to reason like text models
3. **Modality mismatch**: Audio is treated as "text with extra steps"
4. **Lost information**: Tone, emotion, prosody, ambient sounds are ignored
### Real Example
**Audio**: Person says "Sure, I'll do it" in a *sarcastic, annoyed tone*
| Approach | Interpretation |
|----------|---------------|
| **Textual Surrogate** ❌ | "Person agrees to do the task" |
| **Acoustic-Grounded** ✅ | "Person is reluctant/annoyed, may not follow through" |
The acoustic-grounded approach captures the TRUE meaning!
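
To make the contrast concrete, here is a deliberately naive Python sketch (the `interpret` function and its prosody thresholds are invented for illustration; they are not part of any real model):

```python
# Toy sketch: the same transcript flips meaning once hypothetical
# prosody features (pitch_shift, tempo_shift) are taken into account.
def interpret(transcript, pitch_shift=0.0, tempo_shift=0.0):
    reading = f"Literal reading: speaker agrees ({transcript!r})"
    # Textual surrogate reasoning stops at the line above -- tone is invisible.
    if pitch_shift > 0.1 or tempo_shift > 0.1:  # crude tension markers
        reading += " -- but prosody signals reluctance/annoyance"
    return reading

print(interpret("Sure, I'll do it"))                    # surrogate view
print(interpret("Sure, I'll do it", pitch_shift=0.15))  # grounded view
```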
"""
# MGRD Content
MGRD_CONTENT = """
## 🔬 MGRD: Modality-Grounded Reasoning Distillation
MGRD is the **key innovation** that makes Step-Audio-R1 work. It's an iterative training framework that teaches the model to reason over actual acoustic features instead of text surrogates.
### The MGRD Pipeline
```
┌─────────────────────────────────────────────────┐
│             MGRD ITERATIVE PROCESS              │
├─────────────────────────────────────────────────┤
│                                                 │
│  START: Text-based reasoning (inherited)        │
│    ↓                                            │
│  ITERATION 1: Generate reasoning chains         │
│    ↓                                            │
│  FILTER: Remove textual surrogate chains        │
│    ↓                                            │
│  SELECT: Keep acoustically-grounded chains      │
│    ↓                                            │
│  RETRAIN: Update model with filtered data       │
│    ↓                                            │
│  REPEAT until "Native Audio Think" emerges      │
│    ↓                                            │
│  RESULT: Model reasons over acoustic features!  │
│                                                 │
└─────────────────────────────────────────────────┘
```
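
In code, one distillation round might look like the minimal sketch below (the `generate_chain` / `finetune` calls and the keyword filter are hypothetical stand-ins, not the authors' actual training code):

```python
# Illustrative MGRD-style iteration: sample chains, drop textual
# surrogates, retrain on the acoustically-grounded survivors.
ACOUSTIC_CUES = ("pitch", "tempo", "prosody", "timbre", "background", "reverb")

def is_acoustically_grounded(chain):
    # Crude proxy filter: keep chains that cite concrete acoustic evidence.
    return any(cue in chain.lower() for cue in ACOUSTIC_CUES)

def mgrd_iteration(model, audio_batch):
    chains = [model.generate_chain(clip) for clip in audio_batch]  # generate
    kept = [c for c in chains if is_acoustically_grounded(c)]      # filter
    model.finetune(kept)                                           # retrain
    return len(kept) / max(len(chains), 1)  # grounding rate to monitor
```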
### Three Training Stages
| Stage | Name | What Happens |
|-------|------|--------------|
| **1** | Cold-Start | SFT + RLVR to establish basic audio understanding |
| **2** | Iterative Distillation | Filter and refine reasoning chains |
| **3** | Native Audio Think | Model develops true acoustic reasoning |
### What Makes a "Good" Reasoning Chain?
**❌ Bad (Textual Surrogate):**
> "The speaker says 'I'm fine' so they must be feeling okay."
**✅ Good (Acoustically-Grounded):**
> "The speaker's voice shows elevated pitch (+15%), faster tempo, and slight tremor, indicating stress despite saying 'I'm fine'. The background noise suggests a busy environment which may be contributing to their tension."
The good chain references **actual acoustic features**!
"""
# Architecture Content
ARCHITECTURE_CONTENT = """
## 🏗️ Step-Audio-R1 Architecture
Step-Audio-R1 builds on Step-Audio 2 with three main components:
```
┌──────────────────────────────────────────────┐
│          STEP-AUDIO-R1 ARCHITECTURE          │
├──────────────────────────────────────────────┤
│                                              │
│  🎤 AUDIO INPUT (waveform)                   │
│                 │                            │
│                 ▼                            │
│  ┌────────────────────────────────┐          │
│  │        AUDIO ENCODER           │          │
│  │  • Qwen2 Audio Encoder         │          │
│  │  • 25 Hz frame rate            │          │
│  │  • FROZEN during training      │          │
│  └────────────────────────────────┘          │
│                 │                            │
│                 ▼                            │
│  ┌────────────────────────────────┐          │
│  │        AUDIO ADAPTOR           │          │
│  │  • 2x downsampling             │          │
│  │  • 12.5 Hz output              │          │
│  │  • Bridge to LLM               │          │
│  └────────────────────────────────┘          │
│                 │                            │
│                 ▼                            │
│  ┌────────────────────────────────┐          │
│  │        LLM DECODER             │          │
│  │  • Qwen2.5 32B                 │          │
│  │  • Core reasoning engine       │          │
│  │  • Outputs: Think → Response   │          │
│  └────────────────────────────────┘          │
│                 │                            │
│                 ▼                            │
│  📝 TEXT OUTPUT                              │
│                                              │
└──────────────────────────────────────────────┘
```
### Component Details
| Component | Model | Frame Rate | Status |
|-----------|-------|------------|--------|
| Audio Encoder | Qwen2 Audio | 25 Hz | Frozen |
| Audio Adaptor | Custom MLP | 12.5 Hz (2x down) | Trainable |
| LLM Decoder | Qwen2.5 32B | N/A | Trainable |
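
As a shape-level sketch of the adaptor idea (a minimal sketch only; the dimensions and MLP layout are illustrative guesses, not the released checkpoint's configuration):

```python
import torch
import torch.nn as nn

class AudioAdaptor(nn.Module):
    # 2x temporal downsampling: pair adjacent 25 Hz frames -> 12.5 Hz tokens.
    def __init__(self, enc_dim=1280, llm_dim=5120):  # assumed sizes
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(enc_dim * 2, llm_dim), nn.GELU(), nn.Linear(llm_dim, llm_dim)
        )

    def forward(self, x):  # x: (batch, T, enc_dim) at 25 Hz
        b, t, d = x.shape
        x = x[:, : t - t % 2].reshape(b, t // 2, 2 * d)  # stack frame pairs
        return self.proj(x)  # (batch, T // 2, llm_dim) at 12.5 Hz

feats = torch.randn(1, 50, 1280)    # 2 seconds of 25 Hz encoder frames
print(AudioAdaptor()(feats).shape)  # torch.Size([1, 25, 5120])
```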
### Output Format
The model produces structured reasoning:
```xml
<think>
1. Acoustic Analysis: [describes sound properties]
2. Pattern Recognition: [identifies key features]
3. Inference: [draws conclusions from audio]
</think>
[Final answer based on acoustic reasoning]
```
"""
# Benchmarks Data
BENCHMARK_DATA = """
## 📊 Benchmark Results
Step-Audio-R1 was evaluated on comprehensive audio understanding benchmarks:
### MMAU (Massive Multi-Task Audio Understanding)
- **10,000** audio clips with human-annotated Q&A
- **27** distinct skills tested
- Covers: Speech, Environmental Sounds, Music
### Performance Comparison
| Model | MMAU Avg | vs Gemini 2.5 Pro |
|-------|----------|-------------------|
| **Step-Audio-R1** | **~78%** | **+12%** ✅ |
| Gemini 3 Pro | ~77% | +11% |
| Gemini 2.5 Pro | ~66% | baseline |
| GPT-4o Audio | ~55% | -11% |
| Qwen2.5-Omni | ~52% | -14% |
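
The comparison column is simply the absolute percentage-point gap over the Gemini 2.5 Pro baseline (scores rounded as in the table above):

```python
# Recompute the "vs Gemini 2.5 Pro" column from the rounded scores.
scores = {"Step-Audio-R1": 78, "Gemini 3 Pro": 77, "Gemini 2.5 Pro": 66,
          "GPT-4o Audio": 55, "Qwen2.5-Omni": 52}
baseline = scores["Gemini 2.5 Pro"]
for model, s in scores.items():
    print(f"{model}: {s - baseline:+d} points vs baseline")
```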
### The Breakthrough: Test-Time Compute Scaling
```
BEFORE Step-Audio-R1:
  More thinking → ❌ Worse performance (inverted scaling)

AFTER Step-Audio-R1:
  More thinking → ✅ Better performance (normal scaling)
```
**This is the first time test-time compute scaling works for audio!**
### Domain Performance
| Domain | Step-Audio-R1 | Previous SOTA |
|--------|---------------|---------------|
| Speech | 🟢 High | Medium |
| Sound | 🟢 High | Medium |
| Music | 🟢 High | Low |
"""
# Applications Content
APPLICATIONS_CONTENT = """
## 🚀 Practical Applications
Audio reasoning enables many new AI capabilities:
### 1. 🎙️ Advanced Voice Assistants
- Understand complex multi-step instructions
- Detect user emotion and adjust responses
- Handle ambiguous requests intelligently
### 2. 📞 Call Center Analytics
- Analyze customer sentiment in real-time
- Detect escalation patterns before they happen
- Extract action items from conversations
### 3. ♿ Accessibility Tools
- Rich audio descriptions for hearing-impaired users
- Environmental sound narration
- Music content analysis and description
### 4. 🔒 Security & Monitoring
- Anomalous sound event detection
- Contextual threat assessment
- Multi-source audio analysis
### 5. 🎓 Education & Learning
- Pronunciation analysis for language learning
- Music performance evaluation
- Lecture comprehension and Q&A
### Example: Meeting Analysis
```
🎥 Input: [30-minute team meeting recording]
🤖 Step-Audio-R1 Analysis:
1. Speaker identification: 4 distinct voices detected
2. Topic tracking: Budget discussion (0-10min),
Project timeline (10-20min), Action items (20-30min)
3. Sentiment analysis:
- Speaker A: Confident, leading discussion
- Speaker B: Concerned (elevated pitch during budget section)
- Speaker C: Disengaged (low energy, minimal contributions)
- Speaker D: Supportive, mediating tensions
4. Key moments: Tension spike at 8:42 (disagreement on budget)
Meeting Summary:
- Main topics: Q3 budget allocation, Project Alpha timeline
- Key decision: Budget approved with 10% reduction
- Action items: 3 identified (assigned to Speakers A, B, D)
- Team dynamics: Some tension around budget, resolved by end
- Follow-up recommended: 1-on-1 with Speaker C (low engagement)
```
"""
# Resources Content
RESOURCES_CONTENT = """
## 📚 Resources & Links
### 📄 Papers
| Paper | Link |
|-------|------|
| Step-Audio-R1 Technical Report | [arXiv:2511.15848](https://arxiv.org/abs/2511.15848) |
| MMAU Benchmark | [arXiv:2410.19168](https://arxiv.org/abs/2410.19168) |
| Audio-Reasoner | [arXiv:2503.02318](https://arxiv.org/abs/2503.02318) |
| SpeechR Benchmark | [arXiv:2508.02018](https://arxiv.org/abs/2508.02018) |
### 💻 Code & Models
| Resource | Link |
|----------|------|
| Step-Audio-R1 GitHub | [github.com/stepfun-ai/Step-Audio-R1](https://github.com/stepfun-ai/Step-Audio-R1) |
| Step-Audio-R1 Demo | [stepaudiollm.github.io/step-audio-r1](https://stepaudiollm.github.io/step-audio-r1/) |
| HuggingFace Collection | [huggingface.co/collections/stepfun-ai/step-audio-r1](https://huggingface.co/collections/stepfun-ai/step-audio-r1) |
| AudioBench | [github.com/AudioLLMs/AudioBench](https://github.com/AudioLLMs/AudioBench) |
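
If you want to try the model itself, a hypothetical loading sketch is below; it assumes the checkpoint follows the standard `transformers` `trust_remote_code` pattern, so verify the actual model id and entry point in the official README:

```python
# Assumed API -- the repo may ship its own inference wrapper instead.
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "stepfun-ai/Step-Audio-R1"  # assumed Hugging Face repo id
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, device_map="auto"
)
```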
### 📖 Key Concepts Glossary
| Term | Full Name | Description |
|------|-----------|-------------|
| **LALM** | Large Audio Language Model | AI model that understands and reasons over audio |
| **CoT** | Chain-of-Thought | Step-by-step reasoning approach |
| **MGRD** | Modality-Grounded Reasoning Distillation | Training framework for acoustic reasoning |
| **TSR** | Textual Surrogate Reasoning | Problem where model reasons over text instead of audio |
| **RLVR** | Reinforcement Learning with Verifiable Rewards | Training with binary correctness rewards |
| **SFT** | Supervised Fine-Tuning | Standard fine-tuning on labeled data |
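
As a toy illustration of the RLVR idea from the glossary (a binary, verifiable correctness reward; not the authors' implementation):

```python
# Reward is 1.0 only when the final answer matches a verifiable target.
def verified_reward(model_answer, reference):
    return 1.0 if model_answer.strip().lower() == reference.strip().lower() else 0.0

print(verified_reward("A minor", "a minor"))  # 1.0
print(verified_reward("C major", "a minor"))  # 0.0
```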
### 📝 Citation
```bibtex
@article{stepaudioR1,
title={Step-Audio-R1 Technical Report},
author={Tian, Fei and others},
journal={arXiv preprint arXiv:2511.15848},
year={2025}
}
```
---
### 👤 About This Space
Created by **Mehmet Tuğrul Kaya**
- 🌐 GitHub: [@mtkaya](https://github.com/mtkaya)
- 🤗 HuggingFace: [tugrulkaya](https://huggingface.co/tugrulkaya)
*This educational space explores the concepts behind Step-Audio-R1 and audio reasoning.*
"""
# ============================================
# HELPER FUNCTIONS
# ============================================
def get_reasoning_type_info(reasoning_type):
"""Get detailed information about a reasoning type"""
if reasoning_type not in REASONING_TYPES:
return "Please select a reasoning type"
info = REASONING_TYPES[reasoning_type]
output = f"""
## {info['emoji']} {reasoning_type}
### Description
{info['description']}
### Example Question
> *"{info['example_question']}"*
### Example Audio Scenario
🎧 {info['example_audio']}
### What the Model Does
{info['what_model_does']}
### Key Challenge
⚠️ {info['challenge']}
---
### How Step-Audio-R1 Handles This
Unlike traditional models that would convert this to text first, Step-Audio-R1:
1. **Analyzes acoustic features** directly from the audio waveform
2. **Generates reasoning chains** grounded in sound properties
3. **Produces answers** that account for non-verbal information
This is what makes it the first true **audio reasoning** model!
"""
return output
def create_comparison_chart():
"""Create model comparison data"""
return """
### 🏆 Model Comparison on MMAU Benchmark
| Rank | Model | Score | Type |
|------|-------|-------|------|
| 🥇 | **Step-Audio-R1** | ~78% | Open |
| 🥈 | Gemini 3 Pro | ~77% | Proprietary |
| 🥉 | Gemini 2.5 Pro | ~66% | Proprietary |
| 4 | Audio Flamingo 3 | ~60% | Open |
| 5 | GPT-4o Audio | ~55% | Proprietary |
| 6 | Qwen2.5-Omni | ~52% | Open |
**Key Insight**: Step-Audio-R1 is the first **open** model to match proprietary SOTA!
"""
def generate_demo_reasoning(scenario):
"""Generate example reasoning for demo scenarios"""
scenarios = {
"Customer Service Call": """
## 🎧 Demo: Customer Service Call Analysis
**Scenario**: A customer calls about a billing issue
### Simulated Audio Description
- Customer voice: Female, middle-aged, American English
- Tone: Initially frustrated, becomes calmer
- Agent voice: Male, professional, empathetic tone
- Background: Quiet call center environment
- Duration: ~3 minutes
### Step-Audio-R1 Reasoning (Simulated)
```xml
<think>
1. SPEAKER ANALYSIS:
- Customer: F0 mean 220Hz (elevated), speech rate 4.2 syl/sec (fast)
- Initial segment shows tension markers: pitch variation +40%
- Gradual calming: pitch stabilizes by minute 2
2. AGENT ANALYSIS:
- Consistent calm prosody throughout
- Strategic pauses after customer complaints
- Mirroring technique detected (matching customer's pace)
3. EMOTIONAL ARC:
- 0:00-0:45: Customer frustrated (anger markers)
- 0:45-1:30: Tension peak, interruption detected
- 1:30-2:30: De-escalation successful
- 2:30-3:00: Resolution, positive closing
4. ACOUSTIC CONTEXT:
- Low background noise suggests professional environment
- No hold music interruptions
   - Clear audio quality on both sides
</think>

This customer service interaction shows successful de-escalation.
The customer initially displayed frustration (elevated pitch, fast
speech) but the agent's calm, empathetic approach led to resolution.
Key success factor: Agent's strategic use of pauses and mirroring.
Customer satisfaction likely: HIGH (based on closing tone).
```
""",
"Meeting Recording": """
## 🎧 Demo: Meeting Recording Analysis
**Scenario**: Team standup meeting (15 minutes)
### Simulated Audio Description
- 4 speakers in conference room
- Mixed engagement levels
- Some background office noise
- One speaker joins via video call
### Step-Audio-R1 Reasoning (Simulated)
```xml
<think>
1. SPEAKER IDENTIFICATION:
- Speaker A (Lead): Clear, confident, 65% of speaking time
- Speaker B (Remote): Slight compression artifacts, engaged
- Speaker C (Junior): Hesitant prosody, brief contributions
- Speaker D (Senior): Minimal input, possible disengagement
2. TOPIC TRACKING:
- 0:00-5:00: Sprint review (Speaker A leading)
- 5:00-10:00: Blockers discussion (B raises concern)
- 10:00-15:00: Planning next steps
3. ENGAGEMENT ANALYSIS:
- Speaker C shows uncertainty (rising intonation on statements)
- Speaker D energy levels low (F0 variance minimal)
- Remote Speaker B most engaged despite audio quality
4. ACOUSTIC ENVIRONMENT:
- Room reverb suggests medium conference room
- HVAC noise consistent throughout
- Remote connection stable (no dropouts)
5. KEY MOMENTS:
- 6:23: Tension spike (overlapping speech, raised voices)
   - 12:45: Agreement reached (synchronized acknowledgments)
</think>

Meeting Analysis Summary:
- Duration: 15 minutes, 4 participants
- Key discussion: Sprint blockers and resolution
- Action items identified: 3 (assigned to Speakers A, B, C)
- Team dynamics note: Speaker D showed low engagement -
recommend follow-up. Speaker C may need support (uncertainty
markers detected in their updates).
- Overall meeting effectiveness: MODERATE
```
""",
"Podcast Episode": """
## 🎧 Demo: Podcast Episode Analysis
**Scenario**: Tech podcast interview (45 minutes)
### Simulated Audio Description
- Host and guest conversation
- Professional studio recording
- Music intro/outro
- Natural conversational flow
### Step-Audio-R1 Reasoning (Simulated)
```xml
<think>
1. AUDIO QUALITY ASSESSMENT:
- Studio-quality recording (low noise floor)
- Two distinct microphones detected
- Music segments: Intro (0:00-0:30), Outro (44:30-45:00)
- No significant audio issues
2. SPEAKER DYNAMICS:
- Host: Experienced interviewer, good pacing
- Guest: Expert on topic, occasional filler words
- Turn-taking: Natural, minimal interruptions
- Rapport: Strong (laughter at 12:34, 23:45, 38:12)
3. CONTENT STRUCTURE:
- Introduction: 0:30-3:00
- Main discussion: 3:00-40:00
- Audience Q&A: 40:00-44:30
- Closing: 44:30-45:00
4. ENGAGEMENT INDICATORS:
- Energy peaks: 15:00 (exciting announcement)
- Deep technical discussion: 25:00-32:00
- Emotional moment: 35:00 (personal story shared)
5. PROSODIC ANALYSIS:
- Guest becomes more animated on specific topics
- Host maintains consistent energy throughout
   - Pace accelerates during exciting segments
</think>

Podcast Episode Analysis:
- Format: Interview style, 45 minutes
- Quality: Professional studio recording
- Key topics: [Inferred from discussion patterns]
- Highlight moments: 15:00 (announcement), 35:00 (personal story)
- Listener engagement prediction: HIGH
- Recommended clips for promotion: 15:00-16:30, 35:00-36:45
- Overall quality: EXCELLENT (clear audio, good rapport,
well-structured conversation)
```
""",
"Music Analysis": """
## 🎧 Demo: Music Analysis
**Scenario**: Unknown music track analysis
### Simulated Audio Description
- Instrumental track
- ~4 minutes duration
- Multiple instruments
- Studio production
### Step-Audio-R1 Reasoning (Simulated)
```xml
<think>
1. ACOUSTIC ANALYSIS:
- Tempo: ~120 BPM (moderate, danceable)
- Key: A minor (melancholic but energetic)
- Time signature: 4/4
- Dynamic range: 12 dB (well-mastered)
2. INSTRUMENTATION:
- Drums: Electronic, four-on-the-floor pattern
- Bass: Synthesizer, prominent low-end
- Lead: Analog-style synth, saw wave
- Pads: Ambient, reverb-heavy
- No vocals detected
3. STRUCTURE:
- Intro: 0:00-0:30 (buildup)
- Verse 1: 0:30-1:30 (main groove)
- Build: 1:30-2:00 (tension)
- Drop: 2:00-2:30 (energy peak)
- Verse 2: 2:30-3:30 (variation)
- Outro: 3:30-4:00 (fadeout)
4. PRODUCTION ANALYSIS:
- Sidechain compression detected on pads
- Stereo width: Wide (good separation)
- Reference similar to: Melodic techno genre
- Production quality: Professional
5. EMOTIONAL CHARACTER:
- Overall mood: Driving but melancholic
   - Energy arc: Building → Peak → Sustain → Release
</think>

Music Analysis Summary:
- Genre: Melodic Techno / Progressive House
- Tempo: 120 BPM
- Key: A minor
- Duration: ~4 minutes
- Mood: Energetic yet melancholic
- Production: Professional quality, well-mastered
- Use cases: DJ sets, workout playlists, focus music
- Similar artists: [Based on production style]
- Standout elements: Strong bass design, effective buildup
```
"""
}
return scenarios.get(scenario, "Please select a scenario")
# ============================================
# GRADIO INTERFACE
# ============================================
# Custom CSS
custom_css = """
.gradio-container {
max-width: 1200px !important;
}
.tab-nav button {
font-size: 16px !important;
}
.prose h1 {
color: #FF6B35 !important;
}
.prose h2 {
color: #4ECDC4 !important;
border-bottom: 2px solid #4ECDC4;
padding-bottom: 5px;
}
.highlight-box {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 20px;
border-radius: 10px;
color: white;
}
"""
# Build the interface
with gr.Blocks(css=custom_css, title="🎧 Audio Reasoning Explorer", theme=gr.themes.Soft()) as demo:
# Header
gr.Markdown("""
๐ง Audio Reasoning & Step-Audio-R1 Explorer
Interactive guide to understanding how AI learns to think about sound
""")
# Main tabs
with gr.Tabs():
# Tab 1: Introduction
with gr.TabItem("๐ Introduction", id=0):
gr.Markdown(INTRO_CONTENT)
# Tab 2: Audio Reasoning Types
with gr.TabItem("๐ง Reasoning Types", id=1):
gr.Markdown("## ๐ง Types of Audio Reasoning\n\nSelect a reasoning type to learn more:")
with gr.Row():
with gr.Column(scale=1):
reasoning_dropdown = gr.Dropdown(
choices=list(REASONING_TYPES.keys()),
label="Select Reasoning Type",
value="Factual Reasoning"
)
gr.Markdown("### Quick Overview")
for rtype, info in REASONING_TYPES.items():
gr.Markdown(f"{info['emoji']} **{rtype}**: {info['description']}")
with gr.Column(scale=2):
reasoning_output = gr.Markdown(
value=get_reasoning_type_info("Factual Reasoning")
)
reasoning_dropdown.change(
fn=get_reasoning_type_info,
inputs=[reasoning_dropdown],
outputs=[reasoning_output]
)
# Tab 3: The Problem
with gr.TabItem("๐ซ The Problem", id=2):
gr.Markdown(PROBLEM_CONTENT)
# Tab 4: MGRD Solution
with gr.TabItem("๐ฌ MGRD Solution", id=3):
gr.Markdown(MGRD_CONTENT)
# Tab 5: Architecture
with gr.TabItem("๐๏ธ Architecture", id=4):
gr.Markdown(ARCHITECTURE_CONTENT)
# Tab 6: Benchmarks
with gr.TabItem("๐ Benchmarks", id=5):
gr.Markdown(BENCHMARK_DATA)
gr.Markdown(create_comparison_chart())
# Tab 7: Interactive Demo
with gr.TabItem("๐ฎ Interactive Demo", id=6):
gr.Markdown("""
## 🎮 Interactive Audio Reasoning Demo
See how Step-Audio-R1 would analyze different audio scenarios!
*Note: This is a simulation showing the reasoning process.
The actual model processes real audio input.*
""")
with gr.Row():
with gr.Column(scale=1):
scenario_dropdown = gr.Dropdown(
choices=[
"Customer Service Call",
"Meeting Recording",
"Podcast Episode",
"Music Analysis"
],
label="Select Audio Scenario",
value="Customer Service Call"
)
                    analyze_btn = gr.Button("🔍 Analyze Scenario", variant="primary")
gr.Markdown("""
### What This Shows
Each scenario demonstrates:
1. **Acoustic analysis** - What the model "hears"
2. **Reasoning process** - Step-by-step thinking
3. **Final output** - Actionable insights
This is the power of **audio reasoning**!
""")
with gr.Column(scale=2):
demo_output = gr.Markdown(
value=generate_demo_reasoning("Customer Service Call")
)
analyze_btn.click(
fn=generate_demo_reasoning,
inputs=[scenario_dropdown],
outputs=[demo_output]
)
# Tab 8: Applications
with gr.TabItem("๐ Applications", id=7):
gr.Markdown(APPLICATIONS_CONTENT)
# Tab 9: Resources
with gr.TabItem("๐ Resources", id=8):
gr.Markdown(RESOURCES_CONTENT)
# Footer
gr.Markdown("""
---
Created by **Mehmet Tuğrul Kaya** |
[GitHub](https://github.com/mtkaya) |
[HuggingFace](https://huggingface.co/tugrulkaya)

🎧 *Sound Speaks, AI Listens and Thinks* 🎧
""")
# Launch
if __name__ == "__main__":
demo.launch()