# OmniMind-Orchestrator/ui/voice_interface.py
"""
ElevenLabs Voice Interface - For $2K + AirPods Pro Prize
Voice-first enterprise AI interaction.
"""
import os
from typing import Optional, AsyncGenerator
import asyncio
try:
    from elevenlabs import ElevenLabs, VoiceSettings
    from elevenlabs.client import AsyncElevenLabs

    ELEVENLABS_AVAILABLE = True
except ImportError:
    ELEVENLABS_AVAILABLE = False
    print("[WARNING] ElevenLabs not installed")


class VoiceInterface:
    """
    Voice-first interface for OmniMind using ElevenLabs.

    Prize Integration: ElevenLabs Category Award ($2K + AirPods Pro)
    - Natural conversational AI
    - Streaming voice responses
    - Enterprise-grade voice quality
    """

    def __init__(self):
        self.api_key = os.getenv("ELEVENLABS_API_KEY")

        # Voice configurations for different personas (defined even when the
        # client is unavailable so set_voice() keeps working)
        self.voices = {
            "professional": "ErXwobaYiN019PkySvjV",  # Antoni - professional male
            "friendly": "EXAVITQu4vr4xnSDxMaL",      # Sarah - friendly female
            "executive": "VR6AewLTigWG4xSOukaG",     # Arnold - authoritative male
        }
        self.current_voice = "professional"

        if not ELEVENLABS_AVAILABLE or not self.api_key:
            self.client = None
            print("[WARNING] ElevenLabs not configured")
            return

        self.client = AsyncElevenLabs(api_key=self.api_key)
    async def text_to_speech(
        self,
        text: str,
        voice: str = "professional",
        stream: bool = True
    ) -> AsyncGenerator[bytes, None]:
        """
        Convert text to speech with streaming support.

        Args:
            text: Text to convert
            voice: Voice persona (professional, friendly, executive)
            stream: Stream audio chunks for real-time playback

        Yields:
            Audio chunks (bytes)
        """
        if not self.client:
            # Not configured: yield nothing (empty async generator)
            return

        voice_id = self.voices.get(voice, self.voices["professional"])
        if stream:
            # Streaming for real-time responses. The async ElevenLabs SDK's
            # streaming endpoint returns an async iterator of audio chunks, so
            # it is iterated directly rather than awaited.
            audio_stream = self.client.text_to_speech.convert_as_stream(
                text=text,
                voice_id=voice_id,
                model_id="eleven_turbo_v2_5",  # Fastest model
                voice_settings=VoiceSettings(
                    stability=0.5,
                    similarity_boost=0.75,
                    style=0.5,
                    use_speaker_boost=True
                )
            )
            async for chunk in audio_stream:
                yield chunk
        else:
            # Non-streaming: collect the complete audio and yield it in one piece
            audio_iter = self.client.text_to_speech.convert(
                text=text,
                voice_id=voice_id,
                model_id="eleven_turbo_v2_5",
                voice_settings=VoiceSettings(
                    stability=0.5,
                    similarity_boost=0.75,
                    style=0.5,
                    use_speaker_boost=True
                )
            )
            audio = b"".join([chunk async for chunk in audio_iter])
            yield audio
    async def speech_to_text(self, audio_data: bytes) -> str:
        """
        Convert speech to text (via OpenAI Whisper; ElevenLabs is used for TTS only here).

        Args:
            audio_data: Audio bytes (WAV format)

        Returns:
            Transcribed text
        """
        # Transcription is delegated to OpenAI Whisper
        from openai import AsyncOpenAI

        openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Save audio to a temporary file so it can be passed as a file object
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(audio_data)
            audio_path = f.name

        try:
            with open(audio_path, "rb") as audio_file:
                transcript = await openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file
                )
            return transcript.text
        finally:
            # Clean up the temporary file
            os.unlink(audio_path)
    async def get_available_voices(self):
        """Get the list of available voices"""
        if not self.client:
            return {"status": "unavailable", "voices": []}

        voices = await self.client.voices.get_all()
        return {
            "status": "success",
            "voices": [
                {
                    "voice_id": voice.voice_id,
                    "name": voice.name,
                    "category": voice.category
                }
                for voice in voices.voices
            ]
        }
    def set_voice(self, voice_name: str) -> bool:
        """Set the current voice persona"""
        if voice_name in self.voices:
            self.current_voice = voice_name
            return True
        return False


# Global voice interface
voice = VoiceInterface()
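

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal example of how the async
# generator from text_to_speech() might be consumed locally. It assumes
# ELEVENLABS_API_KEY is set in the environment; the output filename
# "demo_output.mp3" is a hypothetical choice for this demo, not something
# the interface requires.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        if not voice.client:
            print("[DEMO] ElevenLabs not configured; skipping")
            return

        # List the voices available to this account
        catalog = await voice.get_available_voices()
        print(f"[DEMO] {len(catalog['voices'])} voices available")

        # Stream a short utterance and collect the audio chunks (MP3 by default)
        chunks = []
        async for chunk in voice.text_to_speech(
            "Hello from OmniMind.", voice="professional", stream=True
        ):
            chunks.append(chunk)

        with open("demo_output.mp3", "wb") as out:
            out.write(b"".join(chunks))
        print(f"[DEMO] Wrote {len(chunks)} audio chunks to demo_output.mp3")

    asyncio.run(_demo())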