# OmniMind-Orchestrator/ui/voice_interface.py
"""
ElevenLabs Voice Interface - For $2K + AirPods Pro Prize
Voice-first enterprise AI interaction.
"""
import os
from typing import Optional, AsyncGenerator
import asyncio
try:
    from elevenlabs import ElevenLabs, VoiceSettings
    from elevenlabs.client import AsyncElevenLabs

    ELEVENLABS_AVAILABLE = True
except ImportError:
    ELEVENLABS_AVAILABLE = False
    print("[WARNING] ElevenLabs not installed")


class VoiceInterface:
    """
    Voice-first interface for OmniMind using ElevenLabs.

    Prize Integration: ElevenLabs Category Award ($2K + AirPods Pro)
    - Natural conversational AI
    - Streaming voice responses
    - Enterprise-grade voice quality
    """

    def __init__(self):
        self.api_key = os.getenv("ELEVENLABS_API_KEY")

        # Voice configurations for different personas (defined even when the
        # client is unavailable so set_voice() keeps working)
        self.voices = {
            "professional": "ErXwobaYiN019PkySvjV",  # Antoni - professional male
            "friendly": "EXAVITQu4vr4xnSDxMaL",      # Sarah - friendly female
            "executive": "VR6AewLTigWG4xSOukaG",     # Arnold - authoritative male
        }
        self.current_voice = "professional"

        if not ELEVENLABS_AVAILABLE or not self.api_key:
            self.client = None
            print("[WARNING] ElevenLabs not configured")
            return

        self.client = AsyncElevenLabs(api_key=self.api_key)
    async def text_to_speech(
        self,
        text: str,
        voice: str = "professional",
        stream: bool = True
    ) -> AsyncGenerator[bytes, None]:
        """
        Convert text to speech with streaming support.

        Args:
            text: Text to convert
            voice: Voice persona (professional, friendly, executive)
            stream: Stream audio chunks for real-time playback

        Yields:
            Audio chunks (bytes)
        """
        if not self.client:
            # Not configured: yield nothing (empty async generator)
            return

        voice_id = self.voices.get(voice, self.voices["professional"])
        if stream:
            # Streaming for real-time responses. The async ElevenLabs SDK's
            # streaming endpoint returns an async iterator of audio chunks, so
            # it is iterated directly rather than awaited.
            audio_stream = self.client.text_to_speech.convert_as_stream(
                text=text,
                voice_id=voice_id,
                model_id="eleven_turbo_v2_5",  # Fastest model
                voice_settings=VoiceSettings(
                    stability=0.5,
                    similarity_boost=0.75,
                    style=0.5,
                    use_speaker_boost=True
                )
            )
            async for chunk in audio_stream:
                yield chunk
        else:
            # Non-streaming: collect the complete audio and yield it in one piece
            audio_iter = self.client.text_to_speech.convert(
                text=text,
                voice_id=voice_id,
                model_id="eleven_turbo_v2_5",
                voice_settings=VoiceSettings(
                    stability=0.5,
                    similarity_boost=0.75,
                    style=0.5,
                    use_speaker_boost=True
                )
            )
            audio = b"".join([chunk async for chunk in audio_iter])
            yield audio
    async def speech_to_text(self, audio_data: bytes) -> str:
        """
        Convert speech to text (via OpenAI Whisper; ElevenLabs is used for TTS only here).

        Args:
            audio_data: Audio bytes (WAV format)

        Returns:
            Transcribed text
        """
        # Transcription is delegated to OpenAI Whisper
        from openai import AsyncOpenAI

        openai_client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Save audio to a temporary file so it can be passed as a file object
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(audio_data)
            audio_path = f.name

        try:
            with open(audio_path, "rb") as audio_file:
                transcript = await openai_client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file
                )
            return transcript.text
        finally:
            # Clean up the temporary file
            os.unlink(audio_path)
    async def get_available_voices(self):
        """Get the list of available voices"""
        if not self.client:
            return {"status": "unavailable", "voices": []}

        voices = await self.client.voices.get_all()
        return {
            "status": "success",
            "voices": [
                {
                    "voice_id": voice.voice_id,
                    "name": voice.name,
                    "category": voice.category
                }
                for voice in voices.voices
            ]
        }
    def set_voice(self, voice_name: str) -> bool:
        """Set the current voice persona"""
        if voice_name in self.voices:
            self.current_voice = voice_name
            return True
        return False


# Global voice interface
voice = VoiceInterface()
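

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal example of how the async
# generator from text_to_speech() might be consumed locally. It assumes
# ELEVENLABS_API_KEY is set in the environment; the output filename
# "demo_output.mp3" is a hypothetical choice for this demo, not something
# the interface requires.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        if not voice.client:
            print("[DEMO] ElevenLabs not configured; skipping")
            return

        # List the voices available to this account
        catalog = await voice.get_available_voices()
        print(f"[DEMO] {len(catalog['voices'])} voices available")

        # Stream a short utterance and collect the audio chunks (MP3 by default)
        chunks = []
        async for chunk in voice.text_to_speech(
            "Hello from OmniMind.", voice="professional", stream=True
        ):
            chunks.append(chunk)

        with open("demo_output.mp3", "wb") as out:
            out.write(b"".join(chunks))
        print(f"[DEMO] Wrote {len(chunks)} audio chunks to demo_output.mp3")

    asyncio.run(_demo())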