Spaces:
Build error
Build error
File size: 14,442 Bytes
c7077c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 |
# voice.py (Updated with Azure OpenAI integration)
import os
import speech_recognition as sr
import google.generativeai as genai
import tempfile
import logging
from io import BytesIO
import re
import pygame
from translate import Translator
import base64
import streamlit as st
from google.cloud import texttospeech
import json
from openai import AzureOpenAI
# Set up logging
# Root logging is configured once at import time; the module-level `logger`
# is used by every function in this file for diagnostics.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# --- API configuration ---------------------------------------------------
# All keys and paths are read from environment variables ONLY.
# SECURITY: a previous revision hard-coded a live Gemini API key and a
# developer-machine credentials path as fallbacks. That key must be
# considered leaked and rotated; never commit secrets to source.
gemini_api_key = os.getenv('GEMINI_API_KEY', '')
google_tts_credentials = os.getenv('GOOGLE_TTS_CREDENTIALS', '')
# Azure OpenAI configuration (all optional; client init below degrades
# gracefully to None when these are missing).
AZURE_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
MODEL_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")
# Initialize Azure OpenAI client.
# On any failure the client is left as None so downstream code
# (process_with_azure_openai) can detect an unavailable service.
azure_client = None
try:
    azure_client = AzureOpenAI(
        api_key=AZURE_API_KEY,
        azure_endpoint=AZURE_ENDPOINT,
        api_version=API_VERSION,
    )
    logger.info("Azure OpenAI client initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize Azure OpenAI client: {str(e)}")
    azure_client = None
# Initialize Google TTS client.
# The Google SDK picks up credentials via GOOGLE_APPLICATION_CREDENTIALS;
# we only set it when the service-account JSON actually exists on disk.
tts_client = None
try:
    if not os.path.exists(google_tts_credentials):
        logger.warning(f"Google TTS credentials file not found: {google_tts_credentials}")
    else:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_tts_credentials
        tts_client = texttospeech.TextToSpeechClient()
        logger.info("Google Text-to-Speech client initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize Google TTS: {str(e)}")
    tts_client = None
# Configure Gemini for translations only
# (question answering goes through Azure OpenAI; the translate_* helpers
# below use this `model` for Tamil<->English translation)
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel('gemini-1.5-pro')
def listen_tamil():
    """Capture microphone audio and transcribe it as Tamil (ta-IN).

    Returns:
        The recognized Tamil text, or None when no speech was detected,
        the audio was unintelligible, or the recognition service failed.
        Progress and errors are also surfaced in the Streamlit UI.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        logger.info("Listening for Tamil speech...")
        # Calibrate against background noise before recording.
        recognizer.adjust_for_ambient_noise(source, duration=1.5)
        # Tune end-of-phrase detection and microphone sensitivity.
        recognizer.pause_threshold = 1.0
        recognizer.energy_threshold = 300
        try:
            st.info("🎤 Listening... Please speak in Tamil")
            audio = recognizer.listen(source, timeout=15, phrase_time_limit=30)
            logger.info("Speech detected, processing...")
            st.success("✅ Speech recorded! Processing...")
        except sr.WaitTimeoutError:
            logger.error("No speech detected")
            st.error("❌ No speech detected. Please try again.")
            return None
        try:
            # Google Web Speech API with the Tamil (India) locale.
            tamil_text = recognizer.recognize_google(audio, language='ta-IN')
        except sr.UnknownValueError:
            logger.error("Could not understand audio")
            st.error("❌ Could not understand the speech. Please try again more clearly.")
            return None
        except sr.RequestError as e:
            logger.error(f"Speech recognition service error: {e}")
            st.error("❌ Speech recognition service error. Please try again later.")
            return None
        logger.info(f"Recognized Tamil text: {tamil_text}")
        return tamil_text
def translate_tamil_to_english(tamil_text):
    """Translate Tamil text to English while preserving numbers.

    Numbers are masked with per-occurrence placeholders before translation
    (so the model cannot alter them) and restored afterwards. Falls back to
    the `translate` package if Gemini fails, and to the original text as a
    last resort.

    Returns the English translation, or "" for empty input.
    """
    if not tamil_text:
        return ""
    # Mask every number occurrence in a single left-to-right regex pass.
    # (The previous repeated str.replace could corrupt overlapping numbers,
    # e.g. masking "1" inside "12".)
    numbers = re.findall(r'\d+\.?\d*', tamil_text)
    counter = iter(range(len(numbers)))
    tamil_text = re.sub(r'\d+\.?\d*',
                        lambda m: f'NUM{next(counter)}PLACEHOLDER',
                        tamil_text)
    try:
        # Use Gemini for more accurate translation
        prompt = f"""Translate this Tamil text to English accurately, preserving the exact meaning:
{tamil_text}
Return only the translation, nothing else."""
        response = model.generate_content(prompt)
        translation = response.text
        # Fallback to basic translator if Gemini fails
        if not translation or len(translation) < 5:
            translator = Translator(to_lang="en", from_lang="ta")
            translation = translator.translate(tamil_text)
        # Restore numbers
        for i, num in enumerate(numbers):
            translation = translation.replace(f'NUM{i}PLACEHOLDER', num)
        # Collapse whitespace artifacts left by the model.
        translation = re.sub(r'\s+', ' ', translation).strip()
        logger.info(f"Translation result: {translation}")
        return translation
    except Exception as e:
        logger.error(f"Translation error: {e}")
        # Fallback translator; restore placeholders in every exit path
        # (previously the last-resort return leaked masked text).
        try:
            translator = Translator(to_lang="en", from_lang="ta")
            translation = translator.translate(tamil_text)
        except Exception:
            translation = tamil_text  # return original text if all else fails
        for i, num in enumerate(numbers):
            translation = translation.replace(f'NUM{i}PLACEHOLDER', num)
        return translation
def translate_english_to_tamil(english_text):
    """Translate English text to Tamil while preserving numbers.

    Mirror of translate_tamil_to_english: numbers are masked with
    per-occurrence placeholders before translation and restored afterwards.
    Falls back to the `translate` package if Gemini fails, and to the
    original text as a last resort.

    Returns the Tamil translation, or "" for empty input.
    """
    if not english_text:
        return ""
    # Mask every number occurrence in a single left-to-right regex pass.
    # (The previous repeated str.replace could corrupt overlapping numbers,
    # e.g. masking "1" inside "12".)
    numbers = re.findall(r'\d+\.?\d*', english_text)
    counter = iter(range(len(numbers)))
    english_text = re.sub(r'\d+\.?\d*',
                          lambda m: f'NUM{next(counter)}PLACEHOLDER',
                          english_text)
    try:
        # Use Gemini for more accurate translation
        prompt = f"""Translate this English text to Tamil accurately, preserving the exact meaning:
{english_text}
Return only the translation, nothing else."""
        response = model.generate_content(prompt)
        translation = response.text
        # Fallback to basic translator if Gemini fails
        if not translation or len(translation) < 5:
            translator = Translator(to_lang="ta", from_lang="en")
            translation = translator.translate(english_text)
        # Restore numbers
        for i, num in enumerate(numbers):
            translation = translation.replace(f'NUM{i}PLACEHOLDER', num)
        # Collapse whitespace artifacts left by the model.
        translation = re.sub(r'\s+', ' ', translation).strip()
        logger.info(f"Translation to Tamil: {translation}")
        return translation
    except Exception as e:
        logger.error(f"Translation error: {e}")
        # Fallback translator; restore placeholders in every exit path
        # (previously the last-resort return leaked masked text).
        try:
            translator = Translator(to_lang="ta", from_lang="en")
            translation = translator.translate(english_text)
        except Exception:
            translation = english_text  # return original text if all else fails
        for i, num in enumerate(numbers):
            translation = translation.replace(f'NUM{i}PLACEHOLDER', num)
        return translation
def process_with_azure_openai(english_text, medical_summary):
    """Answer a user question about a medical report via Azure OpenAI.

    Builds an empathetic-assistant prompt around the question and the
    report summary, then runs a single chat completion.

    Returns the model's answer, or a human-readable apology string when
    input is missing, the client is unavailable, or the call fails.
    """
    # Guard clauses: nothing to do without both inputs and a live client.
    if not english_text or not medical_summary:
        return "No data available to process."
    if not azure_client:
        logger.error("Azure OpenAI client not initialized")
        return "Sorry, the AI service is currently unavailable."
    prompt = f"""You are a compassionate medical assistant. Analyze the medical report and respond to the user's question.
User's question: {english_text}
Requirements:
1. Respond only if the question relates to the medical report
2. Keep the response under 100 words
3. Use simple, non-medical language when possible
4. Focus on answering the specific question
5. Be empathetic and reassuring (avoid causing panic)
6. Include positive, actionable health improvement suggestions
7. Use phrases like "Don't worry", "You can improve this by", "This is manageable"
Medical Report:
{medical_summary}
"""
    try:
        # Low temperature keeps the medical answer conservative.
        completion = azure_client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=400,
        )
        logger.info("Successfully processed query with Azure OpenAI")
        return completion.choices[0].message.content
    except Exception as e:
        logger.error(f"Error processing with Azure OpenAI: {str(e)}")
        return "I apologize, but I couldn't process your question about the medical report."
def text_to_speech(text, output_file="output.mp3"):
    """Synthesize Tamil speech for `text` with Google Cloud TTS.

    Writes the MP3 bytes to `output_file` and also returns them wrapped in
    a BytesIO for in-memory streaming. Returns None when there is no text,
    no TTS client, or synthesis fails.
    """
    if not text:
        logger.warning("No text provided for speech synthesis")
        return None
    try:
        if not tts_client:
            logger.warning("Google TTS client not available")
            return None
        synthesis_input = texttospeech.SynthesisInput(text=text)
        # Tamil (India), female voice.
        voice = texttospeech.VoiceSelectionParams(
            language_code="ta-IN",
            ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
        )
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            speaking_rate=0.9,    # slightly slower for better comprehension
            pitch=0.0,            # normal pitch
            volume_gain_db=1.0,   # slightly louder
        )
        result = tts_client.synthesize_speech(
            input=synthesis_input,
            voice=voice,
            audio_config=audio_config,
        )
        # Persist to disk for later playback/embedding...
        with open(output_file, "wb") as out:
            out.write(result.audio_content)
        logger.info(f"Audio content written to file {output_file}")
        # ...and hand back an in-memory stream as well.
        return BytesIO(result.audio_content)
    except Exception as e:
        logger.error(f"Error in text-to-speech: {e}")
        return None
def play_audio(audio_file):
    """Play an audio file through pygame's mixer, blocking until done.

    Any playback error is logged and swallowed (best-effort playback).
    """
    try:
        pygame.mixer.init()
        pygame.mixer.music.load(audio_file)
        pygame.mixer.music.play()
        # Create the clock once; the original built a fresh Clock every
        # iteration, which defeats tick()'s frame-interval accounting.
        clock = pygame.time.Clock()
        # Poll at ~10 Hz until playback finishes.
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    except Exception as e:
        logger.error(f"Error playing audio: {e}")
def get_base64_audio(audio_file):
    """Return the contents of `audio_file` as a base64-encoded ASCII string."""
    with open(audio_file, "rb") as handle:
        raw = handle.read()
    return base64.b64encode(raw).decode()
def play_audio_response(audio_file):
    """Auto-play an MP3 in the browser via an embedded base64 <audio> tag.

    Silently does nothing when the file path is falsy or missing; playback
    errors are logged (and also reported to the browser console).
    """
    if not audio_file or not os.path.exists(audio_file):
        return
    try:
        # Inline the audio as a data: URI so Streamlit needs no media route.
        audio_html = f"""
<audio id="response_audio" autoplay="true">
<source src="data:audio/mp3;base64,{get_base64_audio(audio_file)}" type="audio/mp3">
</audio>
<script>
// Ensure audio plays automatically
var audio = document.getElementById("response_audio");
audio.play().catch(function(error) {{
console.error("Audio playback failed:", error);
}});
</script>
"""
        st.components.v1.html(audio_html, height=0)
        logger.info("Audio playback triggered")
    except Exception as e:
        logger.error(f"Error in auto-play: {e}")
def get_medical_report_answer(medical_summary, tamil_text=None):
    """Process a voice query about the medical report.

    Pipeline: (optionally) capture Tamil speech -> translate to English ->
    answer with Azure OpenAI against `medical_summary` -> translate the
    answer back to Tamil -> synthesize Tamil audio.

    Returns a dict with keys: original_query, translated_query,
    english_response, tamil_response, audio_file (None on failure).
    """
    # If tamil_text is not provided, listen for it
    if not tamil_text:
        tamil_text = listen_tamil()
    if not tamil_text:
        # No speech captured: return a canned bilingual failure payload.
        return {
            "original_query": None,
            "translated_query": None,
            "english_response": "No speech detected. Please try again.",
            "tamil_response": "பேச்சு இல்லை. மீண்டும் முயற்சிக்கவும்.",
            "audio_file": None
        }
    # Step 2: Translate Tamil to English
    english_query = translate_tamil_to_english(tamil_text)
    # Step 3: Process with Azure OpenAI instead of Gemini
    english_response = process_with_azure_openai(english_query, medical_summary)
    # Step 4: Translate response back to Tamil
    tamil_response = translate_english_to_tamil(english_response)
    # Add empathetic phrases in Tamil if they're not already present
    empathetic_phrases = [
        "கவலைப்பட வேண்டாம்",  # Don't worry
        "இது கையாளக்கூடியது",  # This is manageable
        "இதை மேம்படுத்த முடியும்"  # You can improve this
    ]
    # Check if at least one empathetic phrase is present
    has_empathetic_phrase = any(phrase in tamil_response for phrase in empathetic_phrases)
    # Add an empathetic phrase at the beginning if none found
    if not has_empathetic_phrase:
        tamil_response = f"{empathetic_phrases[0]}. {tamil_response}"
    # Step 5: Convert to speech
    audio_file = "response_audio.mp3"
    audio_data = text_to_speech(tamil_response, audio_file)
    # Log success or failure of audio generation
    if audio_data:
        logger.info("Audio response generated successfully")
    else:
        logger.warning("Failed to generate audio response")
    return {
        "original_query": tamil_text,
        "translated_query": english_query,
        "english_response": english_response,
        "tamil_response": tamil_response,
        # Only advertise the file when synthesis actually produced audio.
        "audio_file": audio_file if audio_data else None
    }