# voice.py (Updated with Azure OpenAI integration)
"""Tamil voice assistant for medical reports.

Pipeline:
    1. Capture Tamil speech from the microphone (Google speech recognition).
    2. Translate Tamil -> English (Gemini, `translate` library as fallback).
    3. Answer the question against a medical summary (Azure OpenAI).
    4. Translate the answer English -> Tamil.
    5. Synthesize Tamil audio (Google Cloud Text-to-Speech) and play it.
"""

import os
import speech_recognition as sr
import google.generativeai as genai
import tempfile
import logging
from io import BytesIO
import re
import pygame
from translate import Translator
import base64
import streamlit as st
from google.cloud import texttospeech
import json
from openai import AzureOpenAI

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# SECURITY: credentials come from the environment only.  The previous
# revision hard-coded a live Gemini API key and a developer-local
# credentials path as fallbacks; secrets must never be committed, so
# those fallbacks were removed (the leaked key should also be revoked).
gemini_api_key = os.getenv('GEMINI_API_KEY', '')
google_tts_credentials = os.getenv('GOOGLE_TTS_CREDENTIALS', '')

# Azure OpenAI configuration
AZURE_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
MODEL_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")

# Initialize Azure OpenAI client; left as None on failure so callers can
# degrade gracefully instead of crashing at import time.
try:
    azure_client = AzureOpenAI(
        api_key=AZURE_API_KEY,
        azure_endpoint=AZURE_ENDPOINT,
        api_version=API_VERSION
    )
    logger.info("Azure OpenAI client initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize Azure OpenAI client: {str(e)}")
    azure_client = None

# Initialize Google TTS client; left as None when credentials are missing.
try:
    if google_tts_credentials and os.path.exists(google_tts_credentials):
        # Set credentials from JSON file
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_tts_credentials
        tts_client = texttospeech.TextToSpeechClient()
        logger.info("Google Text-to-Speech client initialized successfully")
    else:
        logger.warning(f"Google TTS credentials file not found: {google_tts_credentials}")
        tts_client = None
except Exception as e:
    logger.error(f"Failed to initialize Google TTS: {str(e)}")
    tts_client = None

# Configure Gemini for translations only
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel('gemini-1.5-pro')


def listen_tamil():
    """Listen to Tamil speech with improved end detection and error handling.

    Returns:
        The recognized Tamil text, or None when nothing usable was heard
        (timeout, unintelligible speech, or a recognition-service error).
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        logger.info("Listening for Tamil speech...")
        # Adjust for ambient noise (longer duration for a better baseline).
        recognizer.adjust_for_ambient_noise(source, duration=1.5)
        # Larger pause threshold so mid-sentence pauses don't cut recording.
        recognizer.pause_threshold = 1.0
        recognizer.energy_threshold = 300  # Adjust sensitivity
        try:
            st.info("🎤 Listening... Please speak in Tamil")
            # Extended timeout so slow starters are not cut off.
            audio = recognizer.listen(source, timeout=15, phrase_time_limit=30)
            logger.info("Speech detected, processing...")
            st.success("✅ Speech recorded! Processing...")
        except sr.WaitTimeoutError:
            logger.error("No speech detected")
            st.error("❌ No speech detected. Please try again.")
            return None
    try:
        # Using Google's speech recognition with Tamil language
        tamil_text = recognizer.recognize_google(audio, language='ta-IN')
        logger.info(f"Recognized Tamil text: {tamil_text}")
        return tamil_text
    except sr.UnknownValueError:
        logger.error("Could not understand audio")
        st.error("❌ Could not understand the speech. Please try again more clearly.")
        return None
    except sr.RequestError as e:
        logger.error(f"Speech recognition service error: {e}")
        st.error("❌ Speech recognition service error. Please try again later.")
        return None


def _mask_numbers(text):
    """Replace each numeric literal in *text* with a NUM{i}PLACEHOLDER token.

    BUG FIX: substitution is done longest-number-first and one occurrence at
    a time; the previous in-order `str.replace` corrupted longer numbers that
    contained an earlier, shorter match (e.g. "1" inside "12").

    Returns:
        (masked_text, numbers) — the token list order matches the indices
        embedded in the placeholders so `_restore_numbers` can undo it.
    """
    numbers = re.findall(r'\d+\.?\d*', text)
    for i, num in sorted(enumerate(numbers), key=lambda p: len(p[1]), reverse=True):
        text = text.replace(num, f'NUM{i}PLACEHOLDER', 1)
    return text, numbers


def _restore_numbers(text, numbers):
    """Swap NUM{i}PLACEHOLDER tokens back to their original number strings."""
    for i, num in enumerate(numbers):
        text = text.replace(f'NUM{i}PLACEHOLDER', num)
    return text


def _translate_preserving_numbers(text, from_lang, to_lang, from_name, to_name):
    """Translate *text* while keeping numeric values byte-identical.

    Tries Gemini first; if its output is empty/too short, falls back to the
    offline `translate` library.  Shared by both public translate functions
    (they were near-duplicates).

    Args:
        text: source-language text (assumed non-empty; callers guard).
        from_lang/to_lang: ISO codes for the fallback Translator.
        from_name/to_name: human-readable language names for the prompt.

    Returns:
        The translation, or *text* unchanged if every backend fails.
    """
    masked, numbers = _mask_numbers(text)
    try:
        # Use Gemini for more accurate translation
        prompt = f"""Translate this {from_name} text to {to_name} accurately, preserving the exact meaning: {masked} Return only the translation, nothing else."""
        response = model.generate_content(prompt)
        translation = response.text
        # Fallback to basic translator if Gemini output looks unusable.
        if not translation or len(translation) < 5:
            translator = Translator(to_lang=to_lang, from_lang=from_lang)
            translation = translator.translate(masked)
        translation = _restore_numbers(translation, numbers)
        # Clean up any whitespace artifacts
        translation = re.sub(r'\s+', ' ', translation).strip()
        logger.info(f"Translation result: {translation}")
        return translation
    except Exception as e:
        logger.error(f"Translation error: {e}")
        try:
            # BUG FIX: translate the ORIGINAL text here, not the masked copy —
            # the old code returned untranslated NUM*PLACEHOLDER tokens on
            # this path because nothing ever restored them.
            translator = Translator(to_lang=to_lang, from_lang=from_lang)
            return translator.translate(text)
        except Exception:
            return text  # Return original if translation fails


def translate_tamil_to_english(tamil_text):
    """Translate Tamil text to English while preserving numbers."""
    if not tamil_text:
        return ""
    return _translate_preserving_numbers(tamil_text, "ta", "en", "Tamil", "English")


def translate_english_to_tamil(english_text):
    """Translate English text to Tamil while preserving numbers."""
    if not english_text:
        return ""
    return _translate_preserving_numbers(english_text, "en", "ta", "English", "Tamil")


def process_with_azure_openai(english_text, medical_summary):
    """Answer a user's question about *medical_summary* with Azure OpenAI.

    Uses an empathetic, plain-language prompt and a short response budget.

    Returns:
        The model's answer, or a user-facing apology string when input is
        missing, the client is unavailable, or the API call fails.
    """
    if not english_text or not medical_summary:
        return "No data available to process."
    if not azure_client:
        logger.error("Azure OpenAI client not initialized")
        return "Sorry, the AI service is currently unavailable."
    try:
        prompt = f"""You are a compassionate medical assistant. Analyze the medical report and respond to the user's question.

User's question: {english_text}

Requirements:
1. Respond only if the question relates to the medical report
2. Keep the response under 100 words
3. Use simple, non-medical language when possible
4. Focus on answering the specific question
5. Be empathetic and reassuring (avoid causing panic)
6. Include positive, actionable health improvement suggestions
7. Use phrases like "Don't worry", "You can improve this by", "This is manageable"

Medical Report: {medical_summary}
"""
        response = azure_client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,  # low temperature for factual, consistent answers
            max_tokens=400
        )
        processed_text = response.choices[0].message.content
        logger.info("Successfully processed query with Azure OpenAI")
        return processed_text
    except Exception as e:
        logger.error(f"Error processing with Azure OpenAI: {str(e)}")
        return "I apologize, but I couldn't process your question about the medical report."


def text_to_speech(text, output_file="output.mp3"):
    """Convert Tamil text to MP3 speech using Google Cloud TTS.

    Writes the audio to *output_file* AND returns it as a BytesIO for
    streaming.  Returns None when *text* is empty, the TTS client is
    unavailable, or synthesis fails.
    """
    if not text:
        logger.warning("No text provided for speech synthesis")
        return None
    try:
        if tts_client:
            # Configure the synthesis input
            synthesis_input = texttospeech.SynthesisInput(text=text)
            # Tamil (India), female voice.
            voice = texttospeech.VoiceSelectionParams(
                language_code="ta-IN",
                ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
            )
            audio_config = texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3,
                speaking_rate=0.9,   # Slightly slower for better comprehension
                pitch=0.0,           # Normal pitch
                volume_gain_db=1.0   # Slightly louder
            )
            # Perform the text-to-speech request
            response = tts_client.synthesize_speech(
                input=synthesis_input,
                voice=voice,
                audio_config=audio_config
            )
            # Save the response to a file
            with open(output_file, "wb") as out:
                out.write(response.audio_content)
            logger.info(f"Audio content written to file {output_file}")
            # Return audio bytes for streaming
            audio_bytes = BytesIO(response.audio_content)
            return audio_bytes
        else:
            logger.warning("Google TTS client not available")
            return None
    except Exception as e:
        logger.error(f"Error in text-to-speech: {e}")
        return None


def play_audio(audio_file):
    """Play an audio file locally with pygame, blocking until it finishes."""
    try:
        pygame.mixer.init()
        pygame.mixer.music.load(audio_file)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():
            pygame.time.Clock().tick(10)
    except Exception as e:
        logger.error(f"Error playing audio: {e}")


def get_base64_audio(audio_file):
    """Convert an audio file to a base64 string for HTML embedding."""
    with open(audio_file, "rb") as f:
        data = f.read()
    return base64.b64encode(data).decode()


def play_audio_response(audio_file):
    """Auto-play an audio file in the Streamlit page via a hidden <audio> tag.

    BUG FIX: the previous revision rendered an empty HTML string, so nothing
    ever played (and `get_base64_audio` was dead code).  The audio is now
    embedded as a base64 data URI inside an autoplaying <audio> element.
    """
    if audio_file and os.path.exists(audio_file):
        try:
            audio_b64 = get_base64_audio(audio_file)
            audio_html = f"""
                <audio autoplay>
                    <source src="data:audio/mp3;base64,{audio_b64}" type="audio/mp3">
                </audio>
            """
            st.components.v1.html(audio_html, height=0)
            logger.info("Audio playback triggered")
        except Exception as e:
            logger.error(f"Error in auto-play: {e}")


def get_medical_report_answer(medical_summary, tamil_text=None):
    """Run the full voice Q&A pipeline over a medical report.

    Args:
        medical_summary: English-language report text the answer is grounded in.
        tamil_text: pre-recognized Tamil query; when None the microphone is used.

    Returns:
        dict with keys original_query, translated_query, english_response,
        tamil_response, audio_file (path to MP3, or None if synthesis failed).
    """
    # Step 1: capture speech if a query was not supplied.
    if not tamil_text:
        tamil_text = listen_tamil()
        if not tamil_text:
            return {
                "original_query": None,
                "translated_query": None,
                "english_response": "No speech detected. Please try again.",
                "tamil_response": "பேச்சு இல்லை. மீண்டும் முயற்சிக்கவும்.",
                "audio_file": None
            }
    # Step 2: Translate Tamil to English
    english_query = translate_tamil_to_english(tamil_text)
    # Step 3: Process with Azure OpenAI instead of Gemini
    english_response = process_with_azure_openai(english_query, medical_summary)
    # Step 4: Translate response back to Tamil
    tamil_response = translate_english_to_tamil(english_response)
    # Ensure at least one empathetic phrase is present in the Tamil reply.
    empathetic_phrases = [
        "கவலைப்பட வேண்டாம்",      # Don't worry
        "இது கையாளக்கூடியது",      # This is manageable
        "இதை மேம்படுத்த முடியும்"  # You can improve this
    ]
    has_empathetic_phrase = any(phrase in tamil_response for phrase in empathetic_phrases)
    if not has_empathetic_phrase:
        tamil_response = f"{empathetic_phrases[0]}. {tamil_response}"
    # Step 5: Convert to speech
    audio_file = "response_audio.mp3"
    audio_data = text_to_speech(tamil_response, audio_file)
    if audio_data:
        logger.info("Audio response generated successfully")
    else:
        logger.warning("Failed to generate audio response")
    return {
        "original_query": tamil_text,
        "translated_query": english_query,
        "english_response": english_response,
        "tamil_response": tamil_response,
        "audio_file": audio_file if audio_data else None
    }