Spaces:

tamilprabaharan
/

AI_Doctor

Build error

App Files Files Community

AI_Doctor / voice.py

tamilprabaharan

Initial commit of AI Doctor App

c7077c5 5 months ago

raw

history blame contribute delete

14.4 kB

	# voice.py (Updated with Azure OpenAI integration)
	import os
	import speech_recognition as sr
	import google.generativeai as genai
	import tempfile
	import logging
	from io import BytesIO
	import re
	import pygame
	from translate import Translator
	import base64
	import streamlit as st
	from google.cloud import texttospeech
	import json
	from openai import AzureOpenAI

	# Set up logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Get API keys from environment variables.
	# The Gemini key deliberately has NO in-source fallback: a key committed to the
	# repository is readable by anyone who can see the file and must be treated as
	# compromised (rotate any key that was previously embedded here).
	gemini_api_key = os.getenv('GEMINI_API_KEY')
	if not gemini_api_key:
	    logger.warning("GEMINI_API_KEY is not set")

	# Path to the Google service-account JSON used for Text-to-Speech.
	# NOTE(review): the default is a machine-specific Windows path; set
	# GOOGLE_TTS_CREDENTIALS on any other host.
	google_tts_credentials = os.getenv('GOOGLE_TTS_CREDENTIALS', "D:/AI and Data Science/Projects/AI DoctorV2/tamiltextspeech-458116-147b3efcaf84.json")

	# Azure OpenAI configuration (all values come from the environment; only the
	# API version has a default).
	AZURE_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
	AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
	MODEL_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
	API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview")

	# Build the Azure OpenAI client once at import time. Any failure is logged and
	# leaves azure_client as None so the rest of the module can still be imported.
	azure_client = None
	try:
	    azure_client = AzureOpenAI(
	        azure_endpoint=AZURE_ENDPOINT,
	        api_key=AZURE_API_KEY,
	        api_version=API_VERSION,
	    )
	except Exception as init_error:
	    logger.error(f"Failed to initialize Azure OpenAI client: {init_error!s}")
	else:
	    logger.info("Azure OpenAI client initialized successfully")

	# Build the Google Text-to-Speech client once at import time. tts_client stays
	# None when the credentials file is missing or client construction fails.
	tts_client = None
	try:
	    if not os.path.exists(google_tts_credentials):
	        logger.warning(f"Google TTS credentials file not found: {google_tts_credentials}")
	    else:
	        # Point the Google client library at the service-account JSON file.
	        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_tts_credentials
	        tts_client = texttospeech.TextToSpeechClient()
	        logger.info("Google Text-to-Speech client initialized successfully")
	except Exception as init_error:
	    logger.error(f"Failed to initialize Google TTS: {init_error!s}")
	    tts_client = None

	# Configure Gemini for translations only.
	# `model` is the module-level handle the translation functions in this file
	# call generate_content() on; it is created once at import time.
	genai.configure(api_key=gemini_api_key)
	model = genai.GenerativeModel('gemini-1.5-pro')

	def listen_tamil():
	    """Record one utterance from the microphone and transcribe it as Tamil.

	    Waits up to 15 seconds for speech to start and records at most 30 seconds.
	    Progress and failures are reported both to the log and to the Streamlit UI.

	    Returns:
	        The recognized Tamil text, or None when no speech was detected, the
	        audio could not be understood, or the recognition service failed.
	    """
	    rec = sr.Recognizer()

	    with sr.Microphone() as mic:
	        logger.info("Listening for Tamil speech...")
	        # Sample background noise before listening.
	        rec.adjust_for_ambient_noise(mic, duration=1.5)

	        # Allow a full second of silence before a phrase is considered finished.
	        rec.pause_threshold = 1.0
	        # NOTE(review): this assignment happens after the ambient-noise
	        # calibration above and replaces whatever threshold it produced —
	        # confirm that a fixed value of 300 is intended.
	        rec.energy_threshold = 300

	        try:
	            st.info("🎤 Listening... Please speak in Tamil")
	            captured = rec.listen(mic, timeout=15, phrase_time_limit=30)
	        except sr.WaitTimeoutError:
	            logger.error("No speech detected")
	            st.error("❌ No speech detected. Please try again.")
	            return None
	        logger.info("Speech detected, processing...")
	        st.success("✅ Speech recorded! Processing...")

	    # Transcribe with Google's recognizer, using the Tamil (India) locale.
	    try:
	        recognized = rec.recognize_google(captured, language='ta-IN')
	    except sr.UnknownValueError:
	        logger.error("Could not understand audio")
	        st.error("❌ Could not understand the speech. Please try again more clearly.")
	        return None
	    except sr.RequestError as e:
	        logger.error(f"Speech recognition service error: {e}")
	        st.error("❌ Speech recognition service error. Please try again later.")
	        return None

	    logger.info(f"Recognized Tamil text: {recognized}")
	    return recognized

	# Numbers are hidden behind placeholders before translation so the translator
	# cannot alter them, then put back afterwards.
	_NUMBER_PATTERN = re.compile(r'\d+\.?\d*')
	_PLACEHOLDER_PATTERN = re.compile(r'NUM(\d+)PLACEHOLDER')


	def _mask_numbers(text):
	    """Return (masked_text, numbers) with each number replaced by NUM<i>PLACEHOLDER.

	    The substitution is done in a single regex pass. Replacing numbers one
	    after another with str.replace is unsafe because the placeholders contain
	    digits themselves (e.g. a later "0" would rewrite "NUM0PLACEHOLDER").
	    """
	    numbers = []

	    def _placeholder(match):
	        numbers.append(match.group(0))
	        return f'NUM{len(numbers) - 1}PLACEHOLDER'

	    return _NUMBER_PATTERN.sub(_placeholder, text), numbers


	def _restore_numbers(text, numbers):
	    """Swap every NUM<i>PLACEHOLDER in text back to the i-th captured number."""

	    def _original(match):
	        index = int(match.group(1))
	        # Leave unknown placeholders untouched rather than raising.
	        return numbers[index] if index < len(numbers) else match.group(0)

	    return _PLACEHOLDER_PATTERN.sub(_original, text)


	def _translate_preserving_numbers(text, source_name, target_name, from_lang, to_lang, log_label):
	    """Translate text with Gemini, falling back to the basic translator.

	    Args:
	        text: Non-empty text to translate.
	        source_name / target_name: Language names used in the Gemini prompt.
	        from_lang / to_lang: Language codes passed to the fallback Translator.
	        log_label: Prefix for the success log line.

	    Returns:
	        The translated text with the original numbers restored, or the
	        unmodified input text when every translation attempt fails.
	    """
	    masked, numbers = _mask_numbers(text)

	    translation = None
	    try:
	        # Use Gemini for more accurate translation
	        prompt = f"""Translate this {source_name} text to {target_name} accurately, preserving the exact meaning:

	{masked}

	Return only the translation, nothing else."""
	        translation = model.generate_content(prompt).text
	    except Exception as e:
	        logger.error(f"Translation error: {e}")

	    # Fall back to the basic translator if Gemini failed or returned something
	    # implausibly short.
	    if not translation or len(translation) < 5:
	        try:
	            fallback = Translator(to_lang=to_lang, from_lang=from_lang).translate(masked)
	            if fallback:
	                translation = fallback
	        except Exception as e:
	            logger.error(f"Fallback translation error: {e}")

	    if not translation:
	        # Nothing worked: hand back the caller's text exactly as it was given,
	        # not the placeholder-masked copy.
	        return text

	    translation = _restore_numbers(translation, numbers)
	    # Clean up any artifacts
	    translation = re.sub(r'\s+', ' ', translation).strip()
	    logger.info(f"{log_label}: {translation}")
	    return translation


	def translate_tamil_to_english(tamil_text):
	    """Translate Tamil text to English while preserving numbers.

	    Returns "" for empty/None input, and the original text unchanged if no
	    translation backend succeeds.
	    """
	    if not tamil_text:
	        return ""
	    return _translate_preserving_numbers(
	        tamil_text, "Tamil", "English", "ta", "en", "Translation result"
	    )


	def translate_english_to_tamil(english_text):
	    """Translate English text to Tamil while preserving numbers.

	    Returns "" for empty/None input, and the original text unchanged if no
	    translation backend succeeds.
	    """
	    if not english_text:
	        return ""
	    return _translate_preserving_numbers(
	        english_text, "English", "Tamil", "en", "ta", "Translation to Tamil"
	    )

	def process_with_azure_openai(english_text, medical_summary):
	    """Answer a question about a medical report using Azure OpenAI.

	    Args:
	        english_text: The user's question, in English.
	        medical_summary: Text of the medical report the question refers to.

	    Returns:
	        The model's answer as a string. A fixed English message is returned
	        instead when either input is empty, the Azure client is not
	        initialized, the request fails, or the model returns no text.
	    """
	    if not english_text or not medical_summary:
	        return "No data available to process."

	    if not azure_client:
	        logger.error("Azure OpenAI client not initialized")
	        return "Sorry, the AI service is currently unavailable."

	    fallback_message = "I apologize, but I couldn't process your question about the medical report."

	    try:
	        prompt = f"""You are a compassionate medical assistant. Analyze the medical report and respond to the user's question.

	User's question: {english_text}

	Requirements:
	1. Respond only if the question relates to the medical report
	2. Keep the response under 100 words
	3. Use simple, non-medical language when possible
	4. Focus on answering the specific question
	5. Be empathetic and reassuring (avoid causing panic)
	6. Include positive, actionable health improvement suggestions
	7. Use phrases like "Don't worry", "You can improve this by", "This is manageable"

	Medical Report:
	{medical_summary}
	"""

	        response = azure_client.chat.completions.create(
	            model=MODEL_NAME,
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0.3,
	            max_tokens=400
	        )

	        processed_text = response.choices[0].message.content
	        # The message content can be absent (None); never hand that back to
	        # callers as if it were an answer.
	        if not processed_text:
	            logger.warning("Azure OpenAI returned an empty response")
	            return fallback_message

	        logger.info("Successfully processed query with Azure OpenAI")
	        return processed_text

	    except Exception as e:
	        logger.error(f"Error processing with Azure OpenAI: {str(e)}")
	        return fallback_message

	def text_to_speech(text, output_file="output.mp3"):
	    """Synthesize Tamil speech for text with Google TTS.

	    The MP3 audio is written to output_file and also returned in memory.

	    Args:
	        text: Text to speak.
	        output_file: Path the MP3 is written to.

	    Returns:
	        A BytesIO holding the MP3 bytes, or None when text is empty, the TTS
	        client is unavailable, or synthesis/writing fails.
	    """
	    if not text:
	        logger.warning("No text provided for speech synthesis")
	        return None

	    try:
	        if not tts_client:
	            logger.warning("Google TTS client not available")
	            return None

	        # Tamil (India), female voice.
	        tamil_voice = texttospeech.VoiceSelectionParams(
	            language_code="ta-IN",
	            ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
	        )

	        # MP3 output: slightly slower than normal, normal pitch, slightly louder.
	        mp3_settings = texttospeech.AudioConfig(
	            audio_encoding=texttospeech.AudioEncoding.MP3,
	            speaking_rate=0.9,
	            pitch=0.0,
	            volume_gain_db=1.0,
	        )

	        result = tts_client.synthesize_speech(
	            input=texttospeech.SynthesisInput(text=text),
	            voice=tamil_voice,
	            audio_config=mp3_settings,
	        )
	        mp3_bytes = result.audio_content

	        # Persist the audio for later playback from disk.
	        with open(output_file, "wb") as out:
	            out.write(mp3_bytes)
	        logger.info(f"Audio content written to file {output_file}")

	        # Hand the same bytes back for streaming.
	        return BytesIO(mp3_bytes)

	    except Exception as tts_error:
	        logger.error(f"Error in text-to-speech: {tts_error}")
	        return None

	def play_audio(audio_file):
	    """Play an audio file with pygame and block until playback has finished.

	    Best-effort: any error is logged and swallowed, nothing is returned.

	    Args:
	        audio_file: Path (or file object) accepted by pygame.mixer.music.load.
	    """
	    try:
	        pygame.mixer.init()
	        pygame.mixer.music.load(audio_file)
	        pygame.mixer.music.play()
	        # Reuse a single Clock for the whole wait instead of allocating a new
	        # one on every poll; tick(10) caps the polling loop at 10 iterations/s.
	        clock = pygame.time.Clock()
	        while pygame.mixer.music.get_busy():
	            clock.tick(10)
	    except Exception as e:
	        logger.error(f"Error playing audio: {e}")
	    finally:
	        # Shut the mixer down once playback is over so the audio device and the
	        # loaded music are not held for the lifetime of the process.
	        # NOTE(review): this assumes no other code relies on the mixer staying
	        # initialized after play_audio returns — confirm.
	        try:
	            if pygame.mixer.get_init():
	                pygame.mixer.quit()
	        except Exception as e:
	            logger.error(f"Error releasing audio mixer: {e}")

	def get_base64_audio(audio_file):
	    """Read audio_file as bytes and return its contents base64-encoded as a str."""
	    with open(audio_file, "rb") as handle:
	        encoded = base64.b64encode(handle.read())
	    return encoded.decode()

	def play_audio_response(audio_file):
	    """Embed audio_file in the Streamlit page as an auto-playing <audio> element.

	    Does nothing when audio_file is empty or does not exist. The file is
	    inlined as a base64 data URI; errors are logged and swallowed.
	    """
	    if not audio_file or not os.path.exists(audio_file):
	        return

	    try:
	        encoded_audio = get_base64_audio(audio_file)
	        # HTML snippet: autoplay attribute plus an explicit play() call.
	        audio_html = f"""
	<audio id="response_audio" autoplay="true">
	<source src="data:audio/mp3;base64,{encoded_audio}" type="audio/mp3">
	</audio>
	<script>
	// Ensure audio plays automatically
	var audio = document.getElementById("response_audio");
	audio.play().catch(function(error) {{
	console.error("Audio playback failed:", error);
	}});
	</script>
	"""
	        st.components.v1.html(audio_html, height=0)
	        logger.info("Audio playback triggered")
	    except Exception as playback_error:
	        logger.error(f"Error in auto-play: {playback_error}")

	def get_medical_report_answer(medical_summary, tamil_text=None):
	    """Answer a Tamil question about a medical report, as text and audio.

	    Pipeline: obtain the Tamil query (from tamil_text, or from the microphone
	    when it is not given), translate it to English, ask Azure OpenAI, translate
	    the answer back to Tamil, and synthesize it to "response_audio.mp3".

	    Args:
	        medical_summary: Text of the medical report.
	        tamil_text: Optional Tamil query; when falsy, speech is recorded instead.

	    Returns:
	        A dict with keys "original_query", "translated_query",
	        "english_response", "tamil_response" and "audio_file" (the MP3 path,
	        or None when audio generation failed or no query was obtained).
	    """
	    # Step 1: use the supplied query, otherwise record one.
	    spoken_query = tamil_text if tamil_text else listen_tamil()

	    if not spoken_query:
	        return {
	            "original_query": None,
	            "translated_query": None,
	            "english_response": "No speech detected. Please try again.",
	            "tamil_response": "பேச்சு இல்லை. மீண்டும் முயற்சிக்கவும்.",
	            "audio_file": None
	        }

	    # Steps 2-4: Tamil -> English, ask Azure OpenAI, English -> Tamil.
	    query_en = translate_tamil_to_english(spoken_query)
	    answer_en = process_with_azure_openai(query_en, medical_summary)
	    answer_ta = translate_english_to_tamil(answer_en)

	    # Reassuring phrases the Tamil answer is expected to contain.
	    reassurances = (
	        "கவலைப்பட வேண்டாம்",  # Don't worry
	        "இது கையாளக்கூடியது",  # This is manageable
	        "இதை மேம்படுத்த முடியும்",  # You can improve this
	    )

	    # Prepend the first phrase when the answer contains none of them.
	    if all(phrase not in answer_ta for phrase in reassurances):
	        answer_ta = f"{reassurances[0]}. {answer_ta}"

	    # Step 5: convert to speech; report the file only if synthesis worked.
	    audio_path = "response_audio.mp3"
	    if text_to_speech(answer_ta, audio_path):
	        logger.info("Audio response generated successfully")
	    else:
	        logger.warning("Failed to generate audio response")
	        audio_path = None

	    return {
	        "original_query": spoken_query,
	        "translated_query": query_en,
	        "english_response": answer_en,
	        "tamil_response": answer_ta,
	        "audio_file": audio_path
	    }