import os

import av
import nltk
import numpy as np
import soundfile as sf
import streamlit as st
from espnet2.bin.tts_inference import Text2Speech
from groq import Groq
from pydub import AudioSegment
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline

# Download NLTK data needed by ESPnet's English g2p front-end
nltk.download("averaged_perceptron_tagger")
nltk.download("cmudict")

# Load Groq API key from environment secrets
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Groq API key not found. Please add it as a secret.")
    st.stop()

# Initialize Groq client
groq_client = Groq(api_key=GROQ_API_KEY)


# Load models once; st.cache_resource keeps them alive across Streamlit reruns
@st.cache_resource
def load_models():
    # Speech-to-text: Whisper via the transformers ASR pipeline
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True,  # enable timestamps for long-form audio
    )

    # Text-to-speech via ESPnet2
    tts_model = Text2Speech.from_pretrained(
        "espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet"
    )
    return stt_pipe, tts_model


stt_pipe, tts_model = load_models()


def synthesize(text: str, path: str = "response.wav") -> str:
    """Synthesize speech for `text` and return the path of the written WAV."""
    # Note: a multi-speaker model additionally needs a speaker embedding
    # (spembs=) or speaker id (sids=) passed to this call; Text2Speech does
    # not ship built-in embeddings.
    wav = tts_model(text)["wav"]
    sf.write(path, wav.view(-1).cpu().numpy(), tts_model.fs)  # fs = model's sample rate
    return path


# Audio recorder: collects raw audio frames from the browser's WebRTC stream
class AudioRecorder(AudioProcessorBase):
    def __init__(self):
        self.audio_frames = []

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        self.audio_frames.append(frame)
        return frame


# Streamlit app
st.title("Voice and Text Chatbot")

# Sidebar for mode selection
mode = st.sidebar.radio("Select Mode", ["Text Chatbot", "Voice Chatbot"])

if mode == "Text Chatbot":
    st.header("Text Chatbot")
    user_input = st.text_input("Enter your message:")
    if user_input:
        try:
            # Generate a response with the Groq chat completions API
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": user_input}],
                model="mixtral-8x7b-32768",
                temperature=0.5,
                max_tokens=1024,
            )
            response = chat_completion.choices[0].message.content
            st.write("Generated Response:", response)

            # Convert the response to speech and play it back
            st.audio(synthesize(response))
        except Exception as e:
            st.error(f"Error generating response: {e}")

elif mode == "Voice Chatbot":
    st.header("Voice Chatbot")

    # Audio recorder
    st.write("Record your voice:")
    webrtc_ctx = webrtc_streamer(
        key="audio-recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioRecorder,
        media_stream_constraints={"audio": True, "video": False},
    )

    if webrtc_ctx.audio_processor:
        st.write("Recording... Press 'Stop' to finish recording.")
        # Save recorded audio to a WAV file
        if st.button("Stop and Process Recording"):
            audio_frames = webrtc_ctx.audio_processor.audio_frames
            if audio_frames:
                # Stitch the frames together with pydub, then downmix to
                # mono 16 kHz, the rate Whisper's feature extractor expects.
                # WebRTC delivers 16-bit PCM, hence the 32768.0 scale factor.
                sound = AudioSegment.empty()
                for frame in audio_frames:
                    sound += AudioSegment(
                        data=frame.to_ndarray().tobytes(),
                        sample_width=frame.format.bytes,
                        frame_rate=frame.sample_rate,
                        channels=len(frame.layout.channels),
                    )
                sound = sound.set_channels(1).set_frame_rate(16000)
                audio_data = np.array(sound.get_array_of_samples(), dtype=np.float32) / 32768.0

                # Save as WAV file
                sf.write("recorded_audio.wav", audio_data, samplerate=16000)
                st.success("Recording saved as recorded_audio.wav")

                # Transcribe with timestamps; passing the sampling rate
                # explicitly lets the pipeline verify/resample the input
                output = stt_pipe({"raw": audio_data, "sampling_rate": 16000})

                # Display the full transcribed text
                st.write("Transcribed Text:", output["text"])

                # Display the text with timestamps (optional)
                if "chunks" in output:
                    st.write("Transcribed Text with Timestamps:")
                    for chunk in output["chunks"]:
                        start, end = chunk["timestamp"]
                        st.write(f"{start:.2f} - {end:.2f}: {chunk['text']}")

                # Generate a response with the Groq chat completions API
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": output["text"]}],
                        model="mixtral-8x7b-32768",
                        temperature=0.5,
                        max_tokens=1024,
                    )
                    response = chat_completion.choices[0].message.content
                    st.write("Generated Response:", response)

                    # Convert the response to speech and play it back
                    st.audio(synthesize(response))
                except Exception as e:
                    st.error(f"Error generating response: {e}")
            else:
                st.error("No audio recorded. Please try again.")
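# --- Usage (a sketch; assumes this file is saved as app.py) ---
# The app reads GROQ_API_KEY from the environment (or Streamlit secrets):
#   export GROQ_API_KEY="..."
#   pip install streamlit groq transformers torch espnet espnet_model_zoo \
#       soundfile pydub streamlit-webrtc av nltk
#   streamlit run app.py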