Spaces:

RSHVR
/

Command_RTC

Sleeping

App Files Community

RSHVR committed on Mar 30

Commit

839f7b2

verified ·

1 Parent(s): 12d303c

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -173

app.py CHANGED Viewed

@@ -1,191 +1,97 @@
 import os
-import tempfile
 import gradio as gr
-import torch
-import torchaudio
-import spaces
-from huggingface_hub import snapshot_download
-from tortoise.api import TextToSpeech
-from tortoise.utils.audio import load_audio
-import numpy as np
-import uuid
-from pydub import AudioSegment
-# Create output directory if it doesn't exist
-os.makedirs("outputs", exist_ok=True)
-# Check for CUDA availability (this will show CPU due to Zero-GPU)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Initial device check: {device}")
-# Create a tensor to verify Zero-GPU is working
-zero = torch.Tensor([0])
-if torch.cuda.is_available():
-    zero = zero.cuda()
-    print(f"Zero tensor device: {zero.device}")
-# Initialize Tortoise TTS (will be loaded on demand with Zero-GPU)
-tts = None
-# Available preset voice options
-PRESET_VOICES = ["random", "angie", "daniel", "deniro", "emma", "freeman",
-                "geralt", "halle", "jlaw", "lj", "mol", "myself", "pat",
-                "snakes", "tim_reynolds", "tom", "train_atkins", "train_daws",
-                "train_dotrice", "train_dreams", "train_empire", "train_grace",
-                "train_kennard", "train_lescault", "train_mouse", "weaver", "william"]
-def process_audio_file(audio_file_path):
-    """Process uploaded audio file to ensure it meets Tortoise requirements"""
-    # Load audio file
-    audio = AudioSegment.from_file(audio_file_path)
-    # Convert to WAV format if it's not already
-    if not audio_file_path.lower().endswith('.wav'):
-        temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
-        audio.export(temp_wav.name, format="wav")
-        audio_file_path = temp_wav.name
-    # Resample to 22.05kHz which is what Tortoise expects
-    y, sr = torchaudio.load(audio_file_path)
-    if sr != 22050:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)
-        y = resampler(y)
-        temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
-        torchaudio.save(temp_file.name, y, 22050)
-        audio_file_path = temp_file.name
-    return audio_file_path
-@spaces.GPU
-def generate_tts_with_voice(text, voice_sample_path=None, preset_voice=None):
-    """Generate TTS audio using Tortoise with either a custom voice or preset"""
-    global tts
-    try:
-        # Now that we're inside the @spaces.GPU decorated function, CUDA should be available
-        print(f"GPU function device: {zero.device}")
-        # Initialize TTS model if not already initialized
-        if tts is None:
-            tts = TextToSpeech(use_deepspeed=True if torch.cuda.is_available() else False)
-            print("TTS model initialized")
-        voice_samples = None
-        if voice_sample_path:
-            # Process the voice sample
-            voice_sample_path = process_audio_file(voice_sample_path)
-            voice_samples, _ = load_audio(voice_sample_path, 22050)
-            voice_samples = [voice_samples]
-            preset_voice = None
-        elif preset_voice and preset_voice != "random":
-            voice_samples = None
-        else:  # random voice
-            voice_samples = None
-            preset_voice = "random"
-        # Generate the speech
-        output_id = str(uuid.uuid4())[:8]
-        output_path = f"outputs/tts_output_{output_id}.wav"
-        gen = tts.tts_with_preset(
-            text,
-            voice_samples=voice_samples,
-            preset=preset_voice
-        )
-        # Save the generated audio
-        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)
-        return output_path, "Success: TTS generation completed."
-    except Exception as e:
-        return None, f"Error: {str(e)}"
-@spaces.GPU
-def tts_interface(text, audio_file, preset_voice, record_audio):
-    """Interface function for Gradio with GPU acceleration"""
-    print(f"Processing with device: {zero.device}")
-    voice_sample_path = None
-    # Determine which voice input to use
-    if record_audio is not None:
-        # Use recorded audio
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
-        temp_file.close()
-        record_audio = (record_audio[0], 22050)  # Ensure sample rate is 22050
-        torchaudio.save(temp_file.name, torch.tensor(record_audio[0]).unsqueeze(0), record_audio[1])
-        voice_sample_path = temp_file.name
-    elif audio_file is not None:
-        # Use uploaded audio file
-        voice_sample_path = audio_file
-    # If no custom voice is provided, use the preset
-    if voice_sample_path is None and preset_voice == "":
-        preset_voice = "random"
-    # Generate TTS
-    output_path, message = generate_tts_with_voice(text, voice_sample_path, preset_voice)
-    if output_path:
-        return output_path, message
-    else:
-        return None, message
-# Create Gradio interface
-with gr.Blocks(title="Tortoise TTS with Voice Cloning") as demo:
-    gr.Markdown("# Tortoise Text-to-Speech with Voice Cloning")
-    gr.Markdown("Enter text and either upload a voice sample, record your voice, or select a preset voice.")
-    with gr.Row():
-        with gr.Column():
-            text_input = gr.Textbox(
-                label="Text to speak",
-                placeholder="Enter the text you want to convert to speech...",
-                lines=5
-            )
-            preset_voice = gr.Dropdown(
-                choices=[""] + PRESET_VOICES,
-                label="Preset Voice (optional)",
-                value=""
-            )
-        with gr.Column():
-            gr.Markdown("### Voice Input Options")
-            with gr.Tab("Upload Voice"):
-                audio_file = gr.Audio(
-                    label="Upload Voice Sample (optional)",
-                    type="filepath"
-                )
-            with gr.Tab("Record Voice"):
-                record_audio = gr.Audio(
-                    label="Record Your Voice (optional)",
-                    source="microphone"
-                )
-    generate_button = gr.Button("Generate Speech")
-    with gr.Row():
-        output_audio = gr.Audio(label="Generated Speech")
-        output_message = gr.Textbox(label="Status")
-    generate_button.click(
-        fn=tts_interface,
-        inputs=[text_input, audio_file, preset_voice, record_audio],
-        outputs=[output_audio, output_message]
     )
-    gr.Markdown("### About This App")
-    gr.Markdown("""
-    This app uses Tortoise-TTS to generate high-quality speech from text.
-    You can:
-    - Enter any text you want to be spoken
-    - Upload or record a voice sample for voice cloning
-    - Or select from pre-defined voice presets
-    The app runs on Hugging Face Spaces with Zero-GPU optimization.
-    """)
 if __name__ == "__main__":
-    demo.launch()

 import os
 import gradio as gr
+from fastrtc import Stream, ReplyOnPause, AdditionalOutputs
+# Import your modules
+import stt
+import tts
+import cohereAPI
+# Environment variables
+COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+system_message = "You respond concisely, in about 15 words or less"
+# Initialize conversation history
+conversation_history = []
+async def response(audio_file_path):
+    global conversation_history
+    # Convert speech to text
+    user_message = await stt.transcribe_audio(audio_file_path)
+    # Add user message to chat history
+    yield AdditionalOutputs({"transcript": user_message, "role": "user"})
+    # Send text to Cohere API
+    response_text, updated_history = await cohereAPI.send_message(
+        system_message,
+        user_message,
+        conversation_history,
+        COHERE_API_KEY
+    )
+    # Update conversation history
+    conversation_history = updated_history
+    # Generate speech from text
+    _, (sample_rate, speech_array) = await tts.generate_speech(
+        response_text,
+        voice_preset="random"
+    )
+    # Add assistant message to chat history
+    yield AdditionalOutputs({"transcript": response_text, "role": "assistant"})
+    # Return audio response
+    yield (sample_rate, speech_array)
+# Create FastRTC stream with ReplyOnPause
+stream = Stream(
+    handler=ReplyOnPause(response),
+    modality="audio",
+    mode="send-receive",
+    additional_outputs=[
+        {"name": "transcript", "type": "text"},
+        {"name": "role", "type": "text"}
+    ]
+)
+# Create Gradio interface that uses the FastRTC stream
+with gr.Blocks(title="Voice Chat Assistant with ReplyOnPause") as demo:
+    gr.Markdown("# Voice Chat Assistant")
+    gr.Markdown("Speak and pause to trigger a response.")
+    chatbot = gr.Chatbot(label="Conversation")
+    # Mount the FastRTC UI
+    stream_ui = stream.ui(label="Speak")
+    # Handle additional outputs from FastRTC to update the chatbot
+    def update_chat(transcript, role, history):
+        if transcript and role:
+            if role == "user":
+                history.append((transcript, None))
+            elif role == "assistant":
+                if history and history[-1][1] is None:
+                    history[-1] = (history[-1][0], transcript)
+                else:
+                    history.append((None, transcript))
+        return history
+    stream_ui.change(
+        update_chat,
+        inputs=[stream_ui.output_components[0], stream_ui.output_components[1], chatbot],
+        outputs=[chatbot]
     )
+    clear_btn = gr.Button("Clear Conversation")
+    clear_btn.click(lambda: [], outputs=[chatbot])
+# Launch the app
 if __name__ == "__main__":
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        share=False,
+        show_error=True
+    )