# Talking Image

import gradio as gr
from gradio_client import Client
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

# Load the speech-recognition processor and model once, at import time.
# Both come from the same checkpoint, so its name is spelled out only once.
_ASR_CHECKPOINT = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)

def transcribe_speech(audio_path):
    """Transcribe a speech recording to text with the module-level Wav2Vec2 model.

    Args:
        audio_path: Path to an audio file that ``librosa.load`` can read.

    Returns:
        The decoded transcription string for the recording.

    Raises:
        gr.Error: If ``audio_path`` is ``None`` or the file holds no samples.
    """
    if audio_path is None:
        raise gr.Error("No audio file provided.")

    sample_rate = 16000
    # Resample on load so the waveform matches the rate declared to the
    # processor below.
    speech, _ = librosa.load(audio_path, sr=sample_rate)
    if speech.size == 0:
        # An empty waveform cannot be fed through the model.
        raise gr.Error("The audio file contains no samples.")

    # Pass the sampling rate explicitly so the feature extractor can verify
    # it instead of silently assuming the input is correct.
    input_values = processor(
        speech,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding="longest",
    ).input_values

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: take the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]

def get_dreamtalk(image_in, speech):
    """Request a talking-head video from the remote DreamTalk Space.

    Args:
        image_in: File path sent to the Space's 'Image' input.
        speech: File path sent to the Space's 'Audio input' input.

    Returns:
        The ``'video'`` entry of the response returned by the ``/infer``
        endpoint.

    Raises:
        gr.Error: If either input is ``None``, or if creating the client,
            calling the endpoint, or reading the response fails.
    """
    if image_in is None or speech is None:
        raise gr.Error("Image or speech input is missing.")
    try:
        client = Client("https://fffiloni-dreamtalk.hf.space/")
        result = client.predict(
            speech,  # filepath in 'Audio input' Audio component
            image_in,  # filepath in 'Image' Image component
            "M030_front_neutral_level1_001.mat",  # Literal in 'emotional style' Dropdown component
            api_name="/infer",
        )
        return result['video']
    except Exception as e:
        print(f"Error in get_dreamtalk: {e}")
        # Chain explicitly so the original failure stays attached as the cause.
        raise gr.Error(f"Error in get_dreamtalk: {e}") from e

def pipe(text, voice, image_in):
    """Animate the portrait with the supplied voice recording.

    The recording's file path is forwarded to ``get_dreamtalk`` as the audio
    input. A speech-to-text transcript is plain text, not an audio file, so
    it must not be sent in place of the recording.

    Args:
        text: Value of the text box; only checked for ``None`` here.
        voice: File path of the uploaded or recorded audio.
        image_in: File path of the portrait image.

    Returns:
        Whatever ``get_dreamtalk`` returns for the generated video.

    Raises:
        gr.Error: If any input is ``None`` or video generation fails.
    """
    if text is None or voice is None or image_in is None:
        raise gr.Error("All inputs (text, voice, image) are required.")
    try:
        return get_dreamtalk(image_in, voice)
    except gr.Error:
        # Already a user-facing error; re-raise as-is instead of wrapping it
        # in a second "An error occurred..." message.
        raise
    except Exception as e:
        print(f"An error occurred while processing: {e}")
        raise gr.Error(f"An error occurred while processing: {e}") from e

# Build the Gradio interface: a header, then one row with three columns
# (portrait input, audio/text inputs with the submit button, video output).
with gr.Blocks() as demo:
    with gr.Column():
        # Page title and tagline, rendered as raw HTML.
        gr.HTML("""
        <h1 style="text-align: center;">
        Talking Image 
        </h1>
        <h3 style="text-align: center;">
        Clone your voice and make your photos speak. 
        </h3>
        """)
        with gr.Row():
            with gr.Column():
                # Portrait is handed to the callback as a file path; the
                # default value points at ./creatus.jpg next to the script.
                image_in = gr.Image(label="Portrait IN", type="filepath", value="./creatus.jpg")
            with gr.Column():
                # NOTE(review): the label says "Optional", but pipe raises
                # when voice is None, so the audio is effectively required.
                voice = gr.Audio(type="filepath", label="Upload or Record Speaker audio (Optional voice cloning)")
                # NOTE(review): text is passed to pipe, which only checks
                # that it is not None.
                text = gr.Textbox(label="text")
                submit_btn = gr.Button('Submit')
            with gr.Column():
                video_o = gr.Video(label="Video result")
    # Clicking Submit calls pipe(text, voice, image_in) and shows its return
    # value in the video component. concurrency_limit=3 is passed to this
    # event; presumably it caps simultaneous runs of pipe — confirm against
    # the installed Gradio version.
    submit_btn.click(
        fn=pipe,
        inputs=[text, voice, image_in],
        outputs=[video_o],
        concurrency_limit=3
    )
# Enable the request queue (max_size=10) and start the app with error
# display turned on and the API page turned off.
demo.queue(max_size=10).launch(show_error=True, show_api=False)