"""Gradio demo: answer a question with a short video of a YouTube frame + TTS audio.

Pipeline: embed the query, pick the closest canned answer, synthesize the
answer as speech, grab one frame from the associated YouTube source, and mux
the still frame with the speech into an MP4.
"""

import os
import subprocess
import tempfile

import cv2
import gradio as gr
import moviepy.editor as mp
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from TTS.api import TTS
from yt_dlp import YoutubeDL

# -------------------------------
# 1. Configuration
# -------------------------------
# NOTE: this is a channel URL, not a single video; stream_youtube_frame()
# resolves it to the first playable entry.
youtube_url = "https://www.youtube.com/@TheGodHouseCentre"

# TTS Model
tts = TTS(
    model_name="tts_models/en/vctk/vits",
    progress_bar=False,
    gpu=torch.cuda.is_available(),
)

# Text Embedding Model
text_model = SentenceTransformer("all-MiniLM-L6-v2")

# Demo dataset
dataset = [
    {
        "question": "Hello",
        "answer": "Hi there! How can I help you?",
        "video_url": youtube_url,
    }
]

# Precompute embeddings
questions = [item["question"] for item in dataset]
question_embeddings = text_model.encode(questions, convert_to_tensor=True)


# -------------------------------
# 2. Retrieve Closest Answer
# -------------------------------
def retrieve_answer(query):
    """Return the dataset row whose question is most similar to *query*.

    Similarity is the cosine similarity between the query embedding and the
    precomputed ``question_embeddings``.
    """
    query_emb = text_model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_emb, question_embeddings)[0]
    # Convert to a plain int so the list index does not depend on tensor
    # __index__ support.
    best_idx = int(torch.argmax(cos_scores))
    return dataset[best_idx]


# -------------------------------
# 3. TTS Audio Generation
# -------------------------------
def generate_audio(text, speaker=None):
    """Synthesize *text* to a WAV file and return the file's path.

    Args:
        text: The sentence(s) to speak.
        speaker: Speaker name for multi-speaker models. When omitted and the
            loaded model is multi-speaker, the model's first listed speaker
            is used, because the TTS API rejects multi-speaker synthesis
            without a speaker.

    Returns:
        Path to a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.
    """
    if speaker is None and getattr(tts, "is_multi_speaker", False):
        available = tts.speakers
        if available:
            speaker = available[0]

    # A unique file per call keeps concurrent requests from overwriting each
    # other's audio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        audio_path = handle.name

    tts.tts_to_file(text=text, speaker=speaker, file_path=audio_path)
    return audio_path


# -------------------------------
# 4. YouTube Streaming (single frame demo)
# -------------------------------
def _first_playable_entry(info):
    """Descend through nested playlist results to the first non-empty entry."""
    while info and "entries" in info:
        info = next((entry for entry in (info["entries"] or []) if entry), None)
    return info


def stream_youtube_frame(url, width=640, height=360):
    """Grab the first video frame of *url* as an RGB array.

    Args:
        url: A YouTube video, playlist, or channel URL. For playlists and
            channels the first playable entry is used.
        width: Output frame width in pixels.
        height: Output frame height in pixels.

    Returns:
        A ``uint8`` NumPy array of shape ``(height, width, 3)`` in RGB order.

    Raises:
        RuntimeError: If no direct stream URL can be resolved, or ffmpeg does
            not produce a complete frame.
    """
    ydl_opts = {
        "format": "best",
        "quiet": True,
        "noplaylist": True,
        # Only resolve the first item of any playlist/channel instead of
        # extracting every video it contains.
        "playlist_items": "1",
    }
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    entry = _first_playable_entry(info)
    stream_url = entry.get("url") if entry else None
    if not stream_url:
        raise RuntimeError(f"Could not resolve a playable stream for {url!r}")

    frame_size = width * height * 3
    command = [
        "ffmpeg",
        "-nostdin",
        "-loglevel", "error",
        "-i", stream_url,
        "-an",
        # Stop after one frame so ffmpeg exits instead of decoding the
        # whole stream.
        "-frames:v", "1",
        # Force the output size so the byte count matches the reshape below.
        "-vf", f"scale={width}:{height}",
        "-f", "rawvideo",
        "-pix_fmt", "rgb24",
        "-",
    ]
    # run() waits for ffmpeg and closes the pipes, so no process is leaked.
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
    )
    raw_frame = completed.stdout
    if completed.returncode != 0 or len(raw_frame) < frame_size:
        details = completed.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg failed to decode a {width}x{height} frame "
            f"(exit code {completed.returncode}): {details}"
        )

    frame = np.frombuffer(raw_frame[:frame_size], dtype=np.uint8)
    return frame.reshape((height, width, 3))


# -------------------------------
# 5. Video Generation with Audio
# -------------------------------
def generate_video_with_audio(source_frame, driving_audio):
    """Render a still-image MP4 of *source_frame* with *driving_audio* as sound.

    The video lasts exactly as long as the audio, so the speech is neither
    cut off nor padded.

    Args:
        source_frame: RGB image array, as returned by stream_youtube_frame().
        driving_audio: Path to the audio file to use as the soundtrack.

    Returns:
        Path to the rendered ``.mp4`` inside a newly created temporary
        directory. The caller owns the file.

    Raises:
        RuntimeError: If the frame cannot be written to disk.
    """
    # A per-call working directory keeps concurrent requests isolated.
    work_dir = tempfile.mkdtemp(prefix="video_answer_")
    frame_path = os.path.join(work_dir, "temp_frame.jpg")
    output_path = os.path.join(work_dir, "output_video.mp4")

    # OpenCV writes BGR, so convert from the RGB layout we were given.
    bgr_frame = cv2.cvtColor(source_frame, cv2.COLOR_RGB2BGR)
    if not cv2.imwrite(frame_path, bgr_frame):
        raise RuntimeError(f"Could not write frame image to {frame_path}")

    audio = mp.AudioFileClip(driving_audio)
    try:
        clip = mp.ImageClip(frame_path).set_duration(audio.duration)
        clip = clip.set_audio(audio)
        clip.write_videofile(output_path, fps=24, verbose=False, logger=None)
    finally:
        # Release the audio reader held by the clip.
        audio.close()
        try:
            os.remove(frame_path)
        except OSError:
            pass  # Best-effort cleanup of the intermediate image.

    return output_path


# -------------------------------
# 6. Main Pipeline
# -------------------------------
def answer_question(query):
    """Run the full pipeline for *query* and return the generated video path."""
    row = retrieve_answer(query)
    audio_file = generate_audio(row["answer"])
    try:
        frame = stream_youtube_frame(row["video_url"])
        return generate_video_with_audio(frame, audio_file)
    finally:
        # The audio is already muxed into the video (or the request failed),
        # so the intermediate WAV is no longer needed.
        try:
            os.remove(audio_file)
        except OSError:
            pass


# -------------------------------
# 7. Gradio Interface
# -------------------------------
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Ask a question"),
    outputs=gr.Video(label="Generated Video"),
    title="AI Video/Audio Generator",
    description="Ask a question and get a video+audio response generated from YouTube frames with TTS.",
)

if __name__ == "__main__":
    # NOTE: 0.0.0.0 exposes the app on every network interface.
    iface.launch(server_name="0.0.0.0", server_port=7860)