#!/usr/bin/env python3
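"""Gradio app: audio → kinetic-subtitle music video.

Pipeline: Whisper transcription → lyric segmentation → LLM scene prompts →
per-segment image/video generation → stitching with synchronized animated subtitles.
"""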
import os
import shutil
import uuid
import json
import gradio as gr
import torch
from PIL import Image
import time
# Import pipeline modules
from utils.transcribe import transcribe_audio, list_available_whisper_models
from utils.segment import segment_lyrics
from utils.prompt_gen import generate_scene_prompts, list_available_llm_models
from utils.video_gen import (
create_video_segments,
list_available_image_models,
list_available_video_models,
preview_image_generation
)
from utils.glue import stitch_and_caption
# Create output directories if not existing
os.makedirs("templates", exist_ok=True)
os.makedirs("templates/minimalist", exist_ok=True)
os.makedirs("tmp", exist_ok=True)
# Load available model options
WHISPER_MODELS = list_available_whisper_models()
DEFAULT_WHISPER_MODEL = "medium.en"
LLM_MODELS = list_available_llm_models()
DEFAULT_LLM_MODEL = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
IMAGE_MODELS = list_available_image_models()
DEFAULT_IMAGE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
VIDEO_MODELS = list_available_video_models()
DEFAULT_VIDEO_MODEL = "stabilityai/stable-video-diffusion-img2vid-xt"
# Default prompt template
DEFAULT_PROMPT_TEMPLATE = """You are a cinematographer generating a scene for a music video.
Describe one vivid visual scene ({max_words} words max) that matches the mood and imagery of these lyrics.
Focus on setting, atmosphere, lighting, and framing. Do not mention the artist or singing.
Use only {max_sentences} sentence(s).
Lyrics: "{lyrics}"
Scene description:"""
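# Note: {lyrics} is kept as a per-segment placeholder; {max_words} and
# {max_sentences} are filled in once from the UI sliders (see process_audio, step 3).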
# Prepare style template options by scanning templates/ directory
TEMPLATE_DIR = "templates"
template_choices = []
for name in os.listdir(TEMPLATE_DIR):
if os.path.isdir(os.path.join(TEMPLATE_DIR, name)):
template_choices.append(name)
template_choices = sorted(template_choices)
DEFAULT_TEMPLATE = "minimalist" if "minimalist" in template_choices else (template_choices[0] if template_choices else None)
# Advanced settings defaults
DEFAULT_RESOLUTION = "1024x576" # default resolution
DEFAULT_FPS_MODE = "Auto" # auto-match lyric timing
DEFAULT_SEED = 0 # 0 means random seed
DEFAULT_MAX_WORDS = 30 # default word limit for scene descriptions
DEFAULT_MAX_SENTENCES = 1 # default sentence limit
DEFAULT_CROSSFADE = 0.25 # default crossfade duration
DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"
# Mode for image generation
IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
DEFAULT_IMAGE_MODE = "Independent"
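# "Independent": each scene is generated on its own. "Consistent (Img2Img)":
# scenes influence one another for visual continuity, controlled by the strength slider.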
def process_audio(
audio_path,
whisper_model,
llm_model,
image_model,
video_model,
template_name,
resolution,
fps_mode,
seed,
prompt_template,
max_words,
max_sentences,
style_suffix,
image_mode,
strength,
crossfade_duration
):
"""
End-to-end processing function to generate the music video with kinetic subtitles.
Returns final video file path for preview and download.
"""
# Default progress function just prints to console
def progress(percent, desc=""):
print(f"Progress: {percent}% - {desc}")
# Input validation
if not audio_path or not os.path.exists(audio_path):
raise ValueError("Please provide a valid audio file")
if not template_name or template_name not in template_choices:
template_name = DEFAULT_TEMPLATE or "minimalist"
# Prepare a unique temp directory for this run (to avoid conflicts between parallel jobs)
session_id = str(uuid.uuid4())[:8]
work_dir = os.path.join("tmp", f"run_{session_id}")
os.makedirs(work_dir, exist_ok=True)
# Save parameter settings for debugging
params = {
"whisper_model": whisper_model,
"llm_model": llm_model,
"image_model": image_model,
"video_model": video_model,
"template": template_name,
"resolution": resolution,
"fps_mode": fps_mode,
"seed": seed,
"max_words": max_words,
"max_sentences": max_sentences,
"style_suffix": style_suffix,
"image_mode": image_mode,
"strength": strength,
"crossfade_duration": crossfade_duration
}
with open(os.path.join(work_dir, "params.json"), "w") as f:
json.dump(params, f, indent=2)
try:
# 1. Transcription
progress(0, desc="Transcribing audio with Whisper...")
try:
result = transcribe_audio(audio_path, whisper_model)
if not result or 'segments' not in result:
raise ValueError("Transcription failed - no speech detected")
except Exception as e:
raise RuntimeError(f"Audio transcription failed: {str(e)}")
progress(15, desc="Transcription completed. Segmenting lyrics...")
# 2. Segmentation
try:
segments = segment_lyrics(result)
if not segments:
raise ValueError("No valid segments found in transcription")
except Exception as e:
raise RuntimeError(f"Audio segmentation failed: {str(e)}")
progress(25, desc=f"Detected {len(segments)} lyric segments. Generating scene prompts...")
# 3. Scene-prompt generation
try:
# Format the prompt template with the limits
formatted_prompt_template = prompt_template.format(
max_words=max_words,
max_sentences=max_sentences,
lyrics="{lyrics}" # This placeholder will be filled for each segment
)
prompts = generate_scene_prompts(
segments,
llm_model=llm_model,
prompt_template=formatted_prompt_template,
style_suffix=style_suffix
)
if len(prompts) != len(segments):
raise ValueError(f"Prompt generation mismatch: {len(prompts)} prompts for {len(segments)} segments")
except Exception as e:
raise RuntimeError(f"Scene prompt generation failed: {str(e)}")
# Save generated prompts for display or debugging
with open(os.path.join(work_dir, "prompts.txt"), "w", encoding="utf-8") as f:
for i, p in enumerate(prompts):
f.write(f"Segment {i+1}: {p}\n")
progress(35, desc="Scene prompts ready. Generating video segments...")
# Parse resolution with validation
try:
if resolution and "x" in resolution.lower():
width, height = map(int, resolution.lower().split("x"))
if width <= 0 or height <= 0:
raise ValueError("Invalid resolution values")
else:
width, height = 1024, 576 # default high resolution
        except (ValueError, TypeError):
print(f"Warning: Invalid resolution '{resolution}', using default 1024x576")
width, height = 1024, 576
# Determine FPS handling
fps_value = None
dynamic_fps = True
if fps_mode and fps_mode.lower() != "auto":
try:
fps_value = float(fps_mode)
if fps_value <= 0:
raise ValueError("FPS must be positive")
dynamic_fps = False
except (ValueError, TypeError):
print(f"Warning: Invalid FPS '{fps_mode}', using auto mode")
fps_value = None
dynamic_fps = True
        # 4. Image→video generation for each segment
try:
segment_videos = create_video_segments(
segments,
prompts,
image_model=image_model,
video_model=video_model,
width=width,
height=height,
dynamic_fps=dynamic_fps,
base_fps=fps_value,
seed=seed,
work_dir=work_dir,
image_mode=image_mode,
strength=strength,
progress_callback=None
)
if not segment_videos:
raise ValueError("No video segments were generated")
except Exception as e:
raise RuntimeError(f"Video generation failed: {str(e)}")
progress(80, desc="Video segments generated. Stitching and adding subtitles...")
# 5. Concatenation & audio syncing, plus kinetic subtitles overlay
try:
final_video_path = stitch_and_caption(
segment_videos,
audio_path,
segments,
template_name,
work_dir=work_dir,
crossfade_duration=crossfade_duration
)
if not final_video_path or not os.path.exists(final_video_path):
raise ValueError("Final video file was not created")
except Exception as e:
raise RuntimeError(f"Video stitching and captioning failed: {str(e)}")
        progress(100, desc="✅ Generation complete!")
return final_video_path, work_dir
except Exception as e:
# Enhanced error reporting
error_msg = str(e)
if "CUDA" in error_msg or "GPU" in error_msg:
error_msg += "\n\nπ‘ Tip: This application requires a CUDA-compatible GPU with sufficient VRAM."
elif "model" in error_msg.lower():
error_msg += "\n\nπ‘ Tip: Model loading failed. Check your internet connection and try again."
elif "audio" in error_msg.lower():
error_msg += "\n\nπ‘ Tip: Please ensure your audio file is in a supported format (MP3, WAV, M4A)."
print(f"Error during processing: {error_msg}")
raise RuntimeError(error_msg)
# Define Gradio UI components
with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 Audio → Kinetic-Subtitle Music Video
    Transform your audio tracks into dynamic music videos with AI-generated scenes and animated subtitles.

    **✨ Features:**
    - 🎤 **Whisper Transcription** - Accurate speech-to-text with word-level timing
    - 🧠 **AI Scene Generation** - LLM-powered visual descriptions from lyrics
    - 🎨 **Image & Video AI** - Stable Diffusion + Video Diffusion models
    - 🎬 **Kinetic Subtitles** - Animated text synchronized with audio
    - ⚡ **Fully Local** - No API keys required, runs on your GPU

    **🚀 Quick Start:**
    1. Upload an audio file (MP3, WAV, M4A)
    2. Choose your AI models (or keep defaults)
    3. Customize style and settings
    4. Click "Generate Music Video"
    """)
# System requirements info
    with gr.Accordion("💻 System Requirements & Tips", open=False):
gr.Markdown("""
**Hardware Requirements:**
- NVIDIA GPU with 8GB+ VRAM (recommended: RTX 3080/4070 or better)
- 16GB+ system RAM
- Fast storage (SSD recommended)
**Supported Audio Formats:**
- MP3, WAV, M4A, FLAC, OGG
- Recommended: Clear vocals, 30 seconds to 3 minutes
**Performance Tips:**
- Use lower resolution (512x288) for faster generation
- Choose smaller models for quicker processing
- Ensure stable power supply for GPU-intensive tasks
""")
# Main configuration
with gr.Row():
with gr.Column():
audio_input = gr.Audio(
label="π΅ Upload Audio Track",
type="filepath",
info="Upload your music file. For best results, use clear audio with distinct vocals."
)
with gr.Column():
# Quick settings panel
gr.Markdown("### β‘ Quick Settings")
quick_quality = gr.Radio(
choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
value="Balanced (1024x576)",
label="Quality Preset",
info="Higher quality = better results but slower generation"
)
# Model selection tabs
with gr.Tabs():
with gr.TabItem("π€ AI Models"):
gr.Markdown("**Choose the AI models for each processing step:**")
with gr.Row():
with gr.Column():
whisper_dropdown = gr.Dropdown(
label="π€ Transcription Model (Whisper)",
choices=WHISPER_MODELS,
value=DEFAULT_WHISPER_MODEL,
info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
)
llm_dropdown = gr.Dropdown(
label="π§ Scene Description Model (LLM)",
choices=LLM_MODELS,
value=DEFAULT_LLM_MODEL,
info="Language model to generate visual scene descriptions from lyrics."
)
with gr.Column():
image_dropdown = gr.Dropdown(
label="π¨ Image Generation Model",
choices=IMAGE_MODELS,
value=DEFAULT_IMAGE_MODEL,
info="Stable Diffusion model for generating scene images."
)
video_dropdown = gr.Dropdown(
label="π¬ Video Animation Model",
choices=VIDEO_MODELS,
value=DEFAULT_VIDEO_MODEL,
info="Model to animate still images into video clips."
)
with gr.TabItem("βοΈ Scene Prompting"):
gr.Markdown("**Customize how AI generates scene descriptions:**")
with gr.Column():
prompt_template_input = gr.Textbox(
label="LLM Prompt Template",
value=DEFAULT_PROMPT_TEMPLATE,
lines=6,
info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
)
with gr.Row():
max_words_input = gr.Slider(
label="Max Words per Scene",
minimum=10,
maximum=100,
step=5,
value=DEFAULT_MAX_WORDS,
info="Limit words in each scene description (more words = more detailed scenes)."
)
max_sentences_input = gr.Slider(
label="Max Sentences per Scene",
minimum=1,
maximum=5,
step=1,
value=DEFAULT_MAX_SENTENCES,
info="Limit sentences per scene (1-2 recommended for music videos)."
)
style_suffix_input = gr.Textbox(
label="Visual Style Keywords",
value=DEFAULT_STYLE_SUFFIX,
info="Style keywords added to all scenes for consistent visual style (e.g., 'cinematic, vibrant colors')."
)
with gr.TabItem("π¬ Video Settings"):
gr.Markdown("**Configure video output and subtitle styling:**")
with gr.Column():
with gr.Row():
template_dropdown = gr.Dropdown(
label="πͺ Subtitle Animation Style",
choices=template_choices,
value=DEFAULT_TEMPLATE,
info="Choose the kinetic subtitle animation style."
)
res_dropdown = gr.Dropdown(
label="πΊ Video Resolution",
choices=["512x288", "1024x576", "1280x720"],
value=DEFAULT_RESOLUTION,
info="Higher resolution = better quality but much slower generation."
)
with gr.Row():
fps_input = gr.Textbox(
label="ποΈ Video FPS",
value=DEFAULT_FPS_MODE,
info="Frames per second. Use 'Auto' to match lyric timing, or set fixed value (e.g., '24', '30')."
)
seed_input = gr.Number(
label="π± Random Seed",
value=DEFAULT_SEED,
precision=0,
info="Set seed for reproducible results (0 = random). Use same seed to recreate results."
)
with gr.Row():
image_mode_input = gr.Radio(
label="πΌοΈ Scene Generation Mode",
choices=IMAGE_MODES,
value=DEFAULT_IMAGE_MODE,
info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
)
strength_slider = gr.Slider(
label="π― Style Consistency Strength",
minimum=0.1,
maximum=0.9,
step=0.05,
value=0.5,
visible=False,
info="How much each scene influences the next (lower = more influence, higher = more variety)."
)
crossfade_slider = gr.Slider(
label="π Scene Transition Duration",
minimum=0.0,
maximum=1.0,
step=0.05,
value=DEFAULT_CROSSFADE,
info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
)
# Quick preset handling
def apply_quality_preset(preset):
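        # Each branch returns updates in the order wired below:
        # (res_dropdown, whisper_dropdown, image_dropdown).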
if preset == "Fast (512x288)":
return gr.update(value="512x288"), gr.update(value="tiny"), gr.update(value="stabilityai/sdxl-turbo")
elif preset == "High Quality (1280x720)":
return gr.update(value="1280x720"), gr.update(value="large"), gr.update(value="stabilityai/stable-diffusion-xl-base-1.0")
else: # Balanced
return gr.update(value="1024x576"), gr.update(value="medium.en"), gr.update(value="stabilityai/stable-diffusion-xl-base-1.0")
quick_quality.change(
apply_quality_preset,
inputs=[quick_quality],
outputs=[res_dropdown, whisper_dropdown, image_dropdown]
)
# Make strength slider visible only when Consistent mode is selected
def update_strength_visibility(mode):
return gr.update(visible=(mode == "Consistent (Img2Img)"))
image_mode_input.change(update_strength_visibility, inputs=image_mode_input, outputs=strength_slider)
# Enhanced preview section
with gr.Row():
with gr.Column(scale=1):
            preview_btn = gr.Button("🔍 Preview First Scene", variant="secondary", size="lg")
gr.Markdown("*Generate a quick preview of the first scene to test your settings*")
with gr.Column(scale=2):
            generate_btn = gr.Button("🎬 Generate Complete Music Video", variant="primary", size="lg")
gr.Markdown("*Start the full video generation process (this may take several minutes)*")
# Preview results
with gr.Row(visible=False) as preview_row:
with gr.Column():
preview_img = gr.Image(label="Preview Scene", type="pil", height=300)
with gr.Column():
preview_prompt = gr.Textbox(label="Generated Scene Description", lines=3)
preview_info = gr.Markdown("")
    # Status reporting (gr.Progress is meant to be passed as an event-handler
    # argument, so progress here is surfaced through the status textbox instead)
status_text = gr.Textbox(
label="π Generation Status",
value="Ready to start...",
interactive=False,
lines=2
)
# Results section with better organization
with gr.Tabs():
with gr.TabItem("π₯ Final Video"):
output_video = gr.Video(label="Generated Music Video", format="mp4", height=400)
with gr.Row():
                download_file = gr.File(label="📥 Download Video File", file_count="single")
video_info = gr.Textbox(label="Video Information", lines=2, interactive=False)
with gr.TabItem("πΌοΈ Generated Images"):
image_gallery = gr.Gallery(
label="Scene Images from Video Generation",
columns=3,
rows=2,
height="auto",
object_fit="contain",
show_label=True
)
gallery_info = gr.Markdown("*Scene images will appear here after generation*")
with gr.TabItem("π Scene Descriptions"):
with gr.Accordion("Generated Scene Prompts", open=True):
prompt_text = gr.Markdown("", elem_id="prompt_markdown")
segment_info = gr.Textbox(
label="Segmentation Summary",
lines=3,
interactive=False,
placeholder="Segment analysis will appear here..."
)
# Preview function
def on_preview(
audio, whisper_model, llm_model, image_model,
prompt_template, max_words, max_sentences, style_suffix, resolution
):
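        # Returns a 4-tuple matching the preview outputs wired below:
        # (preview_row visibility, preview image, scene prompt text, info markdown).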
if not audio:
return (gr.update(visible=False), None, "Please upload audio first",
"β οΈ **No audio file provided**\n\nPlease upload an audio file to generate a preview.")
# Quick transcription and segmentation of first few seconds
try:
# Extract first 10 seconds of audio for quick preview
import subprocess
import tempfile
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
temp_audio_path = temp_audio.name
# Use ffmpeg to extract first 10 seconds
subprocess.run([
"ffmpeg", "-y", "-i", audio, "-ss", "0", "-t", "10",
"-acodec", "pcm_s16le", temp_audio_path
], check=True, capture_output=True)
            # Transcribe with the fastest model for preview; always remove the temp clip
            try:
                result = transcribe_audio(temp_audio_path, "tiny")
                segments = segment_lyrics(result)
            finally:
                os.unlink(temp_audio_path)
if not segments:
return (gr.update(visible=False), None, "No speech detected in first 10 seconds",
"β οΈ **No speech detected**\n\nTry with audio that has clear vocals at the beginning.")
first_segment = segments[0]
# Format prompt template
formatted_prompt = prompt_template.format(
max_words=max_words,
max_sentences=max_sentences,
lyrics=first_segment["text"]
)
# Generate prompt
scene_prompt = generate_scene_prompts(
[first_segment],
llm_model=llm_model,
prompt_template=formatted_prompt,
style_suffix=style_suffix
)[0]
# Generate image
if resolution and "x" in resolution.lower():
width, height = map(int, resolution.lower().split("x"))
else:
width, height = 1024, 576
image = preview_image_generation(
scene_prompt,
image_model=image_model,
width=width,
height=height
)
# Create info text
duration = first_segment['end'] - first_segment['start']
            info_text = f"""✅ **Preview Generated Successfully**
**Detected Lyrics:** "{first_segment['text'][:100]}{'...' if len(first_segment['text']) > 100 else ''}"
**Scene Duration:** {duration:.1f} seconds
**Generated Description:** {scene_prompt[:150]}{'...' if len(scene_prompt) > 150 else ''}
**Image Resolution:** {width}x{height}
"""
return gr.update(visible=True), image, scene_prompt, info_text
        except subprocess.CalledProcessError:
            return (gr.update(visible=False), None, "Audio processing failed",
                    "❌ **Audio Processing Error**\n\nFFmpeg failed to process the audio file. Please check the format.")
        except Exception as e:
            print(f"Preview error: {e}")
            return (gr.update(visible=False), None, f"Preview failed: {str(e)}",
                    f"❌ **Preview Error**\n\n{str(e)}\n\nPlease check your audio file and model settings.")
# Bind button click to processing function
def on_generate(
audio, whisper_model, llm_model, image_model, video_model,
template_name, resolution, fps, seed, prompt_template,
max_words, max_sentences, style_suffix, image_mode, strength,
crossfade_duration
):
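        # Returns a 7-tuple matching the generate outputs wired below: (video path,
        # download file, prompts markdown, gallery images, status text, video info, segment summary).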
if not audio:
return (None, None, gr.update(value="**No audio file provided**\n\nPlease upload an audio file to start generation.", visible=True),
[], "Ready to start...", "", "")
try:
# Start generation
start_time = time.time()
final_path, work_dir = process_audio(
audio, whisper_model, llm_model, image_model, video_model,
template_name, resolution, fps, int(seed), prompt_template,
max_words, max_sentences, style_suffix, image_mode, strength,
crossfade_duration
)
generation_time = time.time() - start_time
# Load prompts from file to display
prompts_file = os.path.join(work_dir, "prompts.txt")
prompts_markdown = ""
try:
with open(prompts_file, 'r', encoding='utf-8') as pf:
content = pf.read()
# Format prompts as numbered list
prompts_lines = content.strip().splitlines()
prompts_markdown = "\n".join([f"**{line}**" for line in prompts_lines])
            except Exception:
                prompts_markdown = "Scene prompts not available"
            # Build a generation summary; compute the file size first so it is
            # still available below if ffprobe fails
            file_size = os.path.getsize(final_path) / (1024 * 1024)  # MB
            segment_summary = ""
            try:
                # Get the audio duration via ffprobe
                import subprocess
                duration_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                                "-of", "default=noprint_wrappers=1:nokey=1", audio]
                audio_duration = float(subprocess.check_output(duration_cmd, text=True).strip())
                segment_summary = f"""📊 **Generation Summary:**
• Audio Duration: {audio_duration:.1f} seconds
• Processing Time: {generation_time/60:.1f} minutes
• Final Video Size: {file_size:.1f} MB
• Resolution: {resolution}
• Template: {template_name}"""
            except Exception:
                segment_summary = f"Generation completed in {generation_time/60:.1f} minutes"
# Load generated images for the gallery
images = []
try:
import glob
image_files = glob.glob(os.path.join(work_dir, "*_img.png"))
for img_file in sorted(image_files):
try:
img = Image.open(img_file)
images.append(img)
                    except Exception:
                        pass
except Exception as e:
print(f"Error loading images for gallery: {e}")
            # Summarize the generated file for the info box
            video_info_text = f"✅ Video generated successfully!\nFile: {os.path.basename(final_path)}\nSize: {file_size:.1f} MB"
            gallery_info_text = f"**{len(images)} scene images generated**" if images else "No images available"
            return (final_path, final_path, gr.update(value=prompts_markdown, visible=True),
                    images, f"✅ Generation complete! ({generation_time/60:.1f} minutes)",
                    video_info_text, segment_summary)
except Exception as e:
error_msg = str(e)
print(f"Generation error: {e}")
import traceback
traceback.print_exc()
return (None, None, gr.update(value=f"**β Generation Failed**\n\n{error_msg}", visible=True),
[], f"β Error: {error_msg}", "", "")
preview_btn.click(
on_preview,
inputs=[
audio_input, whisper_dropdown, llm_dropdown, image_dropdown,
prompt_template_input, max_words_input, max_sentences_input,
style_suffix_input, res_dropdown
],
outputs=[preview_row, preview_img, preview_prompt, preview_info]
)
generate_btn.click(
on_generate,
inputs=[
audio_input, whisper_dropdown, llm_dropdown, image_dropdown, video_dropdown,
template_dropdown, res_dropdown, fps_input, seed_input, prompt_template_input,
max_words_input, max_sentences_input, style_suffix_input,
image_mode_input, strength_slider, crossfade_slider
],
outputs=[output_video, download_file, prompt_text, image_gallery, status_text, video_info, segment_info]
)
if __name__ == "__main__":
# Uncomment for custom hosting options
# demo.launch(server_name='0.0.0.0', server_port=7860)
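    # demo.launch(share=True)  # e.g., to create a temporary public link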
demo.launch() |