Spaces · Running on Zero
Commit afd038c · 1 parent: 826f24c

Fix ZeroGPU pickle error by removing lambda functions and fixing progress callbacks
Files changed:
- README.md (+0, -8)
- app.py (+18, -20)
- requirements.txt (+1, -1)
- utils/prompt_gen.py (+2, -0)
- utils/transcribe.py (+2, -0)
- utils/video_gen.py (+3, -0)
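For context: on ZeroGPU Spaces, functions marked with `@spaces.GPU` run in a separate GPU-holding process, so their arguments and any callbacks handed across that boundary must be picklable. Named module-level functions are; lambdas are not. A minimal sketch of the failure mode this commit removes (the `on_progress` name is illustrative, not from this repo):

```python
import pickle

def on_progress(frac: float, desc: str = "") -> None:
    # A named module-level function pickles by qualified name and can
    # therefore cross the ZeroGPU process boundary.
    print(f"{frac:.0%} {desc}")

pickle.dumps(on_progress)  # OK

try:
    pickle.dumps(lambda frac, desc="": None)
except pickle.PicklingError as err:
    # Lambdas have no importable name, so pickling fails: the kind of
    # "ZeroGPU pickle error" named in the commit message.
    print(err)
```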
README.md
CHANGED
@@ -1,11 +1,3 @@
----
-license: apache-2.0
-title: Audio2KineticVid
-sdk: gradio
-emoji: 🎵
-colorFrom: yellow
-colorTo: red
----
 # Audio2KineticVid
 
 Audio2KineticVid is a comprehensive tool that converts an audio track (e.g., a song) into a dynamic music video with AI-generated scenes and synchronized kinetic typography (animated subtitles). Everything runs locally using open-source models – no external APIs or paid services required.
app.py
CHANGED
@@ -7,7 +7,6 @@ import gradio as gr
 import torch
 from PIL import Image
 import time
-import spaces
 
 # Import pipeline modules
 from utils.transcribe import transcribe_audio, list_available_whisper_models
@@ -71,7 +70,6 @@ DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"
 IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
 DEFAULT_IMAGE_MODE = "Independent"
 
-@spaces.GPU
 def process_audio(
     audio_path,
     whisper_model,
@@ -311,7 +309,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
             audio_input = gr.Audio(
                 label="🎵 Upload Audio Track",
                 type="filepath",
-
+                info="Upload your music file. For best results, use clear audio with distinct vocals."
             )
         with gr.Column():
            # Quick settings panel
@@ -320,7 +318,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
                value="Balanced (1024x576)",
                label="Quality Preset",
-
+               info="Higher quality = better results but slower generation"
            )
 
            # Model selection tabs
@@ -333,26 +331,26 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                        label="🎤 Transcription Model (Whisper)",
                        choices=WHISPER_MODELS,
                        value=DEFAULT_WHISPER_MODEL,
-
+                       info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
                    )
                    llm_dropdown = gr.Dropdown(
                        label="🧠 Scene Description Model (LLM)",
                        choices=LLM_MODELS,
                        value=DEFAULT_LLM_MODEL,
-
+                       info="Language model to generate visual scene descriptions from lyrics."
                    )
                with gr.Column():
                    image_dropdown = gr.Dropdown(
                        label="🎨 Image Generation Model",
                        choices=IMAGE_MODELS,
                        value=DEFAULT_IMAGE_MODEL,
-
+                       info="Stable Diffusion model for generating scene images."
                    )
                    video_dropdown = gr.Dropdown(
                        label="🎬 Video Animation Model",
                        choices=VIDEO_MODELS,
                        value=DEFAULT_VIDEO_MODEL,
-
+                       info="Model to animate still images into video clips."
                    )
 
            with gr.TabItem("✍️ Scene Prompting"):
@@ -362,7 +360,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                    label="LLM Prompt Template",
                    value=DEFAULT_PROMPT_TEMPLATE,
                    lines=6,
-
+                   info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
                )
                with gr.Row():
                    max_words_input = gr.Slider(
@@ -371,7 +369,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                        maximum=100,
                        step=5,
                        value=DEFAULT_MAX_WORDS,
-
+                        info="Limit words in each scene description (more words = more detailed scenes)."
                    )
                    max_sentences_input = gr.Slider(
                        label="Max Sentences per Scene",
@@ -379,12 +377,12 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                        maximum=5,
                        step=1,
                        value=DEFAULT_MAX_SENTENCES,
-
+                        info="Limit sentences per scene (1-2 recommended for music videos)."
                    )
                style_suffix_input = gr.Textbox(
                    label="Visual Style Keywords",
                    value=DEFAULT_STYLE_SUFFIX,
-
+                   info="Style keywords added to all scenes for consistent visual style (e.g., 'cinematic, vibrant colors')."
                )
 
            with gr.TabItem("🎬 Video Settings"):
@@ -395,32 +393,32 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                    label="🎪 Subtitle Animation Style",
                    choices=template_choices,
                    value=DEFAULT_TEMPLATE,
-
+                   info="Choose the kinetic subtitle animation style."
                )
                res_dropdown = gr.Dropdown(
                    label="📺 Video Resolution",
                    choices=["512x288", "1024x576", "1280x720"],
                    value=DEFAULT_RESOLUTION,
-
+                   info="Higher resolution = better quality but much slower generation."
                )
                with gr.Row():
                    fps_input = gr.Textbox(
                        label="🎞️ Video FPS",
                        value=DEFAULT_FPS_MODE,
-
+                        info="Frames per second. Use 'Auto' to match lyric timing, or set fixed value (e.g., '24', '30')."
                    )
                    seed_input = gr.Number(
                        label="🌱 Random Seed",
                        value=DEFAULT_SEED,
                        precision=0,
-
+                        info="Set seed for reproducible results (0 = random). Use same seed to recreate results."
                    )
                with gr.Row():
                    image_mode_input = gr.Radio(
                        label="🖼️ Scene Generation Mode",
                        choices=IMAGE_MODES,
                        value=DEFAULT_IMAGE_MODE,
-
+                        info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
                    )
                    strength_slider = gr.Slider(
                        label="🎯 Style Consistency Strength",
@@ -429,7 +427,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                        step=0.05,
                        value=0.5,
                        visible=False,
-
+                        info="How much each scene influences the next (lower = more influence, higher = more variety)."
                    )
                    crossfade_slider = gr.Slider(
                        label="🔄 Scene Transition Duration",
@@ -437,7 +435,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
                        maximum=1.0,
                        step=0.05,
                        value=DEFAULT_CROSSFADE,
-
+                        info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
                    )
 
    # Quick preset handling
@@ -717,4 +715,4 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 if __name__ == "__main__":
    # Uncomment for custom hosting options
    # demo.launch(server_name='0.0.0.0', server_port=7860)
-    demo.launch(
+    demo.launch()
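Two things change in app.py: the `spaces` import and the `@spaces.GPU` decorator leave the top-level `process_audio` orchestrator (the decorator reappears on the GPU-bound functions in `utils/`, below), and the UI components gain plain-string `info` tooltips. A hedged sketch of the resulting component pattern; the choices list is borrowed from `utils/transcribe.py`, the rest is illustrative:

```python
import gradio as gr

WHISPER_MODELS = ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]

with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video") as demo:
    whisper_dropdown = gr.Dropdown(
        label="🎤 Transcription Model (Whisper)",
        choices=WHISPER_MODELS,
        value="medium.en",
        # A plain string serializes cleanly; the lambdas this commit
        # removes did not.
        info="Larger models are more accurate but slower.",
    )

if __name__ == "__main__":
    demo.launch()
```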
requirements.txt
CHANGED
@@ -5,7 +5,7 @@ accelerate>=0.30
 diffusers>=0.34
 torchaudio
 openai-whisper
-pyannote.audio==3.2.
+pyannote.audio==3.2.1
 pycaps @ git+https://github.com/francozanardi/pycaps.git
 ffmpeg-python
 auto-gptq==0.7.1
utils/prompt_gen.py
CHANGED
@@ -6,6 +6,7 @@ try:
 except ImportError:
     AutoGPTQForCausalLM = None
     from transformers import AutoModelForCausalLM
+import spaces
 
 # Cache models and tokenizers
 _llm_cache = {}  # {model_name: (model, tokenizer)}
@@ -51,6 +52,7 @@ def _load_llm(model_name):
 
     return _llm_cache[model_name]
 
+@spaces.GPU
 def generate_scene_prompts(
     segments,
     llm_model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
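Note where the decorator lands: on `generate_scene_prompts`, the function that actually drives the model, while `_load_llm` stays an undecorated cached helper invoked from inside the GPU context. A minimal sketch of that pattern, with stand-in objects in place of the real model and tokenizer:

```python
import spaces

_llm_cache = {}  # {model_name: (model, tokenizer)}, mirroring the repo's cache

def _load_llm(model_name):
    # Undecorated cached loader: it only ever runs inside a function
    # that already holds the GPU, so it needs no @spaces.GPU of its own.
    if model_name not in _llm_cache:
        _llm_cache[model_name] = (object(), object())  # stand-in (model, tokenizer)
    return _llm_cache[model_name]

@spaces.GPU  # the GPU is attached only while this call runs
def generate_scene_prompts(segments, llm_model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"):
    model, tokenizer = _load_llm(llm_model)
    # Stand-in body: the real function prompts the LLM once per segment.
    return [f"scene prompt for: {seg}" for seg in segments]
```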
utils/transcribe.py
CHANGED
@@ -1,4 +1,5 @@
 import whisper
+import spaces
 
 # Cache loaded whisper models to avoid reloading for each request
 _model_cache = {}
@@ -7,6 +8,7 @@ def list_available_whisper_models():
     """Return list of available Whisper models"""
     return ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]
 
+@spaces.GPU
 def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
     """
     Transcribe the given audio file using OpenAI Whisper and return the result dictionary.
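Callers use `transcribe_audio` exactly as before; the decorator claims a GPU only for the duration of the call. A hedged usage sketch, assuming Whisper's standard result dictionary of `text` plus per-segment `start`/`end`/`text` timestamps (the audio path is a placeholder):

```python
from utils.transcribe import transcribe_audio

result = transcribe_audio("song.mp3", model_size="medium.en")

print(result["text"])  # full transcript
for seg in result["segments"]:
    # Segment timings are what later drive scene boundaries and
    # kinetic-subtitle animation.
    print(f'{seg["start"]:7.2f} - {seg["end"]:7.2f}  {seg["text"]}')
```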
utils/video_gen.py
CHANGED
@@ -11,6 +11,7 @@ from diffusers import (
 from PIL import Image
 import numpy as np
 import time
+import spaces
 
 # Global pipelines cache
 _model_cache = {}
@@ -96,6 +97,7 @@ def _load_video_pipeline(model_name):
 
     return _model_cache[model_name]
 
+@spaces.GPU
 def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-xl-base-1.0", width=1024, height=576, seed=None):
     """
     Generate a preview image from a prompt
@@ -125,6 +127,7 @@ def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-x
 
     return image
 
+@spaces.GPU
 def create_video_segments(
     segments,
     scene_prompts,
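One caveat: a bare `@spaces.GPU` requests ZeroGPU's default time slot, and `create_video_segments` renders a clip per scene, which can run long. The decorator also accepts a `duration` argument for longer allocations; a hedged sketch of that variant (the 120-second figure is illustrative and not part of this commit):

```python
import spaces

@spaces.GPU(duration=120)  # request a longer ZeroGPU slot than the default
def create_video_segments(segments, scene_prompts, **kwargs):
    # Stand-in body: the real function generates an image per scene,
    # animates it with the video pipeline, and returns the clip paths.
    return []
```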