doodle-med committed on
Commit afd038c · 1 Parent(s): 826f24c

Fix ZeroGPU pickle error by removing lambda functions and fixing progress callbacks

Files changed (6)
  1. README.md +0 -8
  2. app.py +18 -20
  3. requirements.txt +1 -1
  4. utils/prompt_gen.py +2 -0
  5. utils/transcribe.py +2 -0
  6. utils/video_gen.py +3 -0
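
Background: on ZeroGPU Spaces the `@spaces.GPU` decorator runs the wrapped callable in a separate GPU process, so the function and everything passed into it must be picklable; lambdas and Gradio progress callbacks generally are not, which is what triggers the pickle error named in the commit title. A minimal sketch of the pattern this commit adopts (the `process_audio` and `transcribe_audio` names come from the diff below; the lambda argument is only illustrative of the kind of callback that was removed, not the exact original code):

```python
import spaces

# Before (illustrative only): decorating the whole Gradio handler means any
# unpicklable callable it receives (a lambda, a progress callback) has to
# cross the ZeroGPU process boundary, which raises a pickling error.
#
# @spaces.GPU
# def process_audio(audio_path, whisper_model, on_step=lambda msg: print(msg)):
#     ...

# After: keep @spaces.GPU on plain module-level functions that only take
# picklable arguments (file paths, strings, numbers).
@spaces.GPU
def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
    """Runs on a ZeroGPU-allocated device only for the duration of this call."""
    ...

def process_audio(audio_path, whisper_model):
    # The orchestrating handler stays undecorated and calls the decorated
    # functions directly -- no lambdas or closures wrapping them.
    return transcribe_audio(audio_path, model_size=whisper_model)
```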
README.md CHANGED
@@ -1,11 +1,3 @@
----
-license: apache-2.0
-title: Audio2KineticVid
-sdk: gradio
-emoji: 🚀
-colorFrom: yellow
-colorTo: red
----
 # Audio2KineticVid
 
 Audio2KineticVid is a comprehensive tool that converts an audio track (e.g., a song) into a dynamic music video with AI-generated scenes and synchronized kinetic typography (animated subtitles). Everything runs locally using open-source models – no external APIs or paid services required.
app.py CHANGED
@@ -7,7 +7,6 @@ import gradio as gr
 import torch
 from PIL import Image
 import time
-import spaces
 
 # Import pipeline modules
 from utils.transcribe import transcribe_audio, list_available_whisper_models
@@ -71,7 +70,6 @@ DEFAULT_STYLE_SUFFIX = "cinematic, 35 mm, shallow depth of field, film grain"
 IMAGE_MODES = ["Independent", "Consistent (Img2Img)"]
 DEFAULT_IMAGE_MODE = "Independent"
 
-@spaces.GPU
 def process_audio(
 audio_path,
 whisper_model,
@@ -311,7 +309,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 audio_input = gr.Audio(
 label="🎵 Upload Audio Track",
 type="filepath",
-
+info="Upload your music file. For best results, use clear audio with distinct vocals."
 )
 with gr.Column():
 # Quick settings panel
@@ -320,7 +318,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 choices=["Fast (512x288)", "Balanced (1024x576)", "High Quality (1280x720)"],
 value="Balanced (1024x576)",
 label="Quality Preset",
-
+info="Higher quality = better results but slower generation"
 )
 
 # Model selection tabs
@@ -333,26 +331,26 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 label="🎤 Transcription Model (Whisper)",
 choices=WHISPER_MODELS,
 value=DEFAULT_WHISPER_MODEL,
-
+info="Larger models are more accurate but slower. 'medium.en' is recommended for English."
 )
 llm_dropdown = gr.Dropdown(
 label="🧠 Scene Description Model (LLM)",
 choices=LLM_MODELS,
 value=DEFAULT_LLM_MODEL,
-
+info="Language model to generate visual scene descriptions from lyrics."
 )
 with gr.Column():
 image_dropdown = gr.Dropdown(
 label="🎨 Image Generation Model",
 choices=IMAGE_MODELS,
 value=DEFAULT_IMAGE_MODEL,
-
+info="Stable Diffusion model for generating scene images."
 )
 video_dropdown = gr.Dropdown(
 label="🎬 Video Animation Model",
 choices=VIDEO_MODELS,
 value=DEFAULT_VIDEO_MODEL,
-
+info="Model to animate still images into video clips."
 )
 
 with gr.TabItem("✍️ Scene Prompting"):
@@ -362,7 +360,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 label="LLM Prompt Template",
 value=DEFAULT_PROMPT_TEMPLATE,
 lines=6,
-
+info="Template for generating scene descriptions. Use {lyrics}, {max_words}, and {max_sentences} as placeholders."
 )
 with gr.Row():
 max_words_input = gr.Slider(
@@ -371,7 +369,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 maximum=100,
 step=5,
 value=DEFAULT_MAX_WORDS,
-
+info="Limit words in each scene description (more words = more detailed scenes)."
 )
 max_sentences_input = gr.Slider(
 label="Max Sentences per Scene",
@@ -379,12 +377,12 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 maximum=5,
 step=1,
 value=DEFAULT_MAX_SENTENCES,
-
+info="Limit sentences per scene (1-2 recommended for music videos)."
 )
 style_suffix_input = gr.Textbox(
 label="Visual Style Keywords",
 value=DEFAULT_STYLE_SUFFIX,
-
+info="Style keywords added to all scenes for consistent visual style (e.g., 'cinematic, vibrant colors')."
 )
 
 with gr.TabItem("🎬 Video Settings"):
@@ -395,32 +393,32 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 label="🎪 Subtitle Animation Style",
 choices=template_choices,
 value=DEFAULT_TEMPLATE,
-
+info="Choose the kinetic subtitle animation style."
 )
 res_dropdown = gr.Dropdown(
 label="📺 Video Resolution",
 choices=["512x288", "1024x576", "1280x720"],
 value=DEFAULT_RESOLUTION,
-
+info="Higher resolution = better quality but much slower generation."
 )
 with gr.Row():
 fps_input = gr.Textbox(
 label="🎞️ Video FPS",
 value=DEFAULT_FPS_MODE,
-
+info="Frames per second. Use 'Auto' to match lyric timing, or set fixed value (e.g., '24', '30')."
 )
 seed_input = gr.Number(
 label="🌱 Random Seed",
 value=DEFAULT_SEED,
 precision=0,
-
+info="Set seed for reproducible results (0 = random). Use same seed to recreate results."
 )
 with gr.Row():
 image_mode_input = gr.Radio(
 label="🖼️ Scene Generation Mode",
 choices=IMAGE_MODES,
 value=DEFAULT_IMAGE_MODE,
-
+info="Independent: each scene is unique. Consistent: scenes influence each other for style continuity."
 )
 strength_slider = gr.Slider(
 label="🎯 Style Consistency Strength",
@@ -429,7 +427,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 step=0.05,
 value=0.5,
 visible=False,
-
+info="How much each scene influences the next (lower = more influence, higher = more variety)."
 )
 crossfade_slider = gr.Slider(
 label="🔄 Scene Transition Duration",
@@ -437,7 +435,7 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 maximum=1.0,
 step=0.05,
 value=DEFAULT_CROSSFADE,
-
+info="Smooth crossfade between scenes in seconds (0 = hard cuts, 0.25 = subtle blend)."
 )
 
 # Quick preset handling
@@ -717,4 +715,4 @@ with gr.Blocks(title="Audio → Kinetic-Subtitle Music Video", theme=gr.themes.S
 if __name__ == "__main__":
 # Uncomment for custom hosting options
 # demo.launch(server_name='0.0.0.0', server_port=7860)
-demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
+demo.launch()
requirements.txt CHANGED
@@ -5,7 +5,7 @@ accelerate>=0.30
 diffusers>=0.34
 torchaudio
 openai-whisper
-pyannote.audio==3.2.0
+pyannote.audio==3.2.1
 pycaps @ git+https://github.com/francozanardi/pycaps.git
 ffmpeg-python
 auto-gptq==0.7.1
utils/prompt_gen.py CHANGED
@@ -6,6 +6,7 @@ try:
 except ImportError:
 AutoGPTQForCausalLM = None
 from transformers import AutoModelForCausalLM
+import spaces
 
 # Cache models and tokenizers
 _llm_cache = {} # {model_name: (model, tokenizer)}
@@ -51,6 +52,7 @@ def _load_llm(model_name):
 
 return _llm_cache[model_name]
 
+@spaces.GPU
 def generate_scene_prompts(
 segments,
 llm_model="TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
utils/transcribe.py CHANGED
@@ -1,4 +1,5 @@
 import whisper
+import spaces
 
 # Cache loaded whisper models to avoid reloading for each request
 _model_cache = {}
@@ -7,6 +8,7 @@ def list_available_whisper_models():
 """Return list of available Whisper models"""
 return ["tiny", "base", "small", "medium", "medium.en", "large", "large-v2"]
 
+@spaces.GPU
 def transcribe_audio(audio_path: str, model_size: str = "medium.en"):
 """
 Transcribe the given audio file using OpenAI Whisper and return the result dictionary.
utils/video_gen.py CHANGED
@@ -11,6 +11,7 @@ from diffusers import (
 from PIL import Image
 import numpy as np
 import time
+import spaces
 
 # Global pipelines cache
 _model_cache = {}
@@ -96,6 +97,7 @@ def _load_video_pipeline(model_name):
 
 return _model_cache[model_name]
 
+@spaces.GPU
 def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-xl-base-1.0", width=1024, height=576, seed=None):
 """
 Generate a preview image from a prompt
@@ -125,6 +127,7 @@ def preview_image_generation(prompt, image_model="stabilityai/stable-diffusion-x
 
 return image
 
+@spaces.GPU
 def create_video_segments(
 segments,
 scene_prompts,
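
Net effect of the four `utils/*` changes above: each GPU-bound stage (`transcribe_audio`, `generate_scene_prompts`, `preview_image_generation`, `create_video_segments`) is now individually decorated with `@spaces.GPU`, so the undecorated `process_audio` in app.py can chain them with only picklable arguments. A hedged sketch of that call chain (only the parameters visible in the diff are real; the import paths for `prompt_gen` and `video_gen` and the remaining arguments are assumptions):

```python
from utils.transcribe import transcribe_audio
from utils.prompt_gen import generate_scene_prompts
from utils.video_gen import create_video_segments

def process_audio(audio_path, whisper_model, llm_model):
    # Each call below hits a @spaces.GPU-decorated, module-level function,
    # so only picklable values (paths, strings, lists, dicts) cross the
    # ZeroGPU process boundary.
    result = transcribe_audio(audio_path, model_size=whisper_model)
    # Assumption: the Whisper result dict exposes timed segments under "segments".
    prompts = generate_scene_prompts(result["segments"], llm_model=llm_model)
    return create_video_segments(result["segments"], prompts)
```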