Lightricks
/

LTX-Video

@@ -121,58 +121,220 @@ Make sure you install `diffusers` before trying out the examples below.
 pip install -U git+https://github.com/huggingface/diffusers
 ```
-Now, you can run the examples below:
 ```py
 import torch
-from diffusers import LTXPipeline
 from diffusers.utils import export_to_video
-pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
 pipe.to("cuda")
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 video = pipe(
     prompt=prompt,
     negative_prompt=negative_prompt,
-    width=704,
-    height=480,
-    num_frames=161,
-    num_inference_steps=50,
 ).frames[0]
 export_to_video(video, "output.mp4", fps=24)
 ```
-For image-to-video:
 ```py
 import torch
-from diffusers import LTXImageToVideoPipeline
 from diffusers.utils import export_to_video, load_image
-pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
 pipe.to("cuda")
-image = load_image(
-    "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
-)
-prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 video = pipe(
-    image=image,
     prompt=prompt,
     negative_prompt=negative_prompt,
-    width=704,
-    height=480,
-    num_frames=161,
-    num_inference_steps=50,
 ).frames[0]
 export_to_video(video, "output.mp4", fps=24)
 ```
 To learn more, check out the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).
 Diffusers also supports directly loading from the original LTX checkpoints using the `from_single_file()` method. Check out [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video#loading-single-files) to learn more.

 pip install -U git+https://github.com/huggingface/diffusers
 ```
+Now, you can run the examples below (note that the upsampling stage is optional but recommended):
+### For text-to-video:
 ```py
 import torch
+from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
+from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
 from diffusers.utils import export_to_video
+pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
+pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
+pipe_upsample.to("cuda")
+pipe.vae.enable_tiling()
+prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+expected_height, expected_width = 704, 512
+downscale_factor = 2 / 3
+num_frames = 121
+# Part 1. Generate video at smaller resolution
+downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+latents = pipe(
+    conditions=None,
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=downscaled_width,
+    height=downscaled_height,
+    num_frames=num_frames,
+    num_inference_steps=30,
+    generator=torch.Generator().manual_seed(0),
+    output_type="latent",
+).frames
+# Part 2. Upscale generated video using latent upsampler with fewer inference steps
+# The available latent upsampler upscales the height/width by 2x
+upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+upscaled_latents = pipe_upsample(
+    latents=latents,
+    output_type="latent"
+).frames
+# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
 video = pipe(
     prompt=prompt,
     negative_prompt=negative_prompt,
+    width=upscaled_width,
+    height=upscaled_height,
+    num_frames=num_frames,
+    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
+    num_inference_steps=10,
+    latents=upscaled_latents,
+    decode_timestep=0.05,
+    image_cond_noise_scale=0.025,
+    generator=torch.Generator().manual_seed(0),
+    output_type="pil",
 ).frames[0]
+# Part 4. Downscale the video to the expected resolution
+video = [frame.resize((expected_width, expected_height)) for frame in video]
 export_to_video(video, "output.mp4", fps=24)
 ```
+### For image-to-video:
 ```py
 import torch
+from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
+from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
 from diffusers.utils import export_to_video, load_image
+pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
+pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
+pipe_upsample.to("cuda")
+pipe.vae.enable_tiling()
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png")
+video = [image]
+condition1 = LTXVideoCondition(video=video, frame_index=0)
+prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+expected_height, expected_width = 832, 480
+downscale_factor = 2 / 3
+num_frames = 96
+# Part 1. Generate video at smaller resolution
+downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
+latents = pipe(
+    conditions=[condition1],
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=downscaled_width,
+    height=downscaled_height,
+    num_frames=num_frames,
+    num_inference_steps=30,
+    generator=torch.Generator().manual_seed(0),
+    output_type="latent",
+).frames
+# Part 2. Upscale generated video using latent upsampler with fewer inference steps
+# The available latent upsampler upscales the height/width by 2x
+upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+upscaled_latents = pipe_upsample(
+    latents=latents,
+    output_type="latent"
+).frames
+# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
+video = pipe(
+    conditions=[condition1],
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=upscaled_width,
+    height=upscaled_height,
+    num_frames=num_frames,
+    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
+    num_inference_steps=10,
+    latents=upscaled_latents,
+    decode_timestep=0.05,
+    image_cond_noise_scale=0.025,
+    generator=torch.Generator().manual_seed(0),
+    output_type="pil",
+).frames[0]
+# Part 4. Downscale the video to the expected resolution
+video = [frame.resize((expected_width, expected_height)) for frame in video]
+export_to_video(video, "output.mp4", fps=24)
+```
+### For video-to-video:
+```py
+import torch
+from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
+from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
+from diffusers.utils import export_to_video, load_video
+pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
+pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+pipe_upsample.to("cuda")
+pipe.vae.enable_tiling()
+def round_to_nearest_resolution_acceptable_by_vae(height, width):
+    height = height - (height % pipe.vae_temporal_compression_ratio)
+    width = width - (width % pipe.vae_temporal_compression_ratio)
+    return height, width
+video = load_video(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
+)[:21]  # Use only the first 21 frames as conditioning
+condition1 = LTXVideoCondition(video=video, frame_index=0)
+prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
+negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+expected_height, expected_width = 768, 1152
+downscale_factor = 2 / 3
+num_frames = 161
+# Part 1. Generate video at smaller resolution
+downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
+latents = pipe(
+    conditions=[condition1],
+    prompt=prompt,
+    negative_prompt=negative_prompt,
+    width=downscaled_width,
+    height=downscaled_height,
+    num_frames=num_frames,
+    num_inference_steps=30,
+    generator=torch.Generator().manual_seed(0),
+    output_type="latent",
+).frames
+# Part 2. Upscale generated video using latent upsampler with fewer inference steps
+# The available latent upsampler upscales the height/width by 2x
+upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+upscaled_latents = pipe_upsample(
+    latents=latents,
+    output_type="latent"
+).frames
+# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
 video = pipe(
+    conditions=[condition1],
     prompt=prompt,
     negative_prompt=negative_prompt,
+    width=upscaled_width,
+    height=upscaled_height,
+    num_frames=num_frames,
+    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
+    num_inference_steps=10,
+    latents=upscaled_latents,
+    decode_timestep=0.05,
+    image_cond_noise_scale=0.025,
+    generator=torch.Generator().manual_seed(0),
+    output_type="pil",
 ).frames[0]
+# Part 4. Downscale the video to the expected resolution
+video = [frame.resize((expected_width, expected_height)) for frame in video]
 export_to_video(video, "output.mp4", fps=24)
 ```
 To learn more, check out the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).
 Diffusers also supports directly loading from the original LTX checkpoints using the `from_single_file()` method. Check out [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video#loading-single-files) to learn more.