Loading a fully fine-tuned model
Great work!
I have tried to fully fine-tune these models (transformer and transformer_2) with DiffSynth-Studio.
However, I could not load the resulting models.
My test code is as follows:
import os
import torch
import numpy as np
from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from diffusers import FlowMatchEulerDiscreteScheduler
from PIL import Image
from transformers import CLIPVisionModel, AutoImageProcessor
print("Loading transformers...")
transformer_high_noise = WanTransformer3DModel.from_single_file(
    "./high_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)
transformer_low_noise = WanTransformer3DModel.from_single_file(
    "./low_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)
image_encoder = CLIPVisionModel.from_pretrained("Wan-AI/Wan2.1-I2V-14B-720P-Diffusers", subfolder="image_encoder", torch_dtype=torch.float32)
image_processor = AutoImageProcessor.from_pretrained("Wan-AI/Wan2.1-I2V-14B-720P-Diffusers", subfolder="image_processor")
print("Creating pipeline with transformers...")
pipe = WanImageToVideoPipeline.from_pretrained("Wan-AI/Wan2.2-I2V-A14B-Diffusers", torch_dtype=torch.bfloat16)
pipe.transformer = transformer_high_noise
pipe.transformer_2 = transformer_low_noise
pipe.image_processor = image_processor
pipe.image_encoder = image_encoder
# Lightning LoRA
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/high_noise_model.safetensors",
    adapter_name="high",
)
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/low_noise_model.safetensors",
    adapter_name="low",
    load_into_transformer_2=True,
)
pipe.set_adapters(["high", "low"], adapter_weights=[2., 2.])
pipe.enable_model_cpu_offload()
prompt = "A young girl with blue hair is walking in a cherry blossom park, petals gently falling around her. The girl wears a school uniform. The girl is smiling."
negative_prompt = "3d, cg, photo, stop, wait"
image = Image.open("./image.png")
max_area = 480 * 832
aspect_ratio = image.height / image.width
# Snap height/width to multiples of the VAE spatial stride times the transformer patch size.
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))
output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=int(16 * 5 + 1),
    guidance_scale=1.0,
    guidance_scale_2=1.0,
    num_inference_steps=4
).frames[0]
export_to_video(output, "output.mp4", fps=15)
The result is as follows:
input: (attached image)
output: (attached video)
What do you think?
In addition, I ran another test with DiffSynth-Studio.
The test code is as follows:
import torch
from PIL import Image
from diffsynth import save_video
from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
from modelscope import dataset_snapshot_download
pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(path="high_noise.safetensors", offload_device="cpu"),
        ModelConfig(path="low_noise.safetensors", offload_device="cpu"),
        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
    ],
)
pipe.enable_vram_management()
input_image = Image.open("image.png").resize((480, 832))
video = pipe(
    prompt="A young girl with blue hair is walking in a cherry blossom park, petals gently falling around her. The girl wears a school uniform. The girl is smiling.",
    negative_prompt="3d, cg, photo, stop, wait",
    seed=0, tiled=True,
    input_image=input_image,
    width=480,
    height=832,
    switch_DiT_boundary=0.9,
)
save_video(video, "video1.mp4", fps=16, quality=9)
The result is as follows (output video attached):
Thanks in advance!
OK, I found the bug.
If we load the transformers from single files, config.image_dim is not None.
So we should set config.image_dim to None:
pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    transformer=transformer_high_noise,    # High noise goes to main transformer
    transformer_2=transformer_low_noise,   # Low noise goes to transformer_2
    torch_dtype=torch.bfloat16,
)
pipe.transformer.config.image_dim = None
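To confirm the mismatch, you can compare the config inferred from the single file against the config shipped in the Diffusers repo. A minimal check (just a sketch; it assumes your diffusers version exposes WanTransformer3DModel.load_config with a subfolder argument):
config = WanTransformer3DModel.load_config(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers", subfolder="transformer"
)
print(config.get("image_dim"))                  # expected to be None, per the fix above
print(transformer_high_noise.config.image_dim)  # not None when loaded from the single file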
As a result, the complete working test code is as follows:
import os
import torch
import numpy as np
from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from diffusers import FlowMatchEulerDiscreteScheduler
from PIL import Image
from transformers import CLIPVisionModel, AutoImageProcessor
from torchvision import transforms
print("Loading transformers...")
transformer_high_noise = WanTransformer3DModel.from_single_file(
    "./high_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)
transformer_low_noise = WanTransformer3DModel.from_single_file(
    "./low_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)
print("Creating pipeline with transformers...")
pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    transformer=transformer_high_noise,    # High noise goes to main transformer
    transformer_2=transformer_low_noise,   # Low noise goes to transformer_2
    torch_dtype=torch.bfloat16,
)
pipe.transformer.config.image_dim = None
# Lightning LoRA
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/high_noise_model.safetensors",
    adapter_name="high",
)
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/low_noise_model.safetensors",
    adapter_name="low",
    load_into_transformer_2=True,
)
pipe.set_adapters(["high", "low"], adapter_weights=[2., 2.])
pipe.enable_model_cpu_offload()
prompt = "A young girl with blue hair is walking in a cherry blossom park, petals gently falling around her. The girl wears a school uniform. The girl is smiling."
negative_prompt = "3d, cg, photo, stop, wait"
image = Image.open("./image.png")
max_area = 720 * 1280
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))
output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=int(16 * 5 + 1),
    guidance_scale=1.0,
    guidance_scale_2=1.0,
    num_inference_steps=4
).frames[0]
export_to_video(output, "output.mp4", fps=16)
Output: (attached video)
Please fix this in the diffusers code.
Thanks for the detailed discussion. Could you please report this bug with a minimal reproducible snippet on our GitHub repository? Would appreciate that very much :)
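For reference, a minimal snippet along these lines (the path is a placeholder for the fine-tuned single-file checkpoint) should be enough to reproduce the config mismatch in the report:
import torch
from diffusers import WanTransformer3DModel, WanImageToVideoPipeline

# Placeholder path: a fully fine-tuned Wan2.2 I2V checkpoint saved as a single safetensors file.
transformer = WanTransformer3DModel.from_single_file(
    "./high_noise.safetensors",
    torch_dtype=torch.bfloat16,
)
pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
# Expected: None, as in the repo config; observed: a non-None value after from_single_file,
# which presumably sends the pipeline down the image-encoder branch.
print(pipe.transformer.config.image_dim)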