Loading full fine tuned model

#5 · opened by alfredplpl

Great work!

I have tried to fully fine-tune these models (transformer and transformer_2) with DiffSynth-Studio.
However, I could not load the fine-tuned models.

My test code is as follows:

import os

import torch
import numpy as np
from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from diffusers import FlowMatchEulerDiscreteScheduler
from PIL import Image
from transformers import CLIPVisionModel, AutoImageProcessor

print("Loading transformers...")
transformer_high_noise = WanTransformer3DModel.from_single_file(
    "./high_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)

transformer_low_noise = WanTransformer3DModel.from_single_file(
    "./low_noise.safetensors", 
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)

image_encoder = CLIPVisionModel.from_pretrained("Wan-AI/Wan2.1-I2V-14B-720P-Diffusers", subfolder="image_encoder", torch_dtype=torch.float32)
image_processor = AutoImageProcessor.from_pretrained("Wan-AI/Wan2.1-I2V-14B-720P-Diffusers", subfolder="image_processor")

print("Creating pipeline with transformers...")
pipe = WanImageToVideoPipeline.from_pretrained("Wan-AI/Wan2.2-I2V-A14B-Diffusers", torch_dtype=torch.bfloat16)
pipe.transformer = transformer_high_noise
pipe.transformer_2 = transformer_low_noise

pipe.image_processor = image_processor
pipe.image_encoder = image_encoder

# Lightning LoRA
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/high_noise_model.safetensors",
    adapter_name="high",
)
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/low_noise_model.safetensors",
    adapter_name="low",
    load_into_transformer_2=True,
)

pipe.set_adapters(["high", "low"], adapter_weights=[2., 2.])

pipe.enable_model_cpu_offload()

prompt = "A young girl with blue hair is walking in a cherry blossom park, petals gently falling around her. The girl wears a school uniform. The girl is smiling."
negative_prompt = "3d, cg, photo, stop, wait"

image = Image.open("./image.png")

max_area = 480 * 832
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))

output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=int(16*5+1),
    guidance_scale=1.0,
    guidance_scale_2=1.0,
    num_inference_steps=4
).frames[0]

export_to_video(output, "output.mp4", fps=15)

The result is as follows (the input image, image.png, and the output video are attached in the original post).

What do you think?


In addition, I ran another test script using DiffSynth-Studio.

The test code is as follows:

import torch
from PIL import Image
from diffsynth import save_video
from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
from modelscope import dataset_snapshot_download

pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(path="high_noise.safetensors", offload_device="cpu"),
        ModelConfig(path="low_noise.safetensors", offload_device="cpu"),
        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
    ],
)
pipe.enable_vram_management()

input_image = Image.open("image.png").resize((480, 832))

video = pipe(
    prompt = "A young girl with blue hair is walking in a cherry blossom park, petals gently falling around her. The girl wears a school uniform. The girl is smiling.",
    negative_prompt = "3d, cg, photo, stop, wait",
    seed=0, tiled=True,
    input_image=input_image,
    width=480,
    height=832,
    switch_DiT_boundary=0.9,
)
save_video(video, "video1.mp4", fps=16, quality=9)

The result is as follows (the output video is attached in the original post).

Thanks in advance!

OK, I found the bug.
When the transformers are loaded from single files, config.image_dim is not None.
So we should set config.image_dim to None:

pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    transformer=transformer_high_noise,  # High noise goes to main transformer
    transformer_2=transformer_low_noise,  # Low noise goes to transformer_2
    torch_dtype=torch.bfloat16,
)
pipe.transformer.config.image_dim = None
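
Before applying the override, it can help to confirm what from_single_file actually produced. Here is a minimal sketch of such a check; it reuses the transformer objects loaded above, and the assumption that the official repo config has image_dim set to null is mine:

from diffusers import WanTransformer3DModel

# What the single-file loader put into the fine-tuned checkpoints' configs
# (expected to be non-None, which triggers the bug described above).
print("high-noise image_dim:", transformer_high_noise.config.image_dim)
print("low-noise image_dim:", transformer_low_noise.config.image_dim)

# What the official Wan2.2 I2V A14B Diffusers config specifies; load_config
# only fetches the config JSON, not the weights.
ref_cfg = WanTransformer3DModel.load_config(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers", subfolder="transformer"
)
print("reference image_dim:", ref_cfg.get("image_dim"))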

As a result, the full working test code is as follows:

import os

import torch
import numpy as np
from diffusers import AutoencoderKLWan, WanTransformer3DModel, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from diffusers import FlowMatchEulerDiscreteScheduler
from PIL import Image
from transformers import CLIPVisionModel, AutoImageProcessor
from torchvision import transforms

print("Loading transformers...")
transformer_high_noise = WanTransformer3DModel.from_single_file(
    "./high_noise.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)

transformer_low_noise = WanTransformer3DModel.from_single_file(
    "./low_noise.safetensors", 
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=False
)

print("Creating pipeline with transformers...")

pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    transformer=transformer_high_noise,  # High noise goes to main transformer
    transformer_2=transformer_low_noise,  # Low noise goes to transformer_2
    torch_dtype=torch.bfloat16,
)
pipe.transformer.config.image_dim = None

# Lightning LoRA
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/high_noise_model.safetensors",
    adapter_name="high",
)
pipe.load_lora_weights(
    "lightx2v/Wan2.2-Lightning",
    weight_name="Wan2.2-I2V-A14B-4steps-lora-rank64-Seko-V1/low_noise_model.safetensors",
    adapter_name="low",
    load_into_transformer_2=True,
)

pipe.set_adapters(["high", "low"], adapter_weights=[2., 2.])

pipe.enable_model_cpu_offload()

prompt = "A young girl with blue hair is walking in a cherry blossom park, petals gently falling around her. The girl wears a school uniform. The girl is smiling."
negative_prompt = "3d, cg, photo, stop, wait"

image = Image.open("./image.png")

max_area = 720 * 1280
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))

output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=int(16*5+1),
    guidance_scale=1.0,
    guidance_scale_2=1.0,
    num_inference_steps=4
).frames[0]

export_to_video(output, "output.mp4", fps=16)

Output: (video attached in the original post)

Please fix this in the diffusers code.

Thanks for the detailed discussion. Could you please report this bug with a minimal reproducible snippet on our GitHub repository? Would appreciate that very much :)

Thank you for your advice. I reported it at https://github.com/huggingface/diffusers/issues/12329.
