Spaces: Running on Zero
__object__:
  path: projects.video_diffusion_sr.train
  name: VideoDiffusionTrainer
dit:
  model:
    __object__:
      path: models.dit_v2.nadit
      name: NaDiT
      args: as_params
    vid_in_channels: 33
    vid_out_channels: 16
    vid_dim: 2560
    vid_out_norm: fusedrms
    txt_in_dim: 5120
    txt_in_norm: fusedln
    txt_dim: ${.vid_dim}
    emb_dim: ${eval:'6 * ${.vid_dim}'}
    heads: 20
    head_dim: 128 # llm-like
    expand_ratio: 4
    norm: fusedrms
    norm_eps: 1.0e-05
    ada: single
    qk_bias: False
    qk_norm: fusedrms
    patch_size: [ 1,2,2 ]
    num_layers: 32 # llm-like
    mm_layers: 10
    mlp_type: swiglu
    msa_type: None
    block_type: ${eval:'${.num_layers} * ["mmdit_sr"]'} # space-full
    window: ${eval:'${.num_layers} * [(4,3,3)]'} # space-full
    window_method: ${eval:'${.num_layers} // 2 * ["720pwin_by_size_bysize","720pswin_by_size_bysize"]'} # space-full
    rope_type: mmrope3d
    rope_dim: 128
  compile: False
  gradient_checkpoint: True
  fsdp:
    sharding_strategy: _HYBRID_SHARD_ZERO2
ema:
  decay: 0.9998
vae:
  model:
    __inherit__: models/video_vae_v3/s8_c16_t4_inflation_sd3.yaml
    freeze_encoder: False
    # gradient_checkpoint: True
  slicing:
    split_size: 4
    memory_device: same
  memory_limit:
    conv_max_mem: 0.5
    norm_max_mem: 0.5
  checkpoint: ./ckpts/ema_vae.pth
  scaling_factor: 0.9152
  compile: False
  grouping: False
  dtype: bfloat16
diffusion:
  schedule:
    type: lerp
    T: 1000.0
  sampler:
    type: euler
    prediction_type: v_lerp
  timesteps:
    training:
      type: logitnormal
      loc: 0.0
      scale: 1.0
    sampling:
      type: uniform_trailing
      steps: 50
      transform: True
  loss:
    type: v_lerp
  cfg:
    scale: 7.5
    rescale: 0
condition:
  i2v: 0.0
  v2v: 0.0
  sr: 1.0
  noise_scale: 0.25