# Root model configuration: `target` names the class and `params` carries its
# settings. NOTE(review): the exact meaning of each key is defined by the
# target class, which is not part of this file — confirm there before tuning.
target: model.cldm.ControlLDM
params:
  # Schedule / timestep settings.
  # NOTE(review): linear_start/linear_end look like the endpoints of a linear
  # noise schedule — confirm against the model code.
  linear_start: 0.00085
  linear_end: 0.0120
  num_timesteps_cond: 1
  log_every_t: 200
  timesteps: 1000

  # Names of the batch entries the model reads.
  # NOTE(review): presumably image / caption / control hint — verify against
  # the dataset that produces the batches.
  first_stage_key: "jpg"
  cond_stage_key: "txt"
  control_key: "hint"

  image_size: 64
  channels: 4
  cond_stage_trainable: false
  conditioning_key: crossattn
  monitor: val/loss_simple_ema
  scale_factor: 0.18215
  use_ema: false

  sd_locked: true
  only_mid_control: false
  # Learning rate. Written with an explicit mantissa dot: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `1e-4` as the *string* "1e-4", whereas
  # `1.0e-4` is a float under every common loader.
  learning_rate: 1.0e-4
  
  # Settings for the `model.cldm.ControlNet` target.
  control_stage_config:
    target: model.cldm.ControlNet
    params:
      # Channel widths and block counts.
      image_size: 32  # unused
      in_channels: 4
      hint_channels: 4
      model_channels: 320
      channel_mult: [1, 2, 4, 4]
      num_res_blocks: 2
      # Attention / transformer options.
      attention_resolutions: [4, 2, 1]
      num_head_channels: 64  # need to fix for flash-attn
      use_spatial_transformer: true
      use_linear_in_transformer: true
      transformer_depth: 1
      context_dim: 1024
      # Remaining switches.
      legacy: false
      use_checkpoint: true

  # Settings for the `model.cldm.ControlledUnetModel` target.
  unet_config:
    target: model.cldm.ControlledUnetModel
    params:
      # Channel widths and block counts.
      image_size: 32  # unused
      in_channels: 4
      out_channels: 4
      model_channels: 320
      channel_mult: [1, 2, 4, 4]
      num_res_blocks: 2
      # Attention / transformer options.
      attention_resolutions: [4, 2, 1]
      num_head_channels: 64  # need to fix for flash-attn
      use_spatial_transformer: true
      use_linear_in_transformer: true
      transformer_depth: 1
      context_dim: 1024
      # Remaining switches.
      legacy: false
      use_checkpoint: true

  # Settings for the `ldm.models.autoencoder.AutoencoderKL` target.
  first_stage_config:
    target: ldm.models.autoencoder.AutoencoderKL
    params:
      embed_dim: 4
      monitor: val/rec_loss
      # Architecture settings nested under `ddconfig`.
      ddconfig:
        # Disabled option, kept for reference:
        # attn_type: "vanilla-xformers"
        double_z: true
        z_channels: 4
        resolution: 256
        in_channels: 3
        out_ch: 3
        ch: 128
        ch_mult: [1, 2, 4, 4]
        num_res_blocks: 2
        attn_resolutions: []
        dropout: 0.0
      # NOTE(review): an identity module as the loss target presumably means
      # no autoencoder loss is used with this config — confirm.
      lossconfig:
        target: torch.nn.Identity

  # Settings for the `FrozenOpenCLIPEmbedder` target.
  cond_stage_config:
    target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
    params:
      freeze: true
      layer: 'penultimate'

  # Settings for the `model.swinir.SwinIR` target.
  preprocess_config:
    target: model.swinir.SwinIR
    params:
      # Input geometry.
      img_size: 64
      patch_size: 1
      in_chans: 3
      img_range: 1.0
      sf: 8
      # Eight stages, each with the same depth and head count.
      embed_dim: 180
      depths: [6, 6, 6, 6, 6, 6, 6, 6]
      num_heads: [6, 6, 6, 6, 6, 6, 6, 6]
      window_size: 8
      mlp_ratio: 2
      # Up/down-sampling and residual-connection variants.
      upsampler: 'nearest+conv'
      resi_connection: '1conv'
      unshuffle: true
      unshuffle_scale: 8