ryanzhangfan committed on Dec 20, 2023

Commit

94953a3

1 Parent(s): 748b205

Upload 30 files

Browse files

Files changed (30) hide show

feature_extractor/preprocessor_config.json +28 -0
model_index.json +32 -0
multimodal_encoder/config.json +51 -0
multimodal_encoder/configuration_emu.py +77 -0
multimodal_encoder/constants.py +47 -0
multimodal_encoder/model.bf16-00001-of-00008.safetensors +3 -0
multimodal_encoder/model.bf16-00002-of-00008.safetensors +3 -0
multimodal_encoder/model.bf16-00003-of-00008.safetensors +3 -0
multimodal_encoder/model.bf16-00004-of-00008.safetensors +3 -0
multimodal_encoder/model.bf16-00005-of-00008.safetensors +3 -0
multimodal_encoder/model.bf16-00006-of-00008.safetensors +3 -0
multimodal_encoder/model.bf16-00007-of-00008.safetensors +3 -0
multimodal_encoder/model.bf16-00008-of-00008.safetensors +3 -0
multimodal_encoder/model.safetensors.index.bf16.json +0 -0
multimodal_encoder/modeling_emu.py +185 -0
multimodal_encoder/modeling_llama.py +1011 -0
multimodal_encoder/visual.py +452 -0
pipeline_emu2_gen.py +234 -0
safety_checker/config.json +168 -0
safety_checker/model.bf16.safetensors +3 -0
scheduler/scheduler_config.json +18 -0
tokenizer/added_tokens.json +274 -0
tokenizer/special_tokens_map.json +285 -0
tokenizer/tokenizer.json +0 -0
tokenizer/tokenizer.model +3 -0
tokenizer/tokenizer_config.json +34 -0
unet/config.json +72 -0
unet/diffusion_pytorch_model.bf16.safetensors +3 -0
vae/config.json +32 -0
vae/diffusion_pytorch_model.bf16.safetensors +3 -0

feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

model_index.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_class_name": "EmuVisualGenerationPipeline",
+  "_diffusers_version": "0.21.2",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "multimodal_encoder": [
+    "transformers_modules.modeling_emu",
+    "EmuForCausalLM"
+  ],
+  "safety_checker": [
+    "stable_diffusion",
+    "StableDiffusionSafetyChecker"
+  ],
+  "scheduler": [
+    "diffusers",
+    "EulerDiscreteScheduler"
+  ],
+  "tokenizer": [
+    "transformers",
+    "LlamaTokenizerFast"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

multimodal_encoder/config.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "_name_or_path": "/share/project/quansun/release_hf/Emu2-VisualGeneration/multimodal_encoder/",
+  "architectures": [
+    "EmuForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_emu.EmuConfig",
+    "AutoModelForCausalLM": "modeling_emu.EmuForCausalLM"
+  },
+  "bos_token_id": 1,
+  "d_model": 1792,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 6656,
+  "initializer_range": 0.02,
+  "intermediate_size": 17920,
+  "max_position_embeddings": 2048,
+  "model_version": "base",
+  "num_attention_heads": 52,
+  "num_hidden_layers": 60,
+  "num_key_value_heads": 52,
+  "pad_token_id": 32000,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.31.0",
+  "use_cache": true,
+  "vision_config": {
+    "drop_path_rate": 0,
+    "eva_model_name": "eva-clip-E-14-plus",
+    "head_width": 112,
+    "image_size": 448,
+    "intermediate_size": 15360,
+    "layer_norm_eps": 1e-06,
+    "layers": 64,
+    "mlp_ratio": 8.571428571428571,
+    "n_query": 64,
+    "patch_size": 14,
+    "postnorm": true,
+    "qkv_bias": true,
+    "v_query": 64,
+    "width": 1792,
+    "xattn": true
+  },
+  "vocab_size": 32272
+}

multimodal_encoder/configuration_emu.py ADDED Viewed

	@@ -0,0 +1,77 @@

+from typing import Literal
+from transformers import PretrainedConfig
+class EmuConfig(PretrainedConfig):
+    """
+    Configuration holding the LLaMA-style decoder hyper-parameters of the Emu model.
+    Only the language-model fields are declared explicitly. Any other key that is
+    passed in (the shipped config.json also carries `d_model` and `vision_config`,
+    both of which the modeling code reads) is handed to `PretrainedConfig` through
+    `**kwargs`.
+    Args:
+        vocab_size: Size of the token vocabulary.
+        hidden_size: Width of the decoder hidden states.
+        intermediate_size: Width of the MLP inner projection.
+        num_hidden_layers: Number of decoder layers.
+        num_attention_heads: Number of attention heads per layer.
+        hidden_act: Name of the MLP activation function.
+        max_position_embeddings: Sequence length the rotary cache is built for.
+        initializer_range: Standard deviation used when initialising weights.
+        rms_norm_eps: Epsilon added inside the RMS normalisation.
+        model_version: Either "base" or "chat".
+        pad_token_id / bos_token_id / eos_token_id: Special token ids, passed to the base class.
+        tie_word_embeddings: Passed to the base class.
+        use_cache: Whether key/value states are returned for reuse.
+        pretraining_tp: Tensor-parallel degree; values > 1 make the layers compute in slices.
+        rope_theta: Stored on the config; not consumed by the code visible in this file.
+        rope_scaling: `None` or a dict with exactly the keys `type` and `factor`.
+        attention_bias / attention_dropout: Stored on the config.
+        num_key_value_heads: Number of key/value heads. Defaults to
+            `num_attention_heads` when omitted, so that the attention layer can
+            always read the attribute.
+    Raises:
+        ValueError: If `rope_scaling` is malformed (see `_rope_scaling_validation`).
+    """
+    _auto_class = "AutoConfig"
+    def __init__(
+            self,
+            vocab_size=32000,
+            hidden_size=4096,
+            intermediate_size=11008,
+            num_hidden_layers=32,
+            num_attention_heads=32,
+            hidden_act='silu',
+            max_position_embeddings=2048,
+            initializer_range=0.02,
+            rms_norm_eps=1e-06,
+            model_version: Literal["base", "chat"] = "base",
+            pad_token_id=0,
+            bos_token_id=1,
+            eos_token_id=2,
+            tie_word_embeddings=False,
+            use_cache=True,
+            pretraining_tp=1,
+            rope_theta=10000.0,
+            rope_scaling=None,
+            attention_bias=False,
+            attention_dropout=0.0,
+            num_key_value_heads=None,
+            **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        # The attention layer reads `config.num_key_value_heads` unconditionally;
+        # fall back to plain multi-head attention when the caller did not set it.
+        self.num_key_value_heads = (
+            num_attention_heads if num_key_value_heads is None else num_key_value_heads
+        )
+        self.max_position_embeddings = max_position_embeddings
+        self.rms_norm_eps = rms_norm_eps
+        self.initializer_range = initializer_range
+        self.vocab_size = vocab_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_act = hidden_act
+        self.model_version = model_version
+        self.use_cache = use_cache
+        self.pretraining_tp = pretraining_tp
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate before calling the base constructor so that a bad value fails early.
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        Accepts `None`, or a two-entry dict whose `type` is "linear" or "dynamic"
+        and whose `factor` is a float strictly greater than 1.
+        Raises:
+            ValueError: If any of the above conditions is violated.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        # NOTE: an integer factor (e.g. 2) is rejected on purpose-preserving grounds; only floats pass.
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")

multimodal_encoder/constants.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Image preprocessing constants.
+# 448 equals `vision_config.image_size` in the shipped multimodal_encoder/config.json.
+EVA_IMAGE_SIZE = 448
+# Per-channel mean / std; identical to `image_mean` / `image_std` in
+# feature_extractor/preprocessor_config.json.
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+# File suffix lists; not referenced by the code visible in this commit view.
+DEFAULT_IMAGE_FILE_SUFFIX = ['jpg', '0.png', 'png', 'jpeg', 'webp']
+DEFAULT_TEXT_FILE_SUFFIX = ['txt', '0.txt']
+# NOTE(review): presumably the label value excluded from the loss - confirm against the training code.
+IGNORE_INDEX = -100
+# special tokens
+# START
+DEFAULT_PAD_TOKEN = "[PAD]"
+DEFAULT_BOS_TOKEN = '<s>'
+DEFAULT_EOS_TOKEN = '</s>'
+DEFAULT_UNK_TOKEN = "<unk>"
+# modeling_emu builds its image placeholder as
+# DEFAULT_IMG_TOKEN + DEFAULT_IMAGE_TOKEN * n_query + DEFAULT_IMG_END_TOKEN.
+DEFAULT_IMG_TOKEN = "[IMG]"
+DEFAULT_IMG_END_TOKEN = "[/IMG]"
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_gIMG_TOKEN = "[gIMG]"
+DEFAULT_gIMG_END_TOKEN = "[/gIMG]"
+DEFAULT_EOC_TOKEN = "[EOC]"
+DEFAULT_VIDEO_TOKEN = "[VIDEO]"
+GRD_SYMBOL = "<grounding>"
+BOP_SYMBOL = "<phrase>"
+EOP_SYMBOL = "</phrase>"
+BOO_SYMBOL = "<object>"
+EOO_SYMBOL = "</object>"
+DOM_SYMBOL = "</delimiter_of_multi_objects/>"
+REC_SYMBOL = "<REC>"
+USER_TOKEN = "[USER]"
+ASSISTANT_TOKEN = "[ASSISTANT]"
+# END
+# special token id
+# START
+# NOTE(review): `EmuForCausalLM.generate_image` does not use these hard-coded ids; it
+# looks the ids of DEFAULT_IMAGE_TOKEN / DEFAULT_IMG_TOKEN up from the tokenizer.
+IMAGE = 32003
+BOI = 32001
+VIDEO = 32004
+# END
+# Text marker that `generate_image` replaces with the full image placeholder string.
+DEFAULT_IMG_PLACEHOLDER = "[<IMG_PLH>]"
+DEFAULT_VID_PLACEHOLDER = "[<VID_PLH>]"
+FAKE_VIDEO_END_TOKEN = "[/VIDEO]"

multimodal_encoder/model.bf16-00001-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:849f23e3d375518a179cb7887cb8861f088e185e7619e518a38ec2a069417f87
+size 9961629600

multimodal_encoder/model.bf16-00002-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae62cc224559ee79ccc91687e3457310f3797f7517df944d02af637cad666cf4
+size 9958082896

multimodal_encoder/model.bf16-00003-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3690630dfd3ad092a527fbd5a00bc3881c6e1ff4cedf8c46001eec8a47c1e9f3
+size 9896714920

multimodal_encoder/model.bf16-00004-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9b92e277b4a31bf1daaea769b8702f32ea0cf61657f1d0f64305fe0b8ed266a
+size 9869451296

multimodal_encoder/model.bf16-00005-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:adba114c2f977df27e344297798cce0fae6537891339e3aa030764d892004aa1
+size 9869451296

multimodal_encoder/model.bf16-00006-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed94b7b7fdfe014355af7b0eb99be16bf5b0e0d384cd07c358bbc078fb1d2c22
+size 9958082992

multimodal_encoder/model.bf16-00007-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f89cfc60475e3454e315fa73fb4afc263e89d87a2c93377411156c5462346590
+size 9896714920

multimodal_encoder/model.bf16-00008-of-00008.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:332b756156697afb8614b55baee954df24db366d80603e6dc83e6d3b1d5e0e4d
+size 4403309264

multimodal_encoder/model.safetensors.index.bf16.json ADDED Viewed

The diff for this file is too large to render. See raw diff

multimodal_encoder/modeling_emu.py ADDED Viewed

	@@ -0,0 +1,185 @@

+from functools import partial
+from typing import List, Optional
+from argparse import Namespace
+import torch
+from torch import nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PreTrainedTokenizer
+from .configuration_emu import EmuConfig
+from .constants import *
+from .modeling_llama import LlamaForCausalLM
+from .visual import EVAVisionTransformer
+class EmuPreTrainedModel(PreTrainedModel):
+    """Shared base class: binds `EmuConfig` and defines the weight initialisation."""
+    config_class = EmuConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["LlamaDecoderLayer", "Block"]
+    _skip_keys_device_placement = "past_key_values"
+    def _init_weights(self, module):
+        """
+        Initialise `module` in place.
+        Linear and embedding weights are drawn from N(0, initializer_range);
+        linear biases and the embedding padding row (when present) are zeroed.
+        Other module types are left untouched.
+        """
+        init_std = self.config.initializer_range
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+        module.weight.data.normal_(mean=0.0, std=init_std)
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+        # Embedding: keep the padding vector at exactly zero.
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+class EmuForClsAndRegression(EmuPreTrainedModel):
+    """Thin wrapper that owns the LLaMA causal language model as `self.lm`."""
+    def __init__(self, config):
+        super().__init__(config)
+        language_model = LlamaForCausalLM(config=config)
+        # Point the token embedding's padding index at the configured pad token.
+        language_model.model.embed_tokens.padding_idx = config.pad_token_id
+        self.lm = language_model
+    def get_num_layers(self):
+        """Return the number of decoder layers in the wrapped language model."""
+        decoder_layers = self.lm.model.layers
+        return len(decoder_layers)
+class EmuModel(EmuPreTrainedModel):
+    """Combines the EVA vision transformer (`visual`) with the language decoder (`decoder`)."""
+    def __init__(self, config):
+        super().__init__(config)
+        # `config.vision_config` is a plain dict; wrap it for attribute access.
+        vcfg = Namespace(**config.vision_config)
+        visual_kwargs = dict(
+            img_size=vcfg.image_size,
+            patch_size=vcfg.patch_size,
+            embed_dim=vcfg.width,
+            depth=vcfg.layers,
+            num_heads=vcfg.width // vcfg.head_width,
+            mlp_ratio=vcfg.mlp_ratio,
+            qkv_bias=vcfg.qkv_bias,
+            drop_path_rate=vcfg.drop_path_rate,
+            norm_layer=partial(nn.LayerNorm, eps=vcfg.layer_norm_eps),
+            xattn=vcfg.xattn,
+            postnorm=vcfg.postnorm,
+        )
+        self.visual = EVAVisionTransformer(**visual_kwargs)
+        self.decoder = EmuForClsAndRegression(config)
+        self.gradient_checkpointing = False
+        self.n_query = vcfg.n_query
+        self.v_query = vcfg.v_query
+    @property
+    def device(self):
+        """Device of the first parameter of this module."""
+        first_param = next(self.parameters())
+        return first_param.device
+    @property
+    def dtype(self):
+        """Dtype of the first parameter of this module."""
+        first_param = next(self.parameters())
+        return first_param.dtype
+    @torch.no_grad()
+    def encode_image(self, image: torch.Tensor, *, n_query=None):
+        """
+        Encode `image` with the vision tower and average-pool the patch grid.
+        The token at index 0 of the vision output is dropped, the remaining tokens
+        are laid out as a square grid and pooled with a square window whose side is
+        `grid_side // sqrt(n_query)`.
+        Args:
+            image: Input passed unchanged to `self.visual`.
+            n_query: Target number of pooled tokens; defaults to `self.n_query`.
+        Returns:
+            Tensor of shape (batch, pooled_tokens, channels).
+        """
+        if n_query is None:
+            n_query = self.n_query
+        patch_tokens = self.visual(image)[:, 1:, :]
+        batch, num_patches, channels = patch_tokens.shape
+        grid_side = int(num_patches**0.5)
+        grid = patch_tokens.permute(0, 2, 1).view(batch, channels, grid_side, grid_side)
+        window = int(grid_side // (n_query ** 0.5))
+        pooled = F.avg_pool2d(grid, kernel_size=(window, window), stride=window)
+        # Back to token-major layout: (batch, tokens, channels).
+        return pooled.view(batch, channels, -1).permute(0, 2, 1).contiguous()
+class EmuForCausalLM(EmuPreTrainedModel):
+    """
+    Emu model plus the two linear bridges between the language-model width
+    (`config.hidden_size`) and the visual width (`config.d_model`).
+    """
+    _auto_class = "AutoModelForCausalLM"
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.model = EmuModel(config)
+        # LM to EVA
+        self.project_down = nn.Linear(config.hidden_size, config.d_model, bias=False)
+        # EVA to LM
+        self.project_up = nn.Linear(config.d_model, config.hidden_size, bias=False)
+        self.n_query = self.model.n_query
+        # Text stand-in for one prompt image: [IMG] + n_query * <image> + [/IMG].
+        self.image_placeholder = DEFAULT_IMG_TOKEN + DEFAULT_IMAGE_TOKEN * self.n_query + DEFAULT_IMG_END_TOKEN
+    # NOTE(review): `device` and `dtype` are methods here, not properties, so they
+    # must be called - confirm this does not clash with property access by callers.
+    def device(self, module=None):
+        """Return the device of the first parameter of `module` (or of `self` when `module` is None)."""
+        if module is None:
+            return next(self.parameters()).device
+        return next(module.parameters()).device
+    def dtype(self, module=None):
+        """Return the dtype of the first parameter of `module` (or of `self` when `module` is None)."""
+        if module is None:
+            return next(self.parameters()).dtype
+        return next(module.parameters()).dtype
+    @torch.no_grad()
+    def generate_image(
+        self,
+        text: List[str],
+        tokenizer: PreTrainedTokenizer,
+        image: Optional[torch.Tensor] = None,
+        placeholder: str = DEFAULT_IMG_PLACEHOLDER,
+    ):
+        """
+        Autoregressively produce `n_query` visual embeddings for each prompt.
+        Every step appends one more image token to each prompt ([IMG] first, then
+        <image>), re-tokenises, runs the decoder over the whole sequence and reads
+        the last-layer hidden states at the trailing image positions.
+        Args:
+            text: Prompts; each occurrence of `placeholder` is replaced by the image placeholder string.
+            tokenizer: Tokenizer used for id lookup and for encoding the prompts.
+            image: Optional prompt images, encoded once and spliced in at the prompt <image> positions.
+            placeholder: Marker inside `text` that stands for a prompt image.
+        Returns:
+            Tensor of shape (batch, -1, C) where C is the output width of `project_down`.
+        """
+        # Ids come from the tokenizer; local names avoid shadowing the module-level IMAGE / BOI constants.
+        image_token_id, boi_token_id = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_TOKEN, DEFAULT_IMG_TOKEN])
+        if image is not None:
+            prompt_image_embeds = self.model.encode_image(image)
+            _, _, c = prompt_image_embeds.shape
+            prompt_image_embeds = prompt_image_embeds.view(-1, c)
+            prompt_image_embeds = self.project_up(prompt_image_embeds)
+        text = [t.replace(placeholder, self.image_placeholder) for t in text]
+        # Loop-invariant: the embedding layer and the device its weights live on.
+        embed_tokens = self.model.decoder.lm.model.embed_tokens
+        device = self.device(embed_tokens)
+        target_image_embeds = None
+        for num_img_token in range(self.n_query):
+            if num_img_token == 0:
+                text = [f"{t}{DEFAULT_IMG_TOKEN}" for t in text]
+            else:
+                text = [f"{t}{DEFAULT_IMAGE_TOKEN}" for t in text]
+            inputs = tokenizer(text, padding="longest", return_tensors="pt")
+            attention_mask = inputs.attention_mask.to(device)
+            input_ids = inputs.input_ids.to(device) # B x N
+            text_embeds = embed_tokens(input_ids)
+            image_idx = (input_ids == image_token_id)
+            # cumsum_idx[b, i] = number of <image> positions at or after position i.
+            cumsum_idx = torch.flip(torch.cumsum(torch.flip(image_idx, dims=[1]), dim=1), dims=[1])
+            if image is not None:
+                # <image> positions that are not among the last `num_img_token` ones belong to the prompt.
+                prompt_idx = torch.logical_and(image_idx, cumsum_idx > num_img_token)
+                text_embeds[prompt_idx] = prompt_image_embeds.to(text_embeds.device)
+            if target_image_embeds is not None:
+                # The last `num_img_token` <image> positions receive the embeddings generated so far.
+                target_idx = torch.logical_and(image_idx, torch.logical_and(cumsum_idx > 0, cumsum_idx <= num_img_token))
+                text_embeds[target_idx] = self.project_up(target_image_embeds).to(text_embeds.device)
+            outputs = self.model.decoder.lm.model(
+                inputs_embeds=text_embeds,
+                attention_mask=attention_mask,
+                output_hidden_states=True,
+                return_dict=True,
+            )
+            # Now count [IMG] as well: the trailing `num_img_token + 1` positions are read out.
+            image_idx = (input_ids == image_token_id) | (input_ids == boi_token_id)
+            cumsum_idx = torch.flip(torch.cumsum(torch.flip(image_idx, dims=[1]), dim=1), dims=[1])
+            target_idx = torch.logical_and(image_idx, torch.logical_and(cumsum_idx > 0, cumsum_idx <= num_img_token+1))
+            hidden_states = outputs.hidden_states[-1]
+            target_image_embeds = hidden_states[target_idx.to(hidden_states.device)]
+            target_image_embeds = target_image_embeds.view(-1, target_image_embeds.shape[-1])
+            target_image_embeds = self.project_down(target_image_embeds)
+        _, C = target_image_embeds.shape
+        B = hidden_states.shape[0]
+        target_image_embeds = target_image_embeds.view(B, -1, C)
+        return target_image_embeds

multimodal_encoder/modeling_llama.py ADDED Viewed

	@@ -0,0 +1,1011 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch LLaMA model."""
+import math
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers import PreTrainedModel
+from transformers import LlamaConfig
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "LlamaConfig"
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Build the additive causal attention mask.
+    Entry (i, j) is 0 when query position i may attend to key position j (j <= i)
+    and `finfo(dtype).min` otherwise. When `past_key_values_length > 0`, that many
+    all-zero columns are prepended for the cached keys.
+    Returns a view of shape (bsz, 1, tgt_len, tgt_len + past_key_values_length).
+    """
+    batch_size, target_len = input_ids_shape
+    causal = torch.full((target_len, target_len), torch.finfo(dtype).min, device=device)
+    positions = torch.arange(causal.size(-1), device=device)
+    # Key j is visible to query i exactly when j <= i.
+    visible = positions.view(1, -1) <= positions.view(-1, 1)
+    causal.masked_fill_(visible, 0)
+    causal = causal.to(dtype)
+    if past_key_values_length > 0:
+        cached = torch.zeros(target_len, past_key_values_length, dtype=dtype, device=device)
+        causal = torch.cat([cached, causal], dim=-1)
+    total_len = target_len + past_key_values_length
+    return causal[None, None, :, :].expand(batch_size, 1, target_len, total_len)
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Turn a `[bsz, seq_len]` attention mask into an additive `[bsz, 1, tgt_seq_len, src_seq_len]` mask.
+    Positions where `mask` is 1 become 0; positions where it is 0 become `finfo(dtype).min`.
+    `tgt_len` defaults to the source length.
+    """
+    batch_size, source_len = mask.size()
+    if tgt_len is None:
+        tgt_len = source_len
+    keep = mask[:, None, None, :].expand(batch_size, 1, tgt_len, source_len).to(dtype)
+    blocked = 1.0 - keep
+    return blocked.masked_fill(blocked.to(torch.bool), torch.finfo(dtype).min)
+class LlamaRMSNorm(nn.Module):
+    """Root-mean-square layer normalisation with a learned per-channel scale."""
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        LlamaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        """Normalise over the last dimension in float32, cast back, then scale by `weight`."""
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalised.to(orig_dtype)
+class LlamaRotaryEmbedding(torch.nn.Module):
+    """
+    Rotary position embedding tables.
+    Holds the inverse frequencies (`inv_freq`) and cached cos/sin tables of shape
+    (1, 1, seq_len, dim) that are rebuilt whenever a longer sequence is requested.
+    """
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq)
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """(Re)build the cos/sin tables for positions 0 .. seq_len - 1."""
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+    def forward(self, x, seq_len=None):
+        """
+        Return the `(cos, sin)` tables for the first `seq_len` positions, cast to `x.dtype`.
+        Args:
+            x: Tensor laid out as [bs, num_attention_heads, seq_len, head_size]; only its
+                device, dtype and (when `seq_len` is omitted) second-to-last dimension are used.
+            seq_len: Number of positions needed. Defaults to `x.shape[-2]`.
+        """
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len is None:
+            # The declared default used to crash on the comparison below; infer it from `x`.
+            seq_len = x.shape[-2]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
+    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        # Must be set before the base constructor runs, because that builds the cache.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """Rebuild the cos/sin tables with every position index divided by `scaling_factor`."""
+        self.max_seq_len_cached = seq_len
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        scaled_positions = positions / self.scaling_factor
+        angles = torch.einsum("i,j->ij", scaled_positions, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        table = torch.cat((angles, angles), dim=-1)
+        cos_table = table.cos()[None, None, :, :].to(dtype)
+        sin_table = table.sin()[None, None, :, :].to(dtype)
+        self.register_buffer("cos_cached", cos_table, persistent=False)
+        self.register_buffer("sin_cached", sin_table, persistent=False)
+class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
+    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        # Must be set before the base constructor runs, because that builds the cache.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """
+        Rebuild the cos/sin tables; beyond `max_position_embeddings` the rotary base
+        is enlarged and `inv_freq` is re-registered before the tables are computed.
+        """
+        self.max_seq_len_cached = seq_len
+        if seq_len > self.max_position_embeddings:
+            growth = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            scaled_base = self.base * growth ** (self.dim / (self.dim - 2))
+            exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
+            self.register_buffer("inv_freq", 1.0 / (scaled_base ** exponents))
+        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        angles = torch.einsum("i,j->ij", positions, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        table = torch.cat((angles, angles), dim=-1)
+        cos_table = table.cos()[None, None, :, :].to(dtype)
+        sin_table = table.sin()[None, None, :, :].to(dtype)
+        self.register_buffer("cos_cached", cos_table, persistent=False)
+        self.register_buffer("sin_cached", sin_table, persistent=False)
+def rotate_half(x):
+    """Split the last dimension at its midpoint into (a, b) and return the concatenation (-b, a)."""
+    midpoint = x.shape[-1] // 2
+    front = x[..., :midpoint]
+    back = x[..., midpoint:]
+    return torch.cat((-back, front), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    """
+    Apply the rotary embedding to the query and key tensors.
+    `cos` / `sin` are the cached tables with two leading singleton dimensions;
+    `position_ids` selects one table row per token. Returns `(q_embed, k_embed)`.
+    """
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+    cos_table = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin_table = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    # Gather per-token rows, then add a broadcast axis for the heads.
+    cos_sel = cos_table[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin_sel = sin_table[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    rotated = []
+    for states in (q, k):
+        rotated.append((states * cos_sel) + (rotate_half(states) * sin_sel))
+    q_embed, k_embed = rotated
+    return q_embed, k_embed
+class LlamaMLP(nn.Module):
+    """Gated feed-forward block: down_proj(act_fn(gate_proj(x)) * up_proj(x))."""
+    def __init__(self, config):
+        super().__init__()
+        self.pretraining_tp = config.pretraining_tp
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        """
+        Run the gated MLP. With `pretraining_tp > 1` the three projections are
+        evaluated slice by slice and the partial results are recombined.
+        """
+        if self.pretraining_tp <= 1:
+            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        tp = self.pretraining_tp
+        chunk = self.intermediate_size // tp
+        # Only the first `tp` pieces of each split are used.
+        gate_weights = self.gate_proj.weight.split(chunk, dim=0)[:tp]
+        up_weights = self.up_proj.weight.split(chunk, dim=0)[:tp]
+        down_weights = self.down_proj.weight.split(chunk, dim=1)[:tp]
+        gate_out = torch.cat([F.linear(x, w) for w in gate_weights], dim=-1)
+        up_out = torch.cat([F.linear(x, w) for w in up_weights], dim=-1)
+        hidden_pieces = (self.act_fn(gate_out) * up_out).split(chunk, dim=2)[:tp]
+        partial_outputs = [F.linear(h, w) for h, w in zip(hidden_pieces, down_weights)]
+        return sum(partial_outputs)
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head axis, matching
+    torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch, num_key_value_heads,
+    seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    The input itself is returned when `n_rep == 1`.
+    """
+    batch, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the heads.
+    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
+    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
+class LlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.pretraining_tp = config.pretraining_tp
+        self.max_position_embeddings = config.max_position_embeddings
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self._init_rope()
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Compute self-attention over `hidden_states`, optionally extending a cached key/value prefix.
+        Args:
+            hidden_states: input tensor; its size is unpacked as `(bsz, q_len, _)`.
+            attention_mask: optional mask that is added to the attention scores; it must have size
+                `(bsz, 1, q_len, kv_seq_len)`.
+            position_ids: passed through to `apply_rotary_pos_emb` together with the rotary cos/sin tensors.
+            past_key_value: optional `(key, value)` pair that is concatenated in front of the new keys/values
+                along dim 2.
+            output_attentions: when falsy, the returned attention weights are replaced by `None`.
+            use_cache: when truthy, the (concatenated) `(key_states, value_states)` pair is returned.
+        Returns:
+            `(attn_output, attn_weights, past_key_value)`.
+        Raises:
+            ValueError: if the attention scores, the attention mask or the attention output have an unexpected size.
+        """
+        bsz, q_len, _ = hidden_states.size()
+        if self.pretraining_tp > 1:
+            # Apply each projection as `pretraining_tp` row-slices of its weight and concatenate the partial outputs
+            # (presumably to mirror the tensor-parallel layout used in pretraining; TODO confirm).
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp
+            query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0)
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        # (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # total key length = new positions + cached positions
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        # rotary embedding is applied to the new queries/keys only, before they are joined with the cache
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        # the cache keeps keys/values before the `repeat_kv` expansion below
+        past_key_value = (key_states, value_states) if use_cache else None
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        # dot-product scores scaled by sqrt(head_dim)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        # (bsz, heads, q_len, head_dim) -> (bsz, q_len, hidden_size)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        if self.pretraining_tp > 1:
+            # output projection as a sum over column-slices of `o_proj.weight`
+            attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaDecoderLayer(nn.Module):
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = LlamaAttention(config=config)
+        self.mlp = LlamaMLP(config)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+# Shared class-level documentation text; it is passed to `add_start_docstrings` on the model classes in this file.
+LLAMA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`LlamaConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class LlamaPreTrainedModel(PreTrainedModel):
+    config_class = LlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, LlamaModel):
+            module.gradient_checkpointing = value
+# Shared argument documentation; it is attached to the `forward` methods in this file through
+# `add_start_docstrings_to_model_forward`.
+# NOTE(review): parts of the text (references to `decoder_input_ids`, head-mask bullets under `attention_mask`,
+# cross-attention tensors under `past_key_values`) look inherited from an encoder-decoder template and may not
+# describe this model - confirm before relying on them.
+LLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class LlamaModel(LlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+    Args:
+        config: LlamaConfig
+    """
+    def __init__(self, config: LlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        # token embedding -> stack of decoder layers -> final RMSNorm
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        """
+        Build the mask handed to the decoder layers by summing the causal mask (only created when the query length
+        `input_shape[-1]` is greater than 1) and the expanded `attention_mask` (only when one is given). Returns
+        `None` when neither part is created.
+        """
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # flags left as None fall back to the values stored on the config
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            # cached length is read from dim 2 of the first layer's cached key tensor
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if position_ids is None:
+            # default positions continue right after the cached prefix
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # default to an all-ones mask over the cached and current positions when no mask is given
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        for idx, decoder_layer in enumerate(self.layers):
+            # record the input of each layer; the final normalised output is appended after the loop
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # `inputs` already ends with None for `past_key_value`; the trailing None here lands in the
+                        # layer's `use_cache` slot
+                        return module(*inputs, output_attentions, None)
+                    return custom_forward
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                # the cache entry sits after the attention weights when those are returned
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            # tuple form drops every entry that is None
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class LlamaForCausalLM(LlamaPreTrainedModel):
+    """`LlamaModel` followed by a bias-free linear `lm_head` that maps hidden states to `config.vocab_size` logits."""
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModel(config)
+        self.pretraining_tp = config.pretraining_tp
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # flags left as None fall back to the values stored on the config
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.pretraining_tp > 1:
+            # project with `pretraining_tp` row-slices of the head weight and concatenate the partial logits
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head(hidden_states)
+        # logits are always returned in float32
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            # tuple form: optional loss, then logits, then whatever the base model returned after its first entry
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        """
+        Assemble the keyword arguments for one forward call during generation.
+        Returns a dict with either `inputs_embeds` or `input_ids`, plus `position_ids`, `past_key_values`,
+        `use_cache` (taken from `kwargs`) and `attention_mask`.
+        """
+        # once a cache exists only the last token is fed
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            # masked-out positions get the placeholder position id 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Return a new cache tuple in which every cached tensor is re-indexed along dim 0 by `beam_idx`."""
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+@add_start_docstrings(
+    """
+    The LLaMa Model transformer with a sequence classification head on top (linear layer).
+    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    LLAMA_START_DOCSTRING,
+)
+class LlamaForSequenceClassification(LlamaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = LlamaModel(config)
+        # bias-free classification head applied to every position; one position per row is selected in `forward`
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            # no pad token: pool the last position
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                # (number of non-pad tokens in each row) - 1
+                # NOTE(review): this equals the index of the last real token only if padding is on the right and the
+                # pad id does not occur inside the sequence - confirm callers right-pad.
+                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
+            else:
+                # padding cannot be detected from `inputs_embeds`: pool the last position
+                sequence_lengths = -1
+        # select one position per row
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                # the inferred problem type is written back to the config and reused by later calls
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            # tuple form: optional loss, then pooled logits, then the base model outputs after their first entry
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )

multimodal_encoder/visual.py ADDED Viewed

	@@ -0,0 +1,452 @@

+# --------------------------------------------------------
+# Adapted from  https://github.com/microsoft/unilm/tree/master/beit
+# --------------------------------------------------------
+import os
+from functools import partial
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+try:
+    from timm.models.layers import drop_path, to_2tuple
+except:
+    from timm.layers import drop_path, to_2tuple
+try:
+    import xformers.ops as xops
+except ImportError:
+    xops = None
+    print("Please 'pip install xformers'")
+class PatchDropout(nn.Module):
+    """
+    https://arxiv.org/abs/2212.00794
+    """
+    def __init__(self, prob, exclude_first_token=True):
+        super().__init__()
+        assert 0 <= prob < 1.
+        self.prob = prob
+        self.exclude_first_token = exclude_first_token  # exclude CLS token
+        print(f"os.getenv('RoPE')={os.getenv('RoPE')}")
+    def forward(self, x):
+        if not self.training or self.prob == 0.:
+            return x
+        if self.exclude_first_token:
+            cls_tokens, x = x[:, :1], x[:, 1:]
+        else:
+            cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+        batch = x.size()[0]
+        num_tokens = x.size()[1]
+        batch_indices = torch.arange(batch)
+        batch_indices = batch_indices[..., None]
+        keep_prob = 1 - self.prob
+        num_patches_keep = max(1, int(num_tokens * keep_prob))
+        rand = torch.randn(batch, num_tokens)
+        patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
+        x = x[batch_indices, patch_indices_keep]
+        if self.exclude_first_token:
+            x = torch.cat((cls_tokens, x), dim=1)
+        if self.training and os.getenv('RoPE') == '1':
+            return x, patch_indices_keep
+        return x
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)
+class Mlp(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        norm_layer=nn.LayerNorm,
+        drop=0.,
+        subln=False,
+        ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        # x = self.drop(x)
+        # commit this for the orignal BERT implement
+        x = self.ffn_ln(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class SwiGLU(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
+                norm_layer=nn.LayerNorm, subln=False):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.w1 = nn.Linear(in_features, hidden_features)
+        self.w2 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
+        self.w3 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x1 = self.w1(x)
+        x2 = self.w2(x)
+        hidden = self.act(x1) * x2
+        x = self.ffn_ln(hidden)
+        x = self.w3(x)
+        x = self.drop(x)
+        return x
+class Attention(nn.Module):
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+            proj_drop=0., window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        self.subln = subln
+        if self.subln:
+            self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
+            self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
+            self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
+        else:
+            if qkv_bias:
+                self.qkv = nn.Linear(dim, all_head_dim * 3, bias=True)
+            else:
+                self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        # if qkv_bias:
+        #     self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+        #     self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        #     qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+        #     self.qkv.bias.data = qkv_bias
+        # else:
+        #     self.q_bias = None
+        #     self.v_bias = None
+        self.window_size = None
+        self.relative_position_bias_table = None
+        self.relative_position_index = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
+        # self.proj = nn.Linear(all_head_dim, all_head_dim)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.xattn = xattn
+        self.xattn_drop = attn_drop
+        self.rope = rope
+    def forward(self, x, rel_pos_bias=None, attn_mask=None):
+        B, N, C = x.shape
+        if self.subln:
+            q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
+            k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
+            v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
+            q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)     # B, num_heads, N, C
+            k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
+            v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
+        else:
+            # qkv_bias = None
+            # if self.q_bias is not None:
+            #     qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+            # qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+            qkv = self.qkv(x)
+            qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)   # 3, B, num_heads, N, C
+            q, k, v = qkv[0], qkv[1], qkv[2]
+        if self.rope:
+            q_t = q[:, :, 1:, :]
+            ro_q_t = self.rope(q_t)
+            q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v)
+            k_t = k[:, :, 1:, :]
+            ro_k_t = self.rope(k_t)
+            k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v)
+        if self.xattn:
+            q = q.permute(0, 2, 1, 3)   # B, num_heads, N, C -> B, N, num_heads, C
+            k = k.permute(0, 2, 1, 3)
+            v = v.permute(0, 2, 1, 3)
+            x = xops.memory_efficient_attention(
+                q, k, v,
+                p=self.xattn_drop,
+                scale=self.scale,
+                )
+            x = x.reshape(B, N, -1)
+            x = self.inner_attn_ln(x)
+            x = self.proj(x)
+            x = self.proj_drop(x)
+        else:
+            q = q * self.scale
+            attn = (q @ k.transpose(-2, -1))
+            if self.relative_position_bias_table is not None:
+                relative_position_bias = \
+                    self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                        self.window_size[0] * self.window_size[1] + 1,
+                        self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+                attn = attn + relative_position_bias.unsqueeze(0).type_as(attn)
+            if rel_pos_bias is not None:
+                attn = attn + rel_pos_bias.type_as(attn)
+            if attn_mask is not None:
+                attn_mask = attn_mask.bool()
+                attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf"))
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+            x = self.inner_attn_ln(x)
+            x = self.proj(x)
+            x = self.proj_drop(x)
+        return x
+class Block(nn.Module):
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+                 window_size=None, attn_head_dim=None, xattn=False, rope=None, postnorm=False,
+                 subln=False, naiveswiglu=False):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim,
+            xattn=xattn, rope=rope, subln=subln, norm_layer=norm_layer)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        if naiveswiglu:
+            self.mlp = SwiGLU(
+                in_features=dim,
+                hidden_features=mlp_hidden_dim,
+                subln=subln,
+                norm_layer=norm_layer,
+            )
+        else:
+            self.mlp = Mlp(
+                in_features=dim,
+                hidden_features=mlp_hidden_dim,
+                act_layer=act_layer,
+                subln=subln,
+                drop=drop
+            )
+        if init_values is not None and init_values > 0:
+            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+        self.postnorm = postnorm
+    def forward(self, x, rel_pos_bias=None, attn_mask=None):
+        if self.gamma_1 is None:
+            if self.postnorm:
+                x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)))
+                x = x + self.drop_path(self.norm2(self.mlp(x)))
+            else:
+                x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
+                x = x + self.drop_path(self.mlp(self.norm2(x)))
+        else:
+            if self.postnorm:
+                x = x + self.drop_path(self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)))
+                x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
+            else:
+                x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
+                x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+        return x
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+        self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+    def forward(self, x, **kwargs):
+        B, C, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        assert H == self.img_size[0] and W == self.img_size[1], \
+            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+class EVAVisionTransformer(nn.Module):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, patch_dropout=0.,
+                 use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, rope=False,
+                 use_mean_pooling=True, init_scale=0.001, grad_checkpointing=False, xattn=False, postnorm=False,
+                 pt_hw_seq_len=16, intp_freq=False, naiveswiglu=False, subln=False,
+                 ):
+        super().__init__()
+        self.image_size = img_size
+        # self.num_classes = num_classes
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        if use_abs_pos_emb:
+            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+        else:
+            self.pos_embed = None
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        self.rel_pos_bias = None
+        self.rope = None
+        self.naiveswiglu = naiveswiglu
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
+                xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
+            for i in range(depth)])
+        # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
+        self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
+        self.grad_checkpointing = grad_checkpointing
+    def get_num_layers(self):
+        return len(self.blocks)
+    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
+        assert unlocked_groups == 0, 'partial locking not currently supported for this model'
+        for param in self.parameters():
+            param.requires_grad = False
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        batch_size, seq_len, _ = x.size()
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+        x = self.pos_drop(x)
+        # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
+        if os.getenv('RoPE') == '1':
+            if self.training and not isinstance(self.patch_dropout, nn.Identity):
+                x, patch_indices_keep = self.patch_dropout(x)
+                self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep)
+            else:
+                self.rope.forward = partial(self.rope.forward, patch_indices_keep=None)
+                x = self.patch_dropout(x)
+        else:
+            x = self.patch_dropout(x)
+        rel_pos_bias = None
+        for blk in self.blocks:
+            if self.grad_checkpointing:
+                x = checkpoint(blk, x, (rel_pos_bias,))
+            else:
+                x = blk(x, rel_pos_bias=rel_pos_bias)
+        return x
+    def forward(self, x):
+        """
+        :return:
+            forward_features function returns raw features of ViT,
+            forward with return_all_features returns normalized features of ViT
+        :param x:
+        :param return_all_features:
+        """
+        features = self.forward_features(x)  # [B, n_patch, C]
+        return features

pipeline_emu2_gen.py ADDED Viewed

	@@ -0,0 +1,234 @@

+# -*- coding: utf-8 -*-
+# ===========================================================================================
+#
+#    Copyright (c) Beijing Academy of Artificial Intelligence (BAAI). All rights reserved.
+#
+#    Author        : Fan Zhang
+#    Email         : [email protected]
+#    Institute     : Beijing Academy of Artificial Intelligence (BAAI)
+#    Create On     : 2023-12-19 10:45
+#    Last Modified : 2023-12-19 14:01
+#    File Name     : pipeline.py
+#    Description   :
+#
+# ===========================================================================================
+from dataclasses import dataclass
+from typing import List, Optional, Union
+from PIL import Image
+import numpy as np
+import torch
+from torchvision import transforms as TF
+from tqdm import tqdm
+from diffusers import DiffusionPipeline
+from diffusers.utils import BaseOutput
+from diffusers import UNet2DConditionModel, EulerDiscreteScheduler, AutoencoderKL
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from transformers import CLIPImageProcessor
+from transformers import AutoModelForCausalLM, AutoTokenizer
+EVA_IMAGE_SIZE = 448
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+DEFAULT_IMG_PLACEHOLDER = "[<IMG_PLH>]"
+# Result container returned by EmuVisualGenerationPipeline.__call__.
+@dataclass
+class EmuVisualGenerationPipelineOutput(BaseOutput):
+    # Generated images; the pipeline fills this with the PIL images produced by numpy_to_pil.
+    images: Union[List[Image.Image], np.ndarray]
+    # Value returned by the safety checker, or None when no safety checker is configured.
+    nsfw_content_detected: Optional[List[bool]]
+class EmuVisualGenerationPipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        multimodal_encoder: AutoModelForCausalLM,
+        scheduler: EulerDiscreteScheduler,
+        unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        feature_extractor: CLIPImageProcessor,
+        safety_checker: StableDiffusionSafetyChecker,
+        eva_size=EVA_IMAGE_SIZE,
+        eva_mean=OPENAI_DATASET_MEAN,
+        eva_std=OPENAI_DATASET_STD,
+    ):
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            multimodal_encoder=multimodal_encoder,
+            scheduler=scheduler,
+            unet=unet,
+            vae=vae,
+            feature_extractor=feature_extractor,
+            safety_checker=safety_checker,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.transform = TF.Compose([
+            TF.Resize((eva_size, eva_size), interpolation=TF.InterpolationMode.BICUBIC),
+            TF.ToTensor(),
+            TF.Normalize(mean=eva_mean, std=eva_std),
+        ])
+        self.negative_prompt = None
+    def device(self, module):
+        return next(module.parameters()).device
+    def dtype(self, module):
+        return next(module.parameters()).dtype
+    @torch.no_grad()
+    def __call__(
+        self,
+        inputs: List[Image.Image | str] | str | Image.Image,
+        height: int = 1024,
+        width: int = 1024,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 3.,
+        crop_info: List[int] = [0, 0],
+        original_size: List[int] = [1024, 1024],
+    ):
+        if not isinstance(inputs, list):
+            inputs = [inputs]
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        device = self.device(self.unet)
+        dtype = self.dtype(self.unet)
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 1. Encode input prompt
+        prompt_embeds = self._prepare_and_encode_inputs(
+            inputs,
+            do_classifier_free_guidance,
+        ).to(dtype).to(device)
+        batch_size = prompt_embeds.shape[0] // 2 if do_classifier_free_guidance else prompt_embeds.shape[0]
+        unet_added_conditions = {}
+        time_ids = torch.LongTensor(original_size + crop_info + [height, width]).to(device)
+        if do_classifier_free_guidance:
+            unet_added_conditions["time_ids"] = torch.cat([time_ids, time_ids], dim=0)
+        else:
+            unet_added_conditions["time_ids"] = time_ids
+        unet_added_conditions["text_embeds"] = torch.mean(prompt_embeds, dim=1)
+        # 2. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # 3. Prepare latent variables
+        shape = (
+            batch_size,
+            self.unet.config.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        latents = torch.randn(shape, device=device, dtype=dtype)
+        latents = latents * self.scheduler.init_noise_sigma
+        # 4. Denoising loop
+        for t in tqdm(timesteps):
+            # expand the latents if we are doing classifier free guidance
+            # 2B x 4 x H x W
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            noise_pred = self.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=prompt_embeds,
+                added_cond_kwargs=unet_added_conditions,
+            ).sample
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+        # 5. Post-processing
+        images = self.decode_latents(latents)
+        # 6. Run safety checker
+        images, has_nsfw_concept = self.run_safety_checker(images)
+        # 7. Convert to PIL
+        images = self.numpy_to_pil(images)
+        return EmuVisualGenerationPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+    def _prepare_and_encode_inputs(
+        self,
+        inputs: List[str | Image.Image],
+        do_classifier_free_guidance: bool = False,
+        placeholder: str = DEFAULT_IMG_PLACEHOLDER,
+    ):
+        device = self.device(self.multimodal_encoder.model.visual)
+        dtype = self.dtype(self.multimodal_encoder.model.visual)
+        text_prompt, image_prompt = "", []
+        for x in inputs:
+            if isinstance(x, str):
+                text_prompt += x
+            else:
+                text_prompt += placeholder
+                image_prompt.append(self.transform(x))
+        if len(image_prompt) == 0:
+            image_prompt = None
+        else:
+            image_prompt = torch.stack(image_prompt)
+            image_prompt = image_prompt.type(dtype).to(device)
+        prompt = self.multimodal_encoder.generate_image(text=[text_prompt], image=image_prompt, tokenizer=self.tokenizer)
+        if do_classifier_free_guidance:
+            if self.negative_prompt is None:
+                self.negative_prompt = self.multimodal_encoder.generate_image(text=[""], tokenizer=self.tokenizer)
+            prompt = torch.cat([prompt, self.negative_prompt], dim=0)
+        return prompt
+    def decode_latents(self, latents: torch.Tensor) -> np.ndarray:
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+    def numpy_to_pil(self, images: np.ndarray) -> List[Image.Image]:
+        """
+        Convert a numpy image or a batch of images to a PIL image.
+        """
+        if images.ndim == 3:
+            images = images[None, ...]
+        images = (images * 255).round().astype("uint8")
+        if images.shape[-1] == 1:
+            # special case for grayscale (single channel) images
+            pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+        else:
+            pil_images = [Image.fromarray(image) for image in images]
+        return pil_images
+    def run_safety_checker(self, images: np.ndarray):
+        if self.safety_checker is not None:
+            device = self.device(self.safety_checker)
+            dtype = self.dtype(self.safety_checker)
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(images), return_tensors="pt").to(device)
+            images, has_nsfw_concept = self.safety_checker(
+                images=images, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return images, has_nsfw_concept

safety_checker/config.json ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "_commit_hash": null,
+  "_name_or_path": "/share/project/quansun/release_hf/Emu2-VisualGeneration/safety_checker",
+  "architectures": [
+    "StableDiffusionSafetyChecker"
+  ],
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "clip",
+  "projection_dim": 768,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 49406,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 49407,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.31.0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.31.0",
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  }
+}

safety_checker/model.bf16.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:013ddb2eb3e3ddb6b91fd739de8abbc8281de91f2ae9f5067ac8586d6aa29cf6
+size 608016672

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "_class_name": "EulerDiscreteScheduler",
+  "_diffusers_version": "0.21.2",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "interpolation_type": "linear",
+  "num_train_timesteps": 1000,
+  "prediction_type": "epsilon",
+  "sample_max_value": 1.0,
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "timestep_spacing": "leading",
+  "trained_betas": null,
+  "use_karras_sigmas": false
+}

tokenizer/added_tokens.json ADDED Viewed

	@@ -0,0 +1,274 @@

+{
+  "</delimiter_of_multi_objects/>": 32013,
+  "</object>": 32012,
+  "</phrase>": 32010,
+  "<REC>": 32014,
+  "<grounding>": 32008,
+  "<image>": 32003,
+  "<object>": 32011,
+  "<patch_index_0000>": 32015,
+  "<patch_index_0001>": 32016,
+  "<patch_index_0002>": 32017,
+  "<patch_index_0003>": 32018,
+  "<patch_index_0004>": 32019,
+  "<patch_index_0005>": 32020,
+  "<patch_index_0006>": 32021,
+  "<patch_index_0007>": 32022,
+  "<patch_index_0008>": 32023,
+  "<patch_index_0009>": 32024,
+  "<patch_index_0010>": 32025,
+  "<patch_index_0011>": 32026,
+  "<patch_index_0012>": 32027,
+  "<patch_index_0013>": 32028,
+  "<patch_index_0014>": 32029,
+  "<patch_index_0015>": 32030,
+  "<patch_index_0016>": 32031,
+  "<patch_index_0017>": 32032,
+  "<patch_index_0018>": 32033,
+  "<patch_index_0019>": 32034,
+  "<patch_index_0020>": 32035,
+  "<patch_index_0021>": 32036,
+  "<patch_index_0022>": 32037,
+  "<patch_index_0023>": 32038,
+  "<patch_index_0024>": 32039,
+  "<patch_index_0025>": 32040,
+  "<patch_index_0026>": 32041,
+  "<patch_index_0027>": 32042,
+  "<patch_index_0028>": 32043,
+  "<patch_index_0029>": 32044,
+  "<patch_index_0030>": 32045,
+  "<patch_index_0031>": 32046,
+  "<patch_index_0032>": 32047,
+  "<patch_index_0033>": 32048,
+  "<patch_index_0034>": 32049,
+  "<patch_index_0035>": 32050,
+  "<patch_index_0036>": 32051,
+  "<patch_index_0037>": 32052,
+  "<patch_index_0038>": 32053,
+  "<patch_index_0039>": 32054,
+  "<patch_index_0040>": 32055,
+  "<patch_index_0041>": 32056,
+  "<patch_index_0042>": 32057,
+  "<patch_index_0043>": 32058,
+  "<patch_index_0044>": 32059,
+  "<patch_index_0045>": 32060,
+  "<patch_index_0046>": 32061,
+  "<patch_index_0047>": 32062,
+  "<patch_index_0048>": 32063,
+  "<patch_index_0049>": 32064,
+  "<patch_index_0050>": 32065,
+  "<patch_index_0051>": 32066,
+  "<patch_index_0052>": 32067,
+  "<patch_index_0053>": 32068,
+  "<patch_index_0054>": 32069,
+  "<patch_index_0055>": 32070,
+  "<patch_index_0056>": 32071,
+  "<patch_index_0057>": 32072,
+  "<patch_index_0058>": 32073,
+  "<patch_index_0059>": 32074,
+  "<patch_index_0060>": 32075,
+  "<patch_index_0061>": 32076,
+  "<patch_index_0062>": 32077,
+  "<patch_index_0063>": 32078,
+  "<patch_index_0064>": 32079,
+  "<patch_index_0065>": 32080,
+  "<patch_index_0066>": 32081,
+  "<patch_index_0067>": 32082,
+  "<patch_index_0068>": 32083,
+  "<patch_index_0069>": 32084,
+  "<patch_index_0070>": 32085,
+  "<patch_index_0071>": 32086,
+  "<patch_index_0072>": 32087,
+  "<patch_index_0073>": 32088,
+  "<patch_index_0074>": 32089,
+  "<patch_index_0075>": 32090,
+  "<patch_index_0076>": 32091,
+  "<patch_index_0077>": 32092,
+  "<patch_index_0078>": 32093,
+  "<patch_index_0079>": 32094,
+  "<patch_index_0080>": 32095,
+  "<patch_index_0081>": 32096,
+  "<patch_index_0082>": 32097,
+  "<patch_index_0083>": 32098,
+  "<patch_index_0084>": 32099,
+  "<patch_index_0085>": 32100,
+  "<patch_index_0086>": 32101,
+  "<patch_index_0087>": 32102,
+  "<patch_index_0088>": 32103,
+  "<patch_index_0089>": 32104,
+  "<patch_index_0090>": 32105,
+  "<patch_index_0091>": 32106,
+  "<patch_index_0092>": 32107,
+  "<patch_index_0093>": 32108,
+  "<patch_index_0094>": 32109,
+  "<patch_index_0095>": 32110,
+  "<patch_index_0096>": 32111,
+  "<patch_index_0097>": 32112,
+  "<patch_index_0098>": 32113,
+  "<patch_index_0099>": 32114,
+  "<patch_index_0100>": 32115,
+  "<patch_index_0101>": 32116,
+  "<patch_index_0102>": 32117,
+  "<patch_index_0103>": 32118,
+  "<patch_index_0104>": 32119,
+  "<patch_index_0105>": 32120,
+  "<patch_index_0106>": 32121,
+  "<patch_index_0107>": 32122,
+  "<patch_index_0108>": 32123,
+  "<patch_index_0109>": 32124,
+  "<patch_index_0110>": 32125,
+  "<patch_index_0111>": 32126,
+  "<patch_index_0112>": 32127,
+  "<patch_index_0113>": 32128,
+  "<patch_index_0114>": 32129,
+  "<patch_index_0115>": 32130,
+  "<patch_index_0116>": 32131,
+  "<patch_index_0117>": 32132,
+  "<patch_index_0118>": 32133,
+  "<patch_index_0119>": 32134,
+  "<patch_index_0120>": 32135,
+  "<patch_index_0121>": 32136,
+  "<patch_index_0122>": 32137,
+  "<patch_index_0123>": 32138,
+  "<patch_index_0124>": 32139,
+  "<patch_index_0125>": 32140,
+  "<patch_index_0126>": 32141,
+  "<patch_index_0127>": 32142,
+  "<patch_index_0128>": 32143,
+  "<patch_index_0129>": 32144,
+  "<patch_index_0130>": 32145,
+  "<patch_index_0131>": 32146,
+  "<patch_index_0132>": 32147,
+  "<patch_index_0133>": 32148,
+  "<patch_index_0134>": 32149,
+  "<patch_index_0135>": 32150,
+  "<patch_index_0136>": 32151,
+  "<patch_index_0137>": 32152,
+  "<patch_index_0138>": 32153,
+  "<patch_index_0139>": 32154,
+  "<patch_index_0140>": 32155,
+  "<patch_index_0141>": 32156,
+  "<patch_index_0142>": 32157,
+  "<patch_index_0143>": 32158,
+  "<patch_index_0144>": 32159,
+  "<patch_index_0145>": 32160,
+  "<patch_index_0146>": 32161,
+  "<patch_index_0147>": 32162,
+  "<patch_index_0148>": 32163,
+  "<patch_index_0149>": 32164,
+  "<patch_index_0150>": 32165,
+  "<patch_index_0151>": 32166,
+  "<patch_index_0152>": 32167,
+  "<patch_index_0153>": 32168,
+  "<patch_index_0154>": 32169,
+  "<patch_index_0155>": 32170,
+  "<patch_index_0156>": 32171,
+  "<patch_index_0157>": 32172,
+  "<patch_index_0158>": 32173,
+  "<patch_index_0159>": 32174,
+  "<patch_index_0160>": 32175,
+  "<patch_index_0161>": 32176,
+  "<patch_index_0162>": 32177,
+  "<patch_index_0163>": 32178,
+  "<patch_index_0164>": 32179,
+  "<patch_index_0165>": 32180,
+  "<patch_index_0166>": 32181,
+  "<patch_index_0167>": 32182,
+  "<patch_index_0168>": 32183,
+  "<patch_index_0169>": 32184,
+  "<patch_index_0170>": 32185,
+  "<patch_index_0171>": 32186,
+  "<patch_index_0172>": 32187,
+  "<patch_index_0173>": 32188,
+  "<patch_index_0174>": 32189,
+  "<patch_index_0175>": 32190,
+  "<patch_index_0176>": 32191,
+  "<patch_index_0177>": 32192,
+  "<patch_index_0178>": 32193,
+  "<patch_index_0179>": 32194,
+  "<patch_index_0180>": 32195,
+  "<patch_index_0181>": 32196,
+  "<patch_index_0182>": 32197,
+  "<patch_index_0183>": 32198,
+  "<patch_index_0184>": 32199,
+  "<patch_index_0185>": 32200,
+  "<patch_index_0186>": 32201,
+  "<patch_index_0187>": 32202,
+  "<patch_index_0188>": 32203,
+  "<patch_index_0189>": 32204,
+  "<patch_index_0190>": 32205,
+  "<patch_index_0191>": 32206,
+  "<patch_index_0192>": 32207,
+  "<patch_index_0193>": 32208,
+  "<patch_index_0194>": 32209,
+  "<patch_index_0195>": 32210,
+  "<patch_index_0196>": 32211,
+  "<patch_index_0197>": 32212,
+  "<patch_index_0198>": 32213,
+  "<patch_index_0199>": 32214,
+  "<patch_index_0200>": 32215,
+  "<patch_index_0201>": 32216,
+  "<patch_index_0202>": 32217,
+  "<patch_index_0203>": 32218,
+  "<patch_index_0204>": 32219,
+  "<patch_index_0205>": 32220,
+  "<patch_index_0206>": 32221,
+  "<patch_index_0207>": 32222,
+  "<patch_index_0208>": 32223,
+  "<patch_index_0209>": 32224,
+  "<patch_index_0210>": 32225,
+  "<patch_index_0211>": 32226,
+  "<patch_index_0212>": 32227,
+  "<patch_index_0213>": 32228,
+  "<patch_index_0214>": 32229,
+  "<patch_index_0215>": 32230,
+  "<patch_index_0216>": 32231,
+  "<patch_index_0217>": 32232,
+  "<patch_index_0218>": 32233,
+  "<patch_index_0219>": 32234,
+  "<patch_index_0220>": 32235,
+  "<patch_index_0221>": 32236,
+  "<patch_index_0222>": 32237,
+  "<patch_index_0223>": 32238,
+  "<patch_index_0224>": 32239,
+  "<patch_index_0225>": 32240,
+  "<patch_index_0226>": 32241,
+  "<patch_index_0227>": 32242,
+  "<patch_index_0228>": 32243,
+  "<patch_index_0229>": 32244,
+  "<patch_index_0230>": 32245,
+  "<patch_index_0231>": 32246,
+  "<patch_index_0232>": 32247,
+  "<patch_index_0233>": 32248,
+  "<patch_index_0234>": 32249,
+  "<patch_index_0235>": 32250,
+  "<patch_index_0236>": 32251,
+  "<patch_index_0237>": 32252,
+  "<patch_index_0238>": 32253,
+  "<patch_index_0239>": 32254,
+  "<patch_index_0240>": 32255,
+  "<patch_index_0241>": 32256,
+  "<patch_index_0242>": 32257,
+  "<patch_index_0243>": 32258,
+  "<patch_index_0244>": 32259,
+  "<patch_index_0245>": 32260,
+  "<patch_index_0246>": 32261,
+  "<patch_index_0247>": 32262,
+  "<patch_index_0248>": 32263,
+  "<patch_index_0249>": 32264,
+  "<patch_index_0250>": 32265,
+  "<patch_index_0251>": 32266,
+  "<patch_index_0252>": 32267,
+  "<patch_index_0253>": 32268,
+  "<patch_index_0254>": 32269,
+  "<patch_index_0255>": 32270,
+  "<patch_index_0256>": 32271,
+  "<phrase>": 32009,
+  "[/IMG]": 32002,
+  "[/gIMG]": 32005,
+  "[EOC]": 32006,
+  "[IMG]": 32001,
+  "[PAD]": 32000,
+  "[VIDEO]": 32007,
+  "[gIMG]": 32004
+}

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,285 @@

+{
+  "additional_special_tokens": [
+    "[IMG]",
+    "[/IMG]",
+    "<image>",
+    "[gIMG]",
+    "[/gIMG]",
+    "[EOC]",
+    "[VIDEO]",
+    "<grounding>",
+    "<phrase>",
+    "</phrase>",
+    "<object>",
+    "</object>",
+    "</delimiter_of_multi_objects/>",
+    "<REC>",
+    "<patch_index_0000>",
+    "<patch_index_0001>",
+    "<patch_index_0002>",
+    "<patch_index_0003>",
+    "<patch_index_0004>",
+    "<patch_index_0005>",
+    "<patch_index_0006>",
+    "<patch_index_0007>",
+    "<patch_index_0008>",
+    "<patch_index_0009>",
+    "<patch_index_0010>",
+    "<patch_index_0011>",
+    "<patch_index_0012>",
+    "<patch_index_0013>",
+    "<patch_index_0014>",
+    "<patch_index_0015>",
+    "<patch_index_0016>",
+    "<patch_index_0017>",
+    "<patch_index_0018>",
+    "<patch_index_0019>",
+    "<patch_index_0020>",
+    "<patch_index_0021>",
+    "<patch_index_0022>",
+    "<patch_index_0023>",
+    "<patch_index_0024>",
+    "<patch_index_0025>",
+    "<patch_index_0026>",
+    "<patch_index_0027>",
+    "<patch_index_0028>",
+    "<patch_index_0029>",
+    "<patch_index_0030>",
+    "<patch_index_0031>",
+    "<patch_index_0032>",
+    "<patch_index_0033>",
+    "<patch_index_0034>",
+    "<patch_index_0035>",
+    "<patch_index_0036>",
+    "<patch_index_0037>",
+    "<patch_index_0038>",
+    "<patch_index_0039>",
+    "<patch_index_0040>",
+    "<patch_index_0041>",
+    "<patch_index_0042>",
+    "<patch_index_0043>",
+    "<patch_index_0044>",
+    "<patch_index_0045>",
+    "<patch_index_0046>",
+    "<patch_index_0047>",
+    "<patch_index_0048>",
+    "<patch_index_0049>",
+    "<patch_index_0050>",
+    "<patch_index_0051>",
+    "<patch_index_0052>",
+    "<patch_index_0053>",
+    "<patch_index_0054>",
+    "<patch_index_0055>",
+    "<patch_index_0056>",
+    "<patch_index_0057>",
+    "<patch_index_0058>",
+    "<patch_index_0059>",
+    "<patch_index_0060>",
+    "<patch_index_0061>",
+    "<patch_index_0062>",
+    "<patch_index_0063>",
+    "<patch_index_0064>",
+    "<patch_index_0065>",
+    "<patch_index_0066>",
+    "<patch_index_0067>",
+    "<patch_index_0068>",
+    "<patch_index_0069>",
+    "<patch_index_0070>",
+    "<patch_index_0071>",
+    "<patch_index_0072>",
+    "<patch_index_0073>",
+    "<patch_index_0074>",
+    "<patch_index_0075>",
+    "<patch_index_0076>",
+    "<patch_index_0077>",
+    "<patch_index_0078>",
+    "<patch_index_0079>",
+    "<patch_index_0080>",
+    "<patch_index_0081>",
+    "<patch_index_0082>",
+    "<patch_index_0083>",
+    "<patch_index_0084>",
+    "<patch_index_0085>",
+    "<patch_index_0086>",
+    "<patch_index_0087>",
+    "<patch_index_0088>",
+    "<patch_index_0089>",
+    "<patch_index_0090>",
+    "<patch_index_0091>",
+    "<patch_index_0092>",
+    "<patch_index_0093>",
+    "<patch_index_0094>",
+    "<patch_index_0095>",
+    "<patch_index_0096>",
+    "<patch_index_0097>",
+    "<patch_index_0098>",
+    "<patch_index_0099>",
+    "<patch_index_0100>",
+    "<patch_index_0101>",
+    "<patch_index_0102>",
+    "<patch_index_0103>",
+    "<patch_index_0104>",
+    "<patch_index_0105>",
+    "<patch_index_0106>",
+    "<patch_index_0107>",
+    "<patch_index_0108>",
+    "<patch_index_0109>",
+    "<patch_index_0110>",
+    "<patch_index_0111>",
+    "<patch_index_0112>",
+    "<patch_index_0113>",
+    "<patch_index_0114>",
+    "<patch_index_0115>",
+    "<patch_index_0116>",
+    "<patch_index_0117>",
+    "<patch_index_0118>",
+    "<patch_index_0119>",
+    "<patch_index_0120>",
+    "<patch_index_0121>",
+    "<patch_index_0122>",
+    "<patch_index_0123>",
+    "<patch_index_0124>",
+    "<patch_index_0125>",
+    "<patch_index_0126>",
+    "<patch_index_0127>",
+    "<patch_index_0128>",
+    "<patch_index_0129>",
+    "<patch_index_0130>",
+    "<patch_index_0131>",
+    "<patch_index_0132>",
+    "<patch_index_0133>",
+    "<patch_index_0134>",
+    "<patch_index_0135>",
+    "<patch_index_0136>",
+    "<patch_index_0137>",
+    "<patch_index_0138>",
+    "<patch_index_0139>",
+    "<patch_index_0140>",
+    "<patch_index_0141>",
+    "<patch_index_0142>",
+    "<patch_index_0143>",
+    "<patch_index_0144>",
+    "<patch_index_0145>",
+    "<patch_index_0146>",
+    "<patch_index_0147>",
+    "<patch_index_0148>",
+    "<patch_index_0149>",
+    "<patch_index_0150>",
+    "<patch_index_0151>",
+    "<patch_index_0152>",
+    "<patch_index_0153>",
+    "<patch_index_0154>",
+    "<patch_index_0155>",
+    "<patch_index_0156>",
+    "<patch_index_0157>",
+    "<patch_index_0158>",
+    "<patch_index_0159>",
+    "<patch_index_0160>",
+    "<patch_index_0161>",
+    "<patch_index_0162>",
+    "<patch_index_0163>",
+    "<patch_index_0164>",
+    "<patch_index_0165>",
+    "<patch_index_0166>",
+    "<patch_index_0167>",
+    "<patch_index_0168>",
+    "<patch_index_0169>",
+    "<patch_index_0170>",
+    "<patch_index_0171>",
+    "<patch_index_0172>",
+    "<patch_index_0173>",
+    "<patch_index_0174>",
+    "<patch_index_0175>",
+    "<patch_index_0176>",
+    "<patch_index_0177>",
+    "<patch_index_0178>",
+    "<patch_index_0179>",
+    "<patch_index_0180>",
+    "<patch_index_0181>",
+    "<patch_index_0182>",
+    "<patch_index_0183>",
+    "<patch_index_0184>",
+    "<patch_index_0185>",
+    "<patch_index_0186>",
+    "<patch_index_0187>",
+    "<patch_index_0188>",
+    "<patch_index_0189>",
+    "<patch_index_0190>",
+    "<patch_index_0191>",
+    "<patch_index_0192>",
+    "<patch_index_0193>",
+    "<patch_index_0194>",
+    "<patch_index_0195>",
+    "<patch_index_0196>",
+    "<patch_index_0197>",
+    "<patch_index_0198>",
+    "<patch_index_0199>",
+    "<patch_index_0200>",
+    "<patch_index_0201>",
+    "<patch_index_0202>",
+    "<patch_index_0203>",
+    "<patch_index_0204>",
+    "<patch_index_0205>",
+    "<patch_index_0206>",
+    "<patch_index_0207>",
+    "<patch_index_0208>",
+    "<patch_index_0209>",
+    "<patch_index_0210>",
+    "<patch_index_0211>",
+    "<patch_index_0212>",
+    "<patch_index_0213>",
+    "<patch_index_0214>",
+    "<patch_index_0215>",
+    "<patch_index_0216>",
+    "<patch_index_0217>",
+    "<patch_index_0218>",
+    "<patch_index_0219>",
+    "<patch_index_0220>",
+    "<patch_index_0221>",
+    "<patch_index_0222>",
+    "<patch_index_0223>",
+    "<patch_index_0224>",
+    "<patch_index_0225>",
+    "<patch_index_0226>",
+    "<patch_index_0227>",
+    "<patch_index_0228>",
+    "<patch_index_0229>",
+    "<patch_index_0230>",
+    "<patch_index_0231>",
+    "<patch_index_0232>",
+    "<patch_index_0233>",
+    "<patch_index_0234>",
+    "<patch_index_0235>",
+    "<patch_index_0236>",
+    "<patch_index_0237>",
+    "<patch_index_0238>",
+    "<patch_index_0239>",
+    "<patch_index_0240>",
+    "<patch_index_0241>",
+    "<patch_index_0242>",
+    "<patch_index_0243>",
+    "<patch_index_0244>",
+    "<patch_index_0245>",
+    "<patch_index_0246>",
+    "<patch_index_0247>",
+    "<patch_index_0248>",
+    "<patch_index_0249>",
+    "<patch_index_0250>",
+    "<patch_index_0251>",
+    "<patch_index_0252>",
+    "<patch_index_0253>",
+    "<patch_index_0254>",
+    "<patch_index_0255>",
+    "<patch_index_0256>"
+  ],
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "[PAD]",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

unet/config.json ADDED Viewed

	@@ -0,0 +1,72 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.21.2",
+  "_name_or_path": "/share/project/quansun/release_hf/Emu2-VisualGeneration/unet",
+  "act_fn": "silu",
+  "addition_embed_type": "text_time",
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": 256,
+  "attention_head_dim": [
+    5,
+    10,
+    20
+  ],
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 1792,
+  "cross_attention_norm": null,
+  "down_block_types": [
+    "DownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 3328,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "sample_size": 128,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": [
+    1,
+    2,
+    10
+  ],
+  "up_block_types": [
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "UpBlock2D"
+  ],
+  "upcast_attention": null,
+  "use_linear_projection": true
+}

unet/diffusion_pytorch_model.bf16.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67215fe9e8e24202651fce2ff72203d21bdb7986a88ec062f72cc94f6040a314
+size 5051265352

vae/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.21.2",
+  "_name_or_path": "/share/project/quansun/release_hf/Emu2-VisualGeneration/vae",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 1024,
+  "scaling_factor": 0.13025,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vae/diffusion_pytorch_model.bf16.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2741af7e84fe3b0a7aee02f89fa34c0858ed55f5782aab5931b94938983652da
+size 167335590