OmniRewardModel checkpoint upload

- .gitattributes +1 -0
- added_tokens.json +59 -0
- config.json +197 -0
- configuration_minicpm.py +210 -0
- generation_config.json +6 -0
- image_processing_minicpmv.py +407 -0
- merges.txt +0 -0
- model-00001-of-00004.safetensors +3 -0
- model-00002-of-00004.safetensors +3 -0
- model-00003-of-00004.safetensors +3 -0
- model-00004-of-00004.safetensors +3 -0
- model.safetensors.index.json +1167 -0
- modeling_minicpmo.py +0 -0
- modeling_navit_siglip.py +940 -0
- preprocessor_config.json +43 -0
- processing_minicpmo.py +505 -0
- processor_config.json +6 -0
- resampler.py +864 -0
- special_tokens_map.json +74 -0
- tokenization_minicpmo_fast.py +110 -0
- tokenizer.json +3 -0
- tokenizer_config.json +527 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
- utils.py +203 -0
- value_head.safetensors +3 -0
- vocab.json +0 -0
    	
.gitattributes CHANGED

```diff
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
```
    	
added_tokens.json ADDED

```json
{
  "</asr>": 151682,
  "</box>": 151670,
  "</image>": 151666,
  "</image_id>": 151678,
  "</point>": 151674,
  "</quad>": 151672,
  "</query>": 151684,
  "</ref>": 151668,
  "</slice>": 151676,
  "</tool_call>": 151658,
  "</unit>": 151680,
  "<asr>": 151681,
  "<box>": 151669,
  "<image>": 151665,
  "<image_id>": 151677,
  "<point>": 151673,
  "<quad>": 151671,
  "<query>": 151683,
  "<ref>": 151667,
  "<reserved_43>": 151698,
  "<reserved_53>": 151699,
  "<slice>": 151675,
  "<tool_call>": 151657,
  "<unit>": 151679,
  "<|audio_end|>": 151687,
  "<|audio_start|>": 151685,
  "<|audio|>": 151686,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|interrupt|>": 151695,
  "<|listen|>": 151693,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|speak|>": 151694,
  "<|spk_bos|>": 151688,
  "<|spk_eos|>": 151690,
  "<|spk|>": 151689,
  "<|tts_bos|>": 151691,
  "<|tts_eos|>": 151692,
  "<|vad_end|>": 151697,
  "<|vad_start|>": 151696,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
```
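The id assignments above are what the tokenizer shipped in this commit registers as added special tokens. A minimal sketch of checking them, assuming the checkpoint is loaded from a placeholder repo id or local path:

```python
from transformers import AutoTokenizer

# "path/to/OmniRewardModel" is a placeholder; substitute the actual repo id or local directory.
# trust_remote_code is needed because the repo ships a custom tokenizer class.
tokenizer = AutoTokenizer.from_pretrained("path/to/OmniRewardModel", trust_remote_code=True)

# Tokens registered in added_tokens.json resolve to the ids listed above.
print(tokenizer.convert_tokens_to_ids("<image>"))     # expected: 151665
print(tokenizer.convert_tokens_to_ids("<|im_end|>"))  # expected: 151645
```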
    	
config.json ADDED

```json
{
  "_name_or_path": "/mnt/usercache/hongbang/MiniCPM-o-2_6/",
  "architectures": [
    "MiniCPMO"
  ],
  "attention_dropout": 0.0,
  "audio_chunk_length": 1.0,
  "audio_config": {
    "_name_or_path": "openai/whisper-medium",
    "architectures": [
      "MiniCPMWhisperEncoder"
    ],
    "begin_suppress_tokens": [
      220,
      50257
    ],
    "bos_token_id": 50257,
    "d_model": 1024,
    "decoder_attention_heads": 16,
    "decoder_ffn_dim": 4096,
    "decoder_layers": 24,
    "decoder_start_token_id": 50258,
    "encoder_attention_heads": 16,
    "encoder_ffn_dim": 4096,
    "encoder_layers": 24,
    "eos_token_id": 50257,
    "forced_decoder_ids": [
      [
        1,
        50259
      ],
      [
        2,
        50359
      ],
      [
        3,
        50363
      ]
    ],
    "max_length": 448,
    "model_type": "whisper",
    "num_hidden_layers": 24,
    "pad_token_id": 50257,
    "suppress_tokens": [
      1,
      2,
      7,
      8,
      9,
      10,
      14,
      25,
      26,
      27,
      28,
      29,
      31,
      58,
      59,
      60,
      61,
      62,
      63,
      90,
      91,
      92,
      93,
      359,
      503,
      522,
      542,
      873,
      893,
      902,
      918,
      922,
      931,
      1350,
      1853,
      1982,
      2460,
      2627,
      3246,
      3253,
      3268,
      3536,
      3846,
      3961,
      4183,
      4667,
      6585,
      6647,
      7273,
      9061,
      9383,
      10428,
      10929,
      11938,
      12033,
      12331,
      12562,
      13793,
      14157,
      14635,
      15265,
      15618,
      16553,
      16604,
      18362,
      18956,
      20075,
      21675,
      22520,
      26130,
      26161,
      26435,
      28279,
      29464,
      31650,
      32302,
      32470,
      36865,
      42863,
      47425,
      49870,
      50254,
      50258,
      50358,
      50359,
      50360,
      50361,
      50362
    ],
    "torch_dtype": "float32"
  },
  "audio_pool_step": 2,
  "auto_map": {
    "AutoConfig": "configuration_minicpm.MiniCPMOConfig",
    "AutoModel": "modeling_minicpmo.MiniCPMO",
    "AutoModelForCausalLM": "modeling_minicpmo.MiniCPMO"
  },
  "batch_vision_input": true,
  "bos_token_id": 151643,
  "chunk_input": true,
  "drop_vision_last_layer": false,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "image_size": 448,
  "init_audio": true,
  "init_tts": false,
  "init_vision": true,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "listen_speak_type": "asr",
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "minicpmo",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "patch_size": 14,
  "query_num": 64,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "slice_config": {
    "max_slice_nums": 9,
    "model_type": "minicpmv"
  },
  "slice_mode": true,
  "sliding_window": null,
  "stream_input": false,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
  "tts_config": {
    "llm_dim": 3584,
    "model_type": "conditional_chattts"
  },
  "use_cache": false,
  "use_image_id": true,
  "use_sliding_window": false,
  "version": 2.6,
  "vision_batch_size": 16,
  "vision_config": {
    "hidden_size": 1152,
    "image_size": 980,
    "intermediate_size": 4304,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "patch_size": 14
  },
  "vocab_size": 151700
}
```
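The auto_map entries above route AutoConfig and AutoModel to the custom classes shipped in this repository, so loading requires trust_remote_code. A minimal sketch, with the repo id as a placeholder:

```python
import torch
from transformers import AutoConfig, AutoModel

repo = "path/to/OmniRewardModel"  # placeholder; use the actual repo id or local directory

# AutoConfig resolves to configuration_minicpm.MiniCPMOConfig via auto_map.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(config.model_type, config.hidden_size, config.vocab_size)  # minicpmo 3584 151700

# AutoModel resolves to modeling_minicpmo.MiniCPMO; bfloat16 matches torch_dtype in config.json.
model = AutoModel.from_pretrained(repo, trust_remote_code=True, torch_dtype=torch.bfloat16)
```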
    	
configuration_minicpm.py ADDED

```python
# coding=utf-8
# Copyright 2025 The OpenBMB Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Union

from transformers import PretrainedConfig
from transformers import Qwen2Config
from transformers import WhisperConfig
from transformers.utils import logging

from .modeling_navit_siglip import SiglipVisionConfig

logger = logging.get_logger(__name__)


class MiniCPMVSliceConfig(PretrainedConfig):
    model_type = "minicpmv"

    def __init__(
        self,
        patch_size=14,
        max_slice_nums=9,
        scale_resolution=448,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.max_slice_nums = max_slice_nums
        self.scale_resolution = scale_resolution

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        if config_dict.get("model_type") == "minicpmv":
            config_dict = config_dict["slice_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class ConditionalChatTTSConfig(PretrainedConfig):
    model_type = "conditional_chattts"

    def __init__(
        self,
        llm_dim: int = 2560,
        hidden_size: int = 768,
        intermediate_size: int = 3072,
        num_attention_heads: int = 12,
        num_hidden_layers: int = 20,
        max_position_embeddings: int = 4096,
        num_audio_tokens: int = 626,
        num_text_tokens: int = 21178,
        num_mel_bins: int = 100,
        num_vq: int = 4,
        use_speaker_embedding: bool = True,
        use_llm_hidden_state: bool = False,
        spk_emb_token_id: int = 21143,
        num_spk_embs: int = 1,
        audio_bos_token_id: int = 21132,
        text_eos_token_id: int = 21133,
        use_text: bool = True,
        streaming: bool = True,
        streaming_text_chunk_size: int = 10,
        streaming_text_reserved_len: int = 300,
        streaming_audio_chunk_size: int = 50,
        attn_implementation: str = "sdpa",
        use_mlp: bool = True,
        aug_loss_weight: bool = True,
        do_sample: bool = True,
        top_p: float = 0.7,
        top_k: int = 20,
        repetition_penalty: float = 1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.llm_dim = llm_dim
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.max_position_embeddings = max_position_embeddings
        self.num_audio_tokens = num_audio_tokens
        self.num_text_tokens = num_text_tokens
        self.num_mel_bins = num_mel_bins
        self.num_vq = num_vq
        self.use_speaker_embedding = use_speaker_embedding
        self.use_llm_hidden_state = use_llm_hidden_state
        self.spk_emb_token_id = spk_emb_token_id
        self.num_spk_embs = num_spk_embs
        self.audio_bos_token_id = audio_bos_token_id
        self.text_eos_token_id = text_eos_token_id
        self.use_text = use_text
        self.streaming = streaming
        self.streaming_text_chunk_size = streaming_text_chunk_size
        self.streaming_text_reserved_len = streaming_text_reserved_len
        self.streaming_audio_chunk_size = streaming_audio_chunk_size
        self.attn_implementation = attn_implementation
        self.use_mlp = use_mlp
        self.aug_loss_weight = aug_loss_weight
        self.do_sample = do_sample
        self.top_p = top_p
        self.top_k = top_k
        self.repetition_penalty = repetition_penalty


class MiniCPMOConfig(Qwen2Config):
    model_type = "minicpmo"
    keys_to_ignore_at_inference = ["past_key_values"]

    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "siglip",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

    def __init__(
        self,
        use_cache=True,
        query_num=64,
        image_size=448,
        drop_vision_last_layer=True,
        batch_vision_input=True,
        slice_config=None,
        vision_config=None,
        audio_config=None,
        tts_config=None,
        use_image_id=True,
        vision_batch_size=16,
        audio_pool_step=2,
        audio_chunk_length=1.0,
        stream_input=False,
        init_vision=True,
        init_audio=True,
        init_tts=True,
        **kwargs,
    ):
        self.use_cache = use_cache
        self.query_num = query_num
        self.image_size = image_size
        self.drop_vision_last_layer = drop_vision_last_layer
        self.batch_vision_input = batch_vision_input
        self.use_image_id = use_image_id
        self.vision_batch_size = vision_batch_size
        self.audio_pool_step = audio_pool_step
        self.audio_chunk_length = audio_chunk_length
        self.stream_input = stream_input
        self.init_vision = init_vision
        self.init_audio = init_audio
        self.init_tts = init_tts

        if slice_config is None:
            self.slice_config = MiniCPMVSliceConfig(max_slice_nums=1)
        else:
            self.slice_config = MiniCPMVSliceConfig(**slice_config)
        self.slice_mode = True

        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
        if vision_config is None:
            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config

        # same as openai/whisper-medium add use_cache
        if audio_config is None:
            self.audio_config = WhisperConfig()
        elif isinstance(audio_config, dict):
            self.audio_config = WhisperConfig(**audio_config)
        elif isinstance(audio_config, WhisperConfig):
            self.audio_config = audio_config

        if tts_config is None:
            self.tts_config = ConditionalChatTTSConfig()
        elif isinstance(tts_config, dict):
            self.tts_config = ConditionalChatTTSConfig(**tts_config)
        elif isinstance(tts_config, ConditionalChatTTSConfig):
            self.tts_config = tts_config

        self.patch_size = self.vision_config.patch_size

        super().__init__(**kwargs)
```
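As a quick illustration of the defaults in MiniCPMOConfig above, the class can be instantiated directly; this is a sketch only, assuming the checkpoint directory is importable as a package (illustrative name) so the relative import of modeling_navit_siglip resolves:

```python
# Sketch: "minicpmo_checkpoint" is an illustrative package name for the files in this commit.
from minicpmo_checkpoint.configuration_minicpm import MiniCPMOConfig

# With no arguments, __init__ falls back to its built-in defaults:
cfg = MiniCPMOConfig()
print(cfg.slice_config.max_slice_nums)  # 1 (slice_config=None branch)
print(cfg.vision_config.image_size)     # 980, from default_vision_config
print(cfg.patch_size)                   # 14, copied from vision_config.patch_size

# Passing the nested dict from config.json reproduces the shipped slicing setup:
cfg2 = MiniCPMOConfig(slice_config={"max_slice_nums": 9, "model_type": "minicpmv"})
print(cfg2.slice_config.max_slice_nums)  # 9
```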
    	
generation_config.json ADDED

```json
{
  "_from_model_config": true,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "transformers_version": "4.45.2"
}
```
    	
        image_processing_minicpmv.py
    ADDED
    
    | @@ -0,0 +1,407 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # coding=utf-8
         | 
| 2 | 
            +
            # Copyright 2025 The OpenBMB Team. All rights reserved.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
            # You may obtain a copy of the License at
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
            #
         | 
| 10 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 14 | 
            +
            # limitations under the License.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            import math
         | 
| 17 | 
            +
            from typing import Any
         | 
| 18 | 
            +
            from typing import Dict
         | 
| 19 | 
            +
            from typing import List
         | 
| 20 | 
            +
            from typing import Optional
         | 
| 21 | 
            +
            from typing import Union
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            import numpy as np
         | 
| 24 | 
            +
            import PIL
         | 
| 25 | 
            +
            import PIL.Image
         | 
| 26 | 
            +
            import PIL.ImageSequence
         | 
| 27 | 
            +
            import torch
         | 
| 28 | 
            +
            from PIL import Image
         | 
| 29 | 
            +
            from transformers import AutoImageProcessor
         | 
| 30 | 
            +
            from transformers.image_processing_utils import BaseImageProcessor
         | 
| 31 | 
            +
            from transformers.image_processing_utils import BatchFeature
         | 
| 32 | 
            +
            from transformers.image_transforms import to_channel_dimension_format
         | 
| 33 | 
            +
            from transformers.image_utils import ChannelDimension
         | 
| 34 | 
            +
            from transformers.image_utils import infer_channel_dimension_format
         | 
| 35 | 
            +
            from transformers.image_utils import is_torch_tensor
         | 
| 36 | 
            +
            from transformers.image_utils import to_numpy_array
         | 
| 37 | 
            +
            from transformers.image_utils import valid_images
         | 
| 38 | 
            +
            from transformers.utils import is_torch_device
         | 
| 39 | 
            +
            from transformers.utils import is_torch_dtype
         | 
| 40 | 
            +
            from transformers.utils import requires_backends
         | 
| 41 | 
            +
            from transformers.utils import TensorType
         | 
| 42 | 
            +
             | 
| 43 | 
            +
             | 
| 44 | 
            +
            def recursive_converter(converter, value):
         | 
| 45 | 
            +
                if isinstance(value, list):
         | 
| 46 | 
            +
                    new_value = []
         | 
| 47 | 
            +
                    for v in value:
         | 
| 48 | 
            +
                        new_value += [recursive_converter(converter, v)]
         | 
| 49 | 
            +
                    return new_value
         | 
| 50 | 
            +
                else:
         | 
| 51 | 
            +
                    return converter(value)
         | 
| 52 | 
            +
             | 
| 53 | 
            +
             | 
| 54 | 
            +
            class MiniCPMOBatchFeature(BatchFeature):
         | 
| 55 | 
            +
                r"""
         | 
| 56 | 
            +
                Extend from BatchFeature for supporting various image size
         | 
| 57 | 
            +
                """
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
         | 
| 60 | 
            +
                    super().__init__(data)
         | 
| 61 | 
            +
                    self.convert_to_tensors(tensor_type=tensor_type)
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
         | 
| 64 | 
            +
                    if tensor_type is None:
         | 
| 65 | 
            +
                        return self
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                    is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                    def converter(value):
         | 
| 70 | 
            +
                        try:
         | 
| 71 | 
            +
                            if not is_tensor(value):
         | 
| 72 | 
            +
                                tensor = as_tensor(value)
         | 
| 73 | 
            +
                                return tensor
         | 
| 74 | 
            +
                        except:  # noqa E722
         | 
| 75 | 
            +
                            if key == "overflowing_values":
         | 
| 76 | 
            +
                                raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
         | 
| 77 | 
            +
                            raise ValueError(
         | 
| 78 | 
            +
                                "Unable to create tensor, you should probably activate padding "
         | 
| 79 | 
            +
                                "with 'padding=True' to have batched tensors with the same length."
         | 
| 80 | 
            +
                            )
         | 
| 81 | 
            +
             | 
| 82 | 
            +
                    for key, value in self.items():
         | 
| 83 | 
            +
                        self[key] = recursive_converter(converter, value)
         | 
| 84 | 
            +
                    return self
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                def to(self, *args, **kwargs) -> "MiniCPMOBatchFeature":
         | 
| 87 | 
            +
                    requires_backends(self, ["torch"])
         | 
| 88 | 
            +
                    import torch
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                    def cast_tensor(v):
         | 
| 91 | 
            +
                        # check if v is a floating point
         | 
| 92 | 
            +
                        if torch.is_floating_point(v):
         | 
| 93 | 
            +
                            # cast and send to device
         | 
| 94 | 
            +
                            return v.to(*args, **kwargs)
         | 
| 95 | 
            +
                        elif device is not None:
         | 
| 96 | 
            +
                            return v.to(device=device)
         | 
| 97 | 
            +
                        else:
         | 
| 98 | 
            +
                            return v
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                    new_data = {}
         | 
| 101 | 
            +
                    device = kwargs.get("device")
         | 
| 102 | 
            +
                    # Check if the args are a device or a dtype
         | 
| 103 | 
            +
                    if device is None and len(args) > 0:
         | 
| 104 | 
            +
                        # device should be always the first argument
         | 
| 105 | 
            +
                        arg = args[0]
         | 
| 106 | 
            +
                        if is_torch_dtype(arg):
         | 
| 107 | 
            +
                            # The first argument is a dtype
         | 
| 108 | 
            +
                            pass
         | 
| 109 | 
            +
                        elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
         | 
| 110 | 
            +
                            device = arg
         | 
| 111 | 
            +
                        else:
         | 
| 112 | 
            +
                            # it's something else
         | 
| 113 | 
            +
                            raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
         | 
| 114 | 
            +
                    # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
         | 
| 115 | 
            +
                    for k, v in self.items():
         | 
| 116 | 
            +
                        new_data[k] = recursive_converter(cast_tensor, v)
         | 
| 117 | 
            +
                    self.data = new_data
         | 
| 118 | 
            +
                    return self
         | 
| 119 | 
            +
             | 
| 120 | 
            +
             | 
| 121 | 
            +
            class MiniCPMVImageProcessor(BaseImageProcessor):
         | 
| 122 | 
            +
                model_input_names = ["pixel_values"]
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                def __init__(self, max_slice_nums=9, scale_resolution=448, patch_size=14, **kwargs):
         | 
| 125 | 
            +
                    super().__init__(**kwargs)
         | 
| 126 | 
            +
                    self.max_slice_nums = max_slice_nums
         | 
| 127 | 
            +
                    self.scale_resolution = scale_resolution
         | 
| 128 | 
            +
                    self.patch_size = patch_size
         | 
| 129 | 
            +
                    self.use_image_id = kwargs.pop("use_image_id", False)
         | 
| 130 | 
            +
                    self.image_feature_size = kwargs.pop("image_feature_size", 64)
         | 
| 131 | 
            +
                    self.im_start_token = kwargs.pop("im_start", "<image>")
         | 
| 132 | 
            +
                    self.im_end_token = kwargs.pop("im_end", "</image>")
         | 
| 133 | 
            +
                    self.slice_start_token = kwargs.pop("slice_start", "<slice>")
         | 
| 134 | 
            +
                    self.slice_end_token = kwargs.pop("slice_end", "</slice>")
         | 
| 135 | 
            +
                    self.unk_token = kwargs.pop("unk", "<unk>")
         | 
| 136 | 
            +
                    self.im_id_start = kwargs.pop("im_id_start", "<image_id>")
         | 
| 137 | 
            +
                    self.im_id_end = kwargs.pop("im_id_end", "</image_id>")
         | 
| 138 | 
            +
                    self.slice_mode = kwargs.pop("slice_mode", True)
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                    self.mean = np.array(kwargs.pop("norm_mean", [0.5, 0.5, 0.5]))
         | 
| 141 | 
            +
                    self.std = np.array(kwargs.pop("norm_std", [0.5, 0.5, 0.5]))
         | 
| 142 | 
            +
                    self.version = kwargs.pop("version", 2.0)
         | 
| 143 | 
            +
             | 
| 144 | 
            +
                def ensure_divide(self, length, patch_size):
         | 
| 145 | 
            +
                    return max(round(length / patch_size) * patch_size, patch_size)
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                def find_best_resize(self, original_size, scale_resolution, patch_size, allow_upscale=False):
         | 
| 148 | 
            +
                    width, height = original_size
         | 
| 149 | 
            +
                    if (width * height > scale_resolution * scale_resolution) or allow_upscale:
         | 
| 150 | 
            +
                        r = width / height
         | 
| 151 | 
            +
                        height = int(scale_resolution / math.sqrt(r))
         | 
| 152 | 
            +
                        width = int(height * r)
         | 
| 153 | 
            +
                    best_width = self.ensure_divide(width, patch_size)
         | 
| 154 | 
            +
                    best_height = self.ensure_divide(height, patch_size)
         | 
| 155 | 
            +
                    return (best_width, best_height)
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                def get_refine_size(self, original_size, grid, scale_resolution, patch_size, allow_upscale=False):
         | 
| 158 | 
            +
                    width, height = original_size
         | 
| 159 | 
            +
                    grid_x, grid_y = grid
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                    refine_width = self.ensure_divide(width, grid_x)
         | 
| 162 | 
            +
                    refine_height = self.ensure_divide(height, grid_y)
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                    grid_width = refine_width / grid_x
         | 
| 165 | 
            +
                    grid_height = refine_height / grid_y
         | 
| 166 | 
            +
             | 
| 167 | 
            +
                    best_grid_size = self.find_best_resize(
         | 
| 168 | 
            +
                        (grid_width, grid_height), scale_resolution, patch_size, allow_upscale=allow_upscale
         | 
| 169 | 
            +
                    )
         | 
| 170 | 
            +
                    refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)
         | 
| 171 | 
            +
                    return refine_size
         | 
| 172 | 
            +
             | 
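                # get_refine_size computes the resolution of the refined (to-be-sliced)
                # image: each grid cell is sized with find_best_resize and the result is
                # multiplied back by the grid. Continuing the 1920x1080 example with
                # grid [4, 2] and the defaults above:
                #     get_refine_size((1920, 1080), [4, 2], 448, 14, allow_upscale=True)
                #     ->  (1680, 952)   # i.e. 4 x 2 cells of 420 x 476 each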
| 173 | 
            +
                def split_to_patches(self, image, grid):
         | 
| 174 | 
            +
                    patches = []
         | 
| 175 | 
            +
                    width, height = image.size
         | 
| 176 | 
            +
                    grid_x = int(width / grid[0])
         | 
| 177 | 
            +
                    grid_y = int(height / grid[1])
         | 
| 178 | 
            +
                    for i in range(0, height, grid_y):
         | 
| 179 | 
            +
                        images = []
         | 
| 180 | 
            +
                        for j in range(0, width, grid_x):
         | 
| 181 | 
            +
                            box = (j, i, j + grid_x, i + grid_y)
         | 
| 182 | 
            +
                            patch = image.crop(box)
         | 
| 183 | 
            +
                            images.append(patch)
         | 
| 184 | 
            +
                        patches.append(images)
         | 
| 185 | 
            +
                    return patches
         | 
| 186 | 
            +
             | 
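                # split_to_patches crops the refined image into a row-major list of
                # lists, patches[row][col], each crop being refine_size / grid pixels.
                # Continuing the example, a (1680, 952) image with grid [4, 2] yields
                # 2 rows of 4 patches, each 420 x 476.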
| 187 | 
            +
                def slice_image(self, image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
         | 
| 188 | 
            +
                    original_size = image.size
         | 
| 189 | 
            +
                    source_image = None
         | 
| 190 | 
            +
                    best_grid = self.get_sliced_grid(original_size, max_slice_nums, never_split)
         | 
| 191 | 
            +
                    patches = []
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                    if best_grid is None:
         | 
| 194 | 
            +
                        # no need to slice; just upsample to the target resolution
         | 
| 195 | 
            +
                        best_size = self.find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=True)
         | 
| 196 | 
            +
                        source_image = image.resize(best_size, resample=Image.Resampling.BICUBIC)
         | 
| 197 | 
            +
                    else:
         | 
| 198 | 
            +
                        # source image: down-sample and make both sides divisible by patch_size
         | 
| 199 | 
            +
                        best_resize = self.find_best_resize(original_size, scale_resolution, patch_size)
         | 
| 200 | 
            +
                        source_image = image.copy().resize(best_resize, resample=Image.Resampling.BICUBIC)
         | 
| 201 | 
            +
                        refine_size = self.get_refine_size(
         | 
| 202 | 
            +
                            original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
         | 
| 203 | 
            +
                        )
         | 
| 204 | 
            +
                        refine_image = image.resize(refine_size, resample=Image.Resampling.BICUBIC)
         | 
| 205 | 
            +
                        patches = self.split_to_patches(refine_image, best_grid)
         | 
| 206 | 
            +
             | 
| 207 | 
            +
                    return source_image, patches, best_grid
         | 
| 208 | 
            +
             | 
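                # slice_image ties the helpers together: when get_sliced_grid returns
                # None the image is only resized (upsampling allowed); otherwise it
                # returns a down-sampled overview (source_image), the grid of crops cut
                # from the refined image (patches), and the chosen grid. For 1920x1080
                # with the defaults: source_image is 602x336, best_grid is [4, 2], and
                # patches holds 2 x 4 crops of 420 x 476.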
| 209 | 
            +
                def get_grid_placeholder(self, grid):
         | 
| 210 | 
            +
                    if grid is None:
         | 
| 211 | 
            +
                        return ""
         | 
| 212 | 
            +
                    slice_image_placeholder = (
         | 
| 213 | 
            +
                        self.slice_start_token + self.unk_token * self.image_feature_size + self.slice_end_token
         | 
| 214 | 
            +
                    )
         | 
| 215 | 
            +
             | 
| 216 | 
            +
                    cols = grid[0]
         | 
| 217 | 
            +
                    rows = grid[1]
         | 
| 218 | 
            +
                    slices = []
         | 
| 219 | 
            +
                    for i in range(rows):
         | 
| 220 | 
            +
                        lines = []
         | 
| 221 | 
            +
                        for j in range(cols):
         | 
| 222 | 
            +
                            lines.append(slice_image_placeholder)
         | 
| 223 | 
            +
                        slices.append("".join(lines))
         | 
| 224 | 
            +
             | 
| 225 | 
            +
                    slice_placeholder = "\n".join(slices)
         | 
| 226 | 
            +
                    return slice_placeholder
         | 
| 227 | 
            +
             | 
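                # get_grid_placeholder renders the text-side counterpart of the crops:
                # one "<slice>" + image_feature_size * "<unk>" + "</slice>" placeholder
                # per crop, columns concatenated within a row and rows joined with "\n".
                # For grid [4, 2] that is 2 lines of 4 slice placeholders.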
| 228 | 
            +
                def get_image_id_placeholder(self, idx=0):
         | 
| 229 | 
            +
                    return f"{self.im_id_start}{idx}{self.im_id_end}"
         | 
| 230 | 
            +
             | 
| 231 | 
            +
                def get_sliced_images(self, image, max_slice_nums=None):
         | 
| 232 | 
            +
                    slice_images = []
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                    if not self.slice_mode:
         | 
| 235 | 
            +
                        return [image]
         | 
| 236 | 
            +
             | 
| 237 | 
            +
                    max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
         | 
| 238 | 
            +
                    assert max_slice_nums > 0
         | 
| 239 | 
            +
                    source_image, patches, sliced_grid = self.slice_image(
         | 
| 240 | 
            +
                        image, max_slice_nums, self.scale_resolution, self.patch_size  # defaults: max_slice_nums=9, scale_resolution=448, patch_size=14
         | 
| 241 | 
            +
                    )
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                    slice_images.append(source_image)
         | 
| 244 | 
            +
                    if len(patches) > 0:
         | 
| 245 | 
            +
                        for i in range(len(patches)):
         | 
| 246 | 
            +
                            for j in range(len(patches[0])):
         | 
| 247 | 
            +
                                slice_images.append(patches[i][j])
         | 
| 248 | 
            +
                    return slice_images
         | 
| 249 | 
            +
             | 
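                # get_sliced_images returns the flat list of images actually fed to the
                # vision encoder: [source_image] followed by the crops in row-major
                # order, or just [image] when slice_mode is disabled. The 1920x1080
                # example therefore produces 1 + 8 = 9 images.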
| 250 | 
            +
                def get_sliced_grid(self, image_size, max_slice_nums, never_split=False):
         | 
| 251 | 
            +
                    original_width, original_height = image_size
         | 
| 252 | 
            +
                    log_ratio = math.log(original_width / original_height)
         | 
| 253 | 
            +
                    ratio = original_width * original_height / (self.scale_resolution * self.scale_resolution)
         | 
| 254 | 
            +
                    multiple = min(math.ceil(ratio), max_slice_nums)
         | 
| 255 | 
            +
                    if multiple <= 1 or never_split:
         | 
| 256 | 
            +
                        return None
         | 
| 257 | 
            +
                    candidate_split_grids_nums = []
         | 
| 258 | 
            +
                    for i in [multiple - 1, multiple, multiple + 1]:
         | 
| 259 | 
            +
                        if i == 1 or i > max_slice_nums:
         | 
| 260 | 
            +
                            continue
         | 
| 261 | 
            +
                        candidate_split_grids_nums.append(i)
         | 
| 262 | 
            +
             | 
| 263 | 
            +
                    candidate_grids = []
         | 
| 264 | 
            +
                    for split_grids_nums in candidate_split_grids_nums:
         | 
| 265 | 
            +
                        m = 1
         | 
| 266 | 
            +
                        while m <= split_grids_nums:
         | 
| 267 | 
            +
                            if split_grids_nums % m == 0:
         | 
| 268 | 
            +
                                candidate_grids.append([m, split_grids_nums // m])
         | 
| 269 | 
            +
                            m += 1
         | 
| 270 | 
            +
             | 
| 271 | 
            +
                    best_grid = [1, 1]
         | 
| 272 | 
            +
                    min_error = float("inf")
         | 
| 273 | 
            +
                    for grid in candidate_grids:
         | 
| 274 | 
            +
                        error = abs(log_ratio - math.log(grid[0] / grid[1]))
         | 
| 275 | 
            +
                        if error < min_error:
         | 
| 276 | 
            +
                            best_grid = grid
         | 
| 277 | 
            +
                            min_error = error
         | 
| 278 | 
            +
             | 
| 279 | 
            +
                    return best_grid
         | 
| 280 | 
            +
             | 
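                # get_sliced_grid picks the grid whose aspect ratio is closest (in log
                # space) to the image's, among grids with roughly area / scale_resolution^2
                # cells (capped at max_slice_nums). It returns None when the image already
                # fits the 448^2 budget or never_split is requested. Example:
                #     get_sliced_grid((1920, 1080), 9)  ->  [4, 2]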
| 281 | 
            +
                def get_slice_image_placeholder(self, image_size, image_idx=0, max_slice_nums=None, use_image_id=None):
         | 
| 282 | 
            +
                    max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
         | 
| 283 | 
            +
                    assert max_slice_nums > 0
         | 
| 284 | 
            +
                    grid = self.get_sliced_grid(image_size=image_size, max_slice_nums=max_slice_nums)
         | 
| 285 | 
            +
             | 
| 286 | 
            +
                    image_placeholder = self.im_start_token + self.unk_token * self.image_feature_size + self.im_end_token
         | 
| 287 | 
            +
                    use_image_id = self.use_image_id if use_image_id is None else bool(use_image_id)
         | 
| 288 | 
            +
                    if use_image_id:
         | 
| 289 | 
            +
                        final_placeholder = self.get_image_id_placeholder(image_idx) + image_placeholder
         | 
| 290 | 
            +
                    else:
         | 
| 291 | 
            +
                        final_placeholder = image_placeholder
         | 
| 292 | 
            +
             | 
| 293 | 
            +
                    if self.slice_mode:
         | 
| 294 | 
            +
                        final_placeholder = final_placeholder + self.get_grid_placeholder(grid=grid)
         | 
| 295 | 
            +
                    return final_placeholder
         | 
| 296 | 
            +
             | 
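                # get_slice_image_placeholder builds the full prompt-side placeholder for
                # one image: an optional "<image_id>{idx}</image_id>" prefix, then
                # "<image>" + image_feature_size * "<unk>" + "</image>", then (in slice
                # mode) the grid placeholder produced by get_grid_placeholder.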
| 297 | 
            +
                def to_pil_image(self, image, rescale=None) -> PIL.Image.Image:
         | 
| 298 | 
            +
                    """
         | 
| 299 | 
            +
                    Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
         | 
| 300 | 
            +
                    needed.
         | 
| 301 | 
            +
             | 
| 302 | 
            +
                    Args:
         | 
| 303 | 
            +
                        image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
         | 
| 304 | 
            +
                            The image to convert to the PIL Image format.
         | 
| 305 | 
            +
                        rescale (`bool`, *optional*):
         | 
| 306 | 
            +
                            Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
         | 
| 307 | 
            +
                            default to `True` if the image type is a floating type, `False` otherwise.
         | 
| 308 | 
            +
                    """
         | 
| 309 | 
            +
                    if isinstance(image, PIL.Image.Image):
         | 
| 310 | 
            +
                        return image
         | 
| 311 | 
            +
                    if is_torch_tensor(image):
         | 
| 312 | 
            +
                        image = image.numpy()
         | 
| 313 | 
            +
             | 
| 314 | 
            +
                    if isinstance(image, np.ndarray):
         | 
| 315 | 
            +
                        if rescale is None:
         | 
| 316 | 
            +
                            # rescale defaults to True if the array is of a floating type.
         | 
| 317 | 
            +
                            rescale = isinstance(image.flat[0], np.floating)
         | 
| 318 | 
            +
                        # If the channel has been moved to the first dim, we put it back at the end.
         | 
| 319 | 
            +
                        if image.ndim == 3 and image.shape[0] in [1, 3]:
         | 
| 320 | 
            +
                            image = image.transpose(1, 2, 0)
         | 
| 321 | 
            +
                        if rescale:
         | 
| 322 | 
            +
                            image = image * 255
         | 
| 323 | 
            +
                        image = image.astype(np.uint8)
         | 
| 324 | 
            +
                        return PIL.Image.fromarray(image)
         | 
| 325 | 
            +
                    return image
         | 
| 326 | 
            +
             | 
| 327 | 
            +
                def reshape_by_patch(self, image):
         | 
| 328 | 
            +
                    """
         | 
| 329 | 
            +
                    :param image: shape [3, H, W]
         | 
| 330 | 
            +
                    The patch size is taken from self.patch_size.
         | 
| 331 | 
            +
                    :return: [3, patch_size, HW/patch_size]
         | 
| 332 | 
            +
                    """
         | 
| 333 | 
            +
                    image = torch.from_numpy(image)
         | 
| 334 | 
            +
                    patch_size = self.patch_size
         | 
| 335 | 
            +
                    patches = torch.nn.functional.unfold(image, (patch_size, patch_size), stride=(patch_size, patch_size))
         | 
| 336 | 
            +
             | 
| 337 | 
            +
                    patches = patches.reshape(image.size(0), patch_size, patch_size, -1)
         | 
| 338 | 
            +
                    patches = patches.permute(0, 1, 3, 2).reshape(image.size(0), patch_size, -1)
         | 
| 339 | 
            +
                    return patches.numpy()
         | 
| 340 | 
            +
             | 
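                # reshape_by_patch unfolds each [3, H, W] slice into patch_size x patch_size
                # (14x14 by default) patches and lays them out along the last axis, giving
                # [3, 14, H*W/14]. The 602x336 overview image, for instance, becomes
                # [3, 14, 14448], with a target size of (336 // 14, 602 // 14) = (24, 43).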
| 341 | 
            +
                def preprocess(
         | 
| 342 | 
            +
                    self,
         | 
| 343 | 
            +
                    images: Union[Image.Image, List[Image.Image], List[List[Image.Image]]],
         | 
| 344 | 
            +
                    do_pad: Optional[bool] = True,
         | 
| 345 | 
            +
                    max_slice_nums: Optional[int] = None,
         | 
| 346 | 
            +
                    return_tensors: Optional[Union[str, TensorType]] = None,
         | 
| 347 | 
            +
                    **kwargs,
         | 
| 348 | 
            +
                ) -> MiniCPMOBatchFeature:
         | 
| 349 | 
            +
                    if isinstance(images, Image.Image):
         | 
| 350 | 
            +
                        images_list = [[images]]
         | 
| 351 | 
            +
                    elif isinstance(images[0], Image.Image):
         | 
| 352 | 
            +
                        images_list = [images]
         | 
| 353 | 
            +
                    else:
         | 
| 354 | 
            +
                        images_list = images
         | 
| 355 | 
            +
             | 
| 356 | 
            +
                    new_images_list = []
         | 
| 357 | 
            +
                    image_sizes_list = []
         | 
| 358 | 
            +
                    tgt_sizes_list = []
         | 
| 359 | 
            +
             | 
| 360 | 
            +
                    for _images in images_list:
         | 
| 361 | 
            +
                        if _images is None or len(_images) == 0:
         | 
| 362 | 
            +
                            new_images_list.append([])
         | 
| 363 | 
            +
                            image_sizes_list.append([])
         | 
| 364 | 
            +
                            tgt_sizes_list.append([])
         | 
| 365 | 
            +
                            continue
         | 
| 366 | 
            +
                        if not valid_images(_images):
         | 
| 367 | 
            +
                            raise ValueError(
         | 
| 368 | 
            +
                                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
         | 
| 369 | 
            +
                                "torch.Tensor, tf.Tensor or jax.ndarray."
         | 
| 370 | 
            +
                            )
         | 
| 371 | 
            +
             | 
| 372 | 
            +
                        _images = [self.to_pil_image(image).convert("RGB") for image in _images]
         | 
| 373 | 
            +
                        input_data_format = infer_channel_dimension_format(np.array(_images[0]))
         | 
| 374 | 
            +
             | 
| 375 | 
            +
                        new_images = []
         | 
| 376 | 
            +
                        image_sizes = [image.size for image in _images]
         | 
| 377 | 
            +
                        tgt_sizes = []
         | 
| 378 | 
            +
                        for image in _images:
         | 
| 379 | 
            +
                            image_patches = self.get_sliced_images(image, max_slice_nums)
         | 
| 380 | 
            +
                            image_patches = [to_numpy_array(image).astype(np.float32) / 255 for image in image_patches]
         | 
| 381 | 
            +
                            image_patches = [
         | 
| 382 | 
            +
                                self.normalize(image=image, mean=self.mean, std=self.std, input_data_format=input_data_format)
         | 
| 383 | 
            +
                                for image in image_patches
         | 
| 384 | 
            +
                            ]
         | 
| 385 | 
            +
                            image_patches = [
         | 
| 386 | 
            +
                                to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
         | 
| 387 | 
            +
                                for image in image_patches
         | 
| 388 | 
            +
                            ]
         | 
| 389 | 
            +
                            for slice_image in image_patches:
         | 
| 390 | 
            +
                                new_images.append(self.reshape_by_patch(slice_image))
         | 
| 391 | 
            +
                                tgt_sizes.append(
         | 
| 392 | 
            +
                                    np.array((slice_image.shape[1] // self.patch_size, slice_image.shape[2] // self.patch_size))
         | 
| 393 | 
            +
                                )
         | 
| 394 | 
            +
             | 
| 395 | 
            +
                        if tgt_sizes:
         | 
| 396 | 
            +
                            tgt_sizes = np.vstack(tgt_sizes)
         | 
| 397 | 
            +
             | 
| 398 | 
            +
                        new_images_list.append(new_images)
         | 
| 399 | 
            +
                        image_sizes_list.append(image_sizes)
         | 
| 400 | 
            +
                        tgt_sizes_list.append(tgt_sizes)
         | 
| 401 | 
            +
                    return MiniCPMOBatchFeature(
         | 
| 402 | 
            +
                        data={"pixel_values": new_images_list, "image_sizes": image_sizes_list, "tgt_sizes": tgt_sizes_list},
         | 
| 403 | 
            +
                        tensor_type=return_tensors,
         | 
| 404 | 
            +
                    )
         | 
| 405 | 
            +
             | 
| 406 | 
            +
             | 
| 407 | 
            +
            AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
         | 
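A minimal usage sketch for the image processor registered above. It assumes the checkpoint has been downloaded to a local directory (the path below is a placeholder) and that `transformers` is allowed to run the repository code via `trust_remote_code=True`; the printed values follow the `preprocess` return structure shown above.

    from PIL import Image
    from transformers import AutoImageProcessor

    ckpt_dir = "./OmniRewardModel"  # placeholder: local path to this checkpoint
    image_processor = AutoImageProcessor.from_pretrained(ckpt_dir, trust_remote_code=True)

    image = Image.open("example.jpg").convert("RGB")  # any RGB test image
    batch = image_processor.preprocess(image, return_tensors=None)

    # pixel_values: per image, a list of patch-reshaped slices of shape [3, 14, H*W/14]
    # image_sizes:  original (width, height) of each input image
    # tgt_sizes:    stacked (H // patch_size, W // patch_size) for every slice
    print(len(batch["pixel_values"][0]))   # e.g. 9 slices for a 1920x1080 image
    print(batch["image_sizes"][0])         # e.g. [(1920, 1080)]
    print(batch["tgt_sizes"][0].shape)     # e.g. (9, 2)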
    	
        merges.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        model-00001-of-00004.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a55101a2b4d43e9cf5568cf19af46bdc82dd882bdfc44f888eb7de878cd95a47
         | 
| 3 | 
            +
            size 4958344888
         | 
    	
        model-00002-of-00004.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:6c092cd5ac643890d332e77599743d14c85ade650742c90e2c273f058a0dbbd4
         | 
| 3 | 
            +
            size 4991496392
         | 
    	
        model-00003-of-00004.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:0c23d7d13bdb95eb198f51e90451a0e299cdac8d5a65f9cd2a836e4a2d01aae9
         | 
| 3 | 
            +
            size 4991496344
         | 
    	
        model-00004-of-00004.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:13dd38be4707268813a532a5fb2dfc43861b3d6070f490814146c102fd02081e
         | 
| 3 | 
            +
            size 1905109016
         | 
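The four shard files above are stored as Git LFS pointers: the actual weights are fetched by LFS, and the `oid sha256` / `size` fields can be used to check a download. A small verification sketch (the local filename is assumed to match the repository path):

    import hashlib
    import os

    def verify_lfs_object(path: str, expected_sha256: str, expected_size: int) -> bool:
        """Check a downloaded file against the oid/size recorded in its LFS pointer."""
        if os.path.getsize(path) != expected_size:
            return False
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest() == expected_sha256

    # Values taken from the pointer file of shard 1 of 4 above.
    ok = verify_lfs_object(
        "model-00001-of-00004.safetensors",
        "a55101a2b4d43e9cf5568cf19af46bdc82dd882bdfc44f888eb7de878cd95a47",
        4958344888,
    )
    print(ok)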
    	
        model.safetensors.index.json
    ADDED
    
    | @@ -0,0 +1,1167 @@ | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "metadata": {
         | 
| 3 | 
            +
                "total_size": 16846314976
         | 
| 4 | 
            +
              },
         | 
| 5 | 
            +
              "weight_map": {
         | 
| 6 | 
            +
                "apm.conv1.bias": "model-00001-of-00004.safetensors",
         | 
| 7 | 
            +
                "apm.conv1.weight": "model-00001-of-00004.safetensors",
         | 
| 8 | 
            +
                "apm.conv2.bias": "model-00001-of-00004.safetensors",
         | 
| 9 | 
            +
                "apm.conv2.weight": "model-00001-of-00004.safetensors",
         | 
| 10 | 
            +
                "apm.embed_positions.weight": "model-00001-of-00004.safetensors",
         | 
| 11 | 
            +
                "apm.layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 12 | 
            +
                "apm.layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 13 | 
            +
                "apm.layers.0.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 14 | 
            +
                "apm.layers.0.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 15 | 
            +
                "apm.layers.0.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 16 | 
            +
                "apm.layers.0.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 17 | 
            +
                "apm.layers.0.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 18 | 
            +
                "apm.layers.0.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 19 | 
            +
                "apm.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 20 | 
            +
                "apm.layers.0.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 21 | 
            +
                "apm.layers.0.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 22 | 
            +
                "apm.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 23 | 
            +
                "apm.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 24 | 
            +
                "apm.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 25 | 
            +
                "apm.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 26 | 
            +
                "apm.layers.0.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 27 | 
            +
                "apm.layers.0.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 28 | 
            +
                "apm.layers.1.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 29 | 
            +
                "apm.layers.1.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 30 | 
            +
                "apm.layers.1.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 31 | 
            +
                "apm.layers.1.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 32 | 
            +
                "apm.layers.1.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 33 | 
            +
                "apm.layers.1.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 34 | 
            +
                "apm.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 35 | 
            +
                "apm.layers.1.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 36 | 
            +
                "apm.layers.1.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 37 | 
            +
                "apm.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 38 | 
            +
                "apm.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 39 | 
            +
                "apm.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 40 | 
            +
                "apm.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 41 | 
            +
                "apm.layers.1.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 42 | 
            +
                "apm.layers.1.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 43 | 
            +
                "apm.layers.10.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 44 | 
            +
                "apm.layers.10.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 45 | 
            +
                "apm.layers.10.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 46 | 
            +
                "apm.layers.10.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 47 | 
            +
                "apm.layers.10.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 48 | 
            +
                "apm.layers.10.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 49 | 
            +
                "apm.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 50 | 
            +
                "apm.layers.10.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 51 | 
            +
                "apm.layers.10.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 52 | 
            +
                "apm.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 53 | 
            +
                "apm.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 54 | 
            +
                "apm.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 55 | 
            +
                "apm.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 56 | 
            +
                "apm.layers.10.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 57 | 
            +
                "apm.layers.10.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 58 | 
            +
                "apm.layers.11.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 59 | 
            +
                "apm.layers.11.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 60 | 
            +
                "apm.layers.11.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 61 | 
            +
                "apm.layers.11.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 62 | 
            +
                "apm.layers.11.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 63 | 
            +
                "apm.layers.11.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 64 | 
            +
                "apm.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 65 | 
            +
                "apm.layers.11.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 66 | 
            +
                "apm.layers.11.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 67 | 
            +
                "apm.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 68 | 
            +
                "apm.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 69 | 
            +
                "apm.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 70 | 
            +
                "apm.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 71 | 
            +
                "apm.layers.11.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 72 | 
            +
                "apm.layers.11.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 73 | 
            +
                "apm.layers.12.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 74 | 
            +
                "apm.layers.12.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 75 | 
            +
                "apm.layers.12.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 76 | 
            +
                "apm.layers.12.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 77 | 
            +
                "apm.layers.12.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 78 | 
            +
                "apm.layers.12.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 79 | 
            +
                "apm.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 80 | 
            +
                "apm.layers.12.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 81 | 
            +
                "apm.layers.12.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 82 | 
            +
                "apm.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 83 | 
            +
                "apm.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 84 | 
            +
                "apm.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 85 | 
            +
                "apm.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 86 | 
            +
                "apm.layers.12.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 87 | 
            +
                "apm.layers.12.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 88 | 
            +
                "apm.layers.13.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 89 | 
            +
                "apm.layers.13.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 90 | 
            +
                "apm.layers.13.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 91 | 
            +
                "apm.layers.13.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 92 | 
            +
                "apm.layers.13.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 93 | 
            +
                "apm.layers.13.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 94 | 
            +
                "apm.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 95 | 
            +
                "apm.layers.13.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 96 | 
            +
                "apm.layers.13.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 97 | 
            +
                "apm.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 98 | 
            +
                "apm.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 99 | 
            +
                "apm.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 100 | 
            +
                "apm.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 101 | 
            +
                "apm.layers.13.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 102 | 
            +
                "apm.layers.13.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 103 | 
            +
                "apm.layers.14.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 104 | 
            +
                "apm.layers.14.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 105 | 
            +
                "apm.layers.14.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 106 | 
            +
                "apm.layers.14.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 107 | 
            +
                "apm.layers.14.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 108 | 
            +
                "apm.layers.14.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 109 | 
            +
                "apm.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 110 | 
            +
                "apm.layers.14.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 111 | 
            +
                "apm.layers.14.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 112 | 
            +
                "apm.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 113 | 
            +
                "apm.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 114 | 
            +
                "apm.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 115 | 
            +
                "apm.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 116 | 
            +
                "apm.layers.14.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 117 | 
            +
                "apm.layers.14.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 118 | 
            +
                "apm.layers.15.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 119 | 
            +
                "apm.layers.15.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 120 | 
            +
                "apm.layers.15.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 121 | 
            +
                "apm.layers.15.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 122 | 
            +
                "apm.layers.15.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 123 | 
            +
                "apm.layers.15.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 124 | 
            +
                "apm.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 125 | 
            +
                "apm.layers.15.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 126 | 
            +
                "apm.layers.15.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 127 | 
            +
                "apm.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 128 | 
            +
                "apm.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 129 | 
            +
                "apm.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 130 | 
            +
                "apm.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 131 | 
            +
                "apm.layers.15.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 132 | 
            +
                "apm.layers.15.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 133 | 
            +
                "apm.layers.16.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 134 | 
            +
                "apm.layers.16.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 135 | 
            +
                "apm.layers.16.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 136 | 
            +
                "apm.layers.16.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 137 | 
            +
                "apm.layers.16.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 138 | 
            +
                "apm.layers.16.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 139 | 
            +
                "apm.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 140 | 
            +
                "apm.layers.16.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 141 | 
            +
                "apm.layers.16.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 142 | 
            +
                "apm.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 143 | 
            +
                "apm.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 144 | 
            +
                "apm.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 145 | 
            +
                "apm.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 146 | 
            +
                "apm.layers.16.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 147 | 
            +
                "apm.layers.16.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 148 | 
            +
                "apm.layers.17.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 149 | 
            +
                "apm.layers.17.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 150 | 
            +
                "apm.layers.17.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 151 | 
            +
                "apm.layers.17.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 152 | 
            +
                "apm.layers.17.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 153 | 
            +
                "apm.layers.17.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 154 | 
            +
                "apm.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 155 | 
            +
                "apm.layers.17.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 156 | 
            +
                "apm.layers.17.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 157 | 
            +
                "apm.layers.17.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 158 | 
            +
                "apm.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 159 | 
            +
                "apm.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 160 | 
            +
                "apm.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 161 | 
            +
                "apm.layers.17.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 162 | 
            +
                "apm.layers.17.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 163 | 
            +
                "apm.layers.18.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 164 | 
            +
                "apm.layers.18.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 165 | 
            +
                "apm.layers.18.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 166 | 
            +
                "apm.layers.18.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 167 | 
            +
                "apm.layers.18.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 168 | 
            +
                "apm.layers.18.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 169 | 
            +
                "apm.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 170 | 
            +
                "apm.layers.18.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 171 | 
            +
                "apm.layers.18.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 172 | 
            +
                "apm.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 173 | 
            +
                "apm.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 174 | 
            +
                "apm.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 175 | 
            +
                "apm.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 176 | 
            +
                "apm.layers.18.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 177 | 
            +
                "apm.layers.18.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 178 | 
            +
                "apm.layers.19.fc1.bias": "model-00001-of-00004.safetensors",
         | 
| 179 | 
            +
                "apm.layers.19.fc1.weight": "model-00001-of-00004.safetensors",
         | 
| 180 | 
            +
                "apm.layers.19.fc2.bias": "model-00001-of-00004.safetensors",
         | 
| 181 | 
            +
                "apm.layers.19.fc2.weight": "model-00001-of-00004.safetensors",
         | 
| 182 | 
            +
                "apm.layers.19.final_layer_norm.bias": "model-00001-of-00004.safetensors",
         | 
| 183 | 
            +
                "apm.layers.19.final_layer_norm.weight": "model-00001-of-00004.safetensors",
         | 
| 184 | 
            +
                "apm.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 185 | 
            +
                "apm.layers.19.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
         | 
| 186 | 
            +
                "apm.layers.19.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
         | 
| 187 | 
+    "apm.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.19.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.19.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.2.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.2.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.2.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.2.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.20.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.20.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.20.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.20.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.21.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.21.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.21.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.21.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.22.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.22.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.22.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.22.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.23.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.23.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.23.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.23.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.3.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.3.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.3.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.3.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.4.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.4.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.4.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.4.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.5.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.5.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.5.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.5.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.6.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.6.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.6.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.6.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.7.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.7.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.7.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.7.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.8.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.8.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.8.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.8.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.fc1.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.9.fc1.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.fc2.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.9.fc2.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.final_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.9.final_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn_layer_norm.bias": "model-00001-of-00004.safetensors",
+    "apm.layers.9.self_attn_layer_norm.weight": "model-00001-of-00004.safetensors",
+    "audio_projection_layer.linear1.bias": "model-00001-of-00004.safetensors",
+    "audio_projection_layer.linear1.weight": "model-00001-of-00004.safetensors",
+    "audio_projection_layer.linear2.bias": "model-00001-of-00004.safetensors",
+    "audio_projection_layer.linear2.weight": "model-00001-of-00004.safetensors",
+    "llm.lm_head.weight": "model-00001-of-00004.safetensors",
+    "llm.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "llm.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.12.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "llm.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.2.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "llm.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.22.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "llm.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "llm.model.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.7.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "llm.model.layers.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
| 682 | 
            +
                "llm.model.layers.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
         | 
| 683 | 
            +
                "llm.model.layers.7.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
         | 
| 684 | 
            +
                "llm.model.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
         | 
| 685 | 
            +
                "llm.model.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
         | 
| 686 | 
            +
                "llm.model.layers.7.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
         | 
| 687 | 
            +
                "llm.model.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
         | 
| 688 | 
            +
                "llm.model.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
         | 
| 689 | 
            +
                "llm.model.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
         | 
| 690 | 
            +
                "llm.model.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
         | 
| 691 | 
            +
                "llm.model.layers.8.input_layernorm.weight": "model-00003-of-00004.safetensors",
         | 
| 692 | 
            +
                "llm.model.layers.8.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 693 | 
            +
                "llm.model.layers.8.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 694 | 
            +
                "llm.model.layers.8.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 695 | 
            +
                "llm.model.layers.8.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
         | 
| 696 | 
            +
                "llm.model.layers.8.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 697 | 
            +
                "llm.model.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 698 | 
            +
                "llm.model.layers.8.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 699 | 
            +
                "llm.model.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 700 | 
            +
                "llm.model.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 701 | 
            +
                "llm.model.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 702 | 
            +
                "llm.model.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 703 | 
            +
                "llm.model.layers.9.input_layernorm.weight": "model-00004-of-00004.safetensors",
         | 
| 704 | 
            +
                "llm.model.layers.9.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 705 | 
            +
                "llm.model.layers.9.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 706 | 
            +
                "llm.model.layers.9.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 707 | 
            +
                "llm.model.layers.9.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
         | 
| 708 | 
            +
                "llm.model.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 709 | 
            +
                "llm.model.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 710 | 
            +
                "llm.model.layers.9.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 711 | 
            +
                "llm.model.layers.9.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 712 | 
            +
                "llm.model.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 713 | 
            +
                "llm.model.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 714 | 
            +
                "llm.model.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 715 | 
            +
                "llm.model.norm.weight": "model-00004-of-00004.safetensors",
         | 
| 716 | 
            +
                "resampler.attn.in_proj_bias": "model-00004-of-00004.safetensors",
         | 
| 717 | 
            +
                "resampler.attn.in_proj_weight": "model-00004-of-00004.safetensors",
         | 
| 718 | 
            +
                "resampler.attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 719 | 
            +
                "resampler.attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 720 | 
            +
                "resampler.kv_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 721 | 
            +
                "resampler.ln_kv.bias": "model-00004-of-00004.safetensors",
         | 
| 722 | 
            +
                "resampler.ln_kv.weight": "model-00004-of-00004.safetensors",
         | 
| 723 | 
            +
                "resampler.ln_post.bias": "model-00004-of-00004.safetensors",
         | 
| 724 | 
            +
                "resampler.ln_post.weight": "model-00004-of-00004.safetensors",
         | 
| 725 | 
            +
                "resampler.ln_q.bias": "model-00004-of-00004.safetensors",
         | 
| 726 | 
            +
                "resampler.ln_q.weight": "model-00004-of-00004.safetensors",
         | 
| 727 | 
            +
                "resampler.proj": "model-00004-of-00004.safetensors",
         | 
| 728 | 
            +
                "resampler.query": "model-00004-of-00004.safetensors",
         | 
| 729 | 
            +
                "vpm.embeddings.patch_embedding.bias": "model-00004-of-00004.safetensors",
         | 
| 730 | 
            +
                "vpm.embeddings.patch_embedding.weight": "model-00004-of-00004.safetensors",
         | 
| 731 | 
            +
                "vpm.embeddings.position_embedding.weight": "model-00004-of-00004.safetensors",
         | 
| 732 | 
            +
                "vpm.encoder.layers.0.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 733 | 
            +
                "vpm.encoder.layers.0.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 734 | 
            +
                "vpm.encoder.layers.0.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 735 | 
            +
                "vpm.encoder.layers.0.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 736 | 
            +
                "vpm.encoder.layers.0.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 737 | 
            +
                "vpm.encoder.layers.0.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 738 | 
            +
                "vpm.encoder.layers.0.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 739 | 
            +
                "vpm.encoder.layers.0.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 740 | 
            +
                "vpm.encoder.layers.0.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 741 | 
            +
                "vpm.encoder.layers.0.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 742 | 
            +
                "vpm.encoder.layers.0.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 743 | 
            +
                "vpm.encoder.layers.0.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 744 | 
            +
                "vpm.encoder.layers.0.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 745 | 
            +
                "vpm.encoder.layers.0.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 746 | 
            +
                "vpm.encoder.layers.0.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 747 | 
            +
                "vpm.encoder.layers.0.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 748 | 
            +
                "vpm.encoder.layers.1.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 749 | 
            +
                "vpm.encoder.layers.1.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 750 | 
            +
                "vpm.encoder.layers.1.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 751 | 
            +
                "vpm.encoder.layers.1.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 752 | 
            +
                "vpm.encoder.layers.1.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 753 | 
            +
                "vpm.encoder.layers.1.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 754 | 
            +
                "vpm.encoder.layers.1.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 755 | 
            +
                "vpm.encoder.layers.1.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 756 | 
            +
                "vpm.encoder.layers.1.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 757 | 
            +
                "vpm.encoder.layers.1.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 758 | 
            +
                "vpm.encoder.layers.1.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 759 | 
            +
                "vpm.encoder.layers.1.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 760 | 
            +
                "vpm.encoder.layers.1.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 761 | 
            +
                "vpm.encoder.layers.1.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 762 | 
            +
                "vpm.encoder.layers.1.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 763 | 
            +
                "vpm.encoder.layers.1.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 764 | 
            +
                "vpm.encoder.layers.10.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 765 | 
            +
                "vpm.encoder.layers.10.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 766 | 
            +
                "vpm.encoder.layers.10.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 767 | 
            +
                "vpm.encoder.layers.10.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 768 | 
            +
                "vpm.encoder.layers.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 769 | 
            +
                "vpm.encoder.layers.10.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 770 | 
            +
                "vpm.encoder.layers.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 771 | 
            +
                "vpm.encoder.layers.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 772 | 
            +
                "vpm.encoder.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 773 | 
            +
                "vpm.encoder.layers.10.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 774 | 
            +
                "vpm.encoder.layers.10.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 775 | 
            +
                "vpm.encoder.layers.10.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 776 | 
            +
                "vpm.encoder.layers.10.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 777 | 
            +
                "vpm.encoder.layers.10.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 778 | 
            +
                "vpm.encoder.layers.10.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 779 | 
            +
                "vpm.encoder.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 780 | 
            +
                "vpm.encoder.layers.11.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 781 | 
            +
                "vpm.encoder.layers.11.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 782 | 
            +
                "vpm.encoder.layers.11.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 783 | 
            +
                "vpm.encoder.layers.11.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 784 | 
            +
                "vpm.encoder.layers.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 785 | 
            +
                "vpm.encoder.layers.11.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 786 | 
            +
                "vpm.encoder.layers.11.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 787 | 
            +
                "vpm.encoder.layers.11.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 788 | 
            +
                "vpm.encoder.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 789 | 
            +
                "vpm.encoder.layers.11.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 790 | 
            +
                "vpm.encoder.layers.11.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 791 | 
            +
                "vpm.encoder.layers.11.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 792 | 
            +
                "vpm.encoder.layers.11.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 793 | 
            +
                "vpm.encoder.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 794 | 
            +
                "vpm.encoder.layers.11.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 795 | 
            +
                "vpm.encoder.layers.11.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 796 | 
            +
                "vpm.encoder.layers.12.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 797 | 
            +
                "vpm.encoder.layers.12.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 798 | 
            +
                "vpm.encoder.layers.12.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 799 | 
            +
                "vpm.encoder.layers.12.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 800 | 
            +
                "vpm.encoder.layers.12.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 801 | 
            +
                "vpm.encoder.layers.12.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 802 | 
            +
                "vpm.encoder.layers.12.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 803 | 
            +
                "vpm.encoder.layers.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 804 | 
            +
                "vpm.encoder.layers.12.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 805 | 
            +
                "vpm.encoder.layers.12.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 806 | 
            +
                "vpm.encoder.layers.12.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 807 | 
            +
                "vpm.encoder.layers.12.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 808 | 
            +
                "vpm.encoder.layers.12.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 809 | 
            +
                "vpm.encoder.layers.12.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 810 | 
            +
                "vpm.encoder.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 811 | 
            +
                "vpm.encoder.layers.12.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 812 | 
            +
                "vpm.encoder.layers.13.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 813 | 
            +
                "vpm.encoder.layers.13.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 814 | 
            +
                "vpm.encoder.layers.13.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 815 | 
            +
                "vpm.encoder.layers.13.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 816 | 
            +
                "vpm.encoder.layers.13.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 817 | 
            +
                "vpm.encoder.layers.13.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 818 | 
            +
                "vpm.encoder.layers.13.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 819 | 
            +
                "vpm.encoder.layers.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 820 | 
            +
                "vpm.encoder.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 821 | 
            +
                "vpm.encoder.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 822 | 
            +
                "vpm.encoder.layers.13.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 823 | 
            +
                "vpm.encoder.layers.13.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 824 | 
            +
                "vpm.encoder.layers.13.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 825 | 
            +
                "vpm.encoder.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 826 | 
            +
                "vpm.encoder.layers.13.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 827 | 
            +
                "vpm.encoder.layers.13.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 828 | 
            +
                "vpm.encoder.layers.14.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 829 | 
            +
                "vpm.encoder.layers.14.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 830 | 
            +
                "vpm.encoder.layers.14.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 831 | 
            +
                "vpm.encoder.layers.14.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 832 | 
            +
                "vpm.encoder.layers.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 833 | 
            +
                "vpm.encoder.layers.14.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 834 | 
            +
                "vpm.encoder.layers.14.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 835 | 
            +
                "vpm.encoder.layers.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 836 | 
            +
                "vpm.encoder.layers.14.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 837 | 
            +
                "vpm.encoder.layers.14.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 838 | 
            +
                "vpm.encoder.layers.14.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 839 | 
            +
                "vpm.encoder.layers.14.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 840 | 
            +
                "vpm.encoder.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 841 | 
            +
                "vpm.encoder.layers.14.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 842 | 
            +
                "vpm.encoder.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 843 | 
            +
                "vpm.encoder.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 844 | 
            +
                "vpm.encoder.layers.15.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 845 | 
            +
                "vpm.encoder.layers.15.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 846 | 
            +
                "vpm.encoder.layers.15.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 847 | 
            +
                "vpm.encoder.layers.15.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 848 | 
            +
                "vpm.encoder.layers.15.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 849 | 
            +
                "vpm.encoder.layers.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 850 | 
            +
                "vpm.encoder.layers.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 851 | 
            +
                "vpm.encoder.layers.15.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 852 | 
            +
                "vpm.encoder.layers.15.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 853 | 
            +
                "vpm.encoder.layers.15.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 854 | 
            +
                "vpm.encoder.layers.15.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 855 | 
            +
                "vpm.encoder.layers.15.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 856 | 
            +
                "vpm.encoder.layers.15.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 857 | 
            +
                "vpm.encoder.layers.15.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 858 | 
            +
                "vpm.encoder.layers.15.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 859 | 
            +
                "vpm.encoder.layers.15.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 860 | 
            +
                "vpm.encoder.layers.16.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 861 | 
            +
                "vpm.encoder.layers.16.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 862 | 
            +
                "vpm.encoder.layers.16.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 863 | 
            +
                "vpm.encoder.layers.16.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 864 | 
            +
                "vpm.encoder.layers.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 865 | 
            +
                "vpm.encoder.layers.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 866 | 
            +
                "vpm.encoder.layers.16.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 867 | 
            +
                "vpm.encoder.layers.16.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 868 | 
            +
                "vpm.encoder.layers.16.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 869 | 
            +
                "vpm.encoder.layers.16.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 870 | 
            +
                "vpm.encoder.layers.16.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 871 | 
            +
                "vpm.encoder.layers.16.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 872 | 
            +
                "vpm.encoder.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 873 | 
            +
                "vpm.encoder.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 874 | 
            +
                "vpm.encoder.layers.16.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 875 | 
            +
                "vpm.encoder.layers.16.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 876 | 
            +
                "vpm.encoder.layers.17.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 877 | 
            +
                "vpm.encoder.layers.17.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 878 | 
            +
                "vpm.encoder.layers.17.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 879 | 
            +
                "vpm.encoder.layers.17.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 880 | 
            +
                "vpm.encoder.layers.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 881 | 
            +
                "vpm.encoder.layers.17.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 882 | 
            +
                "vpm.encoder.layers.17.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 883 | 
            +
                "vpm.encoder.layers.17.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 884 | 
            +
                "vpm.encoder.layers.17.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 885 | 
            +
                "vpm.encoder.layers.17.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 886 | 
            +
                "vpm.encoder.layers.17.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 887 | 
            +
                "vpm.encoder.layers.17.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 888 | 
            +
                "vpm.encoder.layers.17.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 889 | 
            +
                "vpm.encoder.layers.17.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 890 | 
            +
                "vpm.encoder.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 891 | 
            +
                "vpm.encoder.layers.17.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 892 | 
            +
                "vpm.encoder.layers.18.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 893 | 
            +
                "vpm.encoder.layers.18.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 894 | 
            +
                "vpm.encoder.layers.18.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 895 | 
            +
                "vpm.encoder.layers.18.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 896 | 
            +
                "vpm.encoder.layers.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 897 | 
            +
                "vpm.encoder.layers.18.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 898 | 
            +
                "vpm.encoder.layers.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 899 | 
            +
                "vpm.encoder.layers.18.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 900 | 
            +
                "vpm.encoder.layers.18.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 901 | 
            +
                "vpm.encoder.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 902 | 
            +
                "vpm.encoder.layers.18.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 903 | 
            +
                "vpm.encoder.layers.18.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 904 | 
            +
                "vpm.encoder.layers.18.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 905 | 
            +
                "vpm.encoder.layers.18.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 906 | 
            +
                "vpm.encoder.layers.18.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 907 | 
            +
                "vpm.encoder.layers.18.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 908 | 
            +
                "vpm.encoder.layers.19.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 909 | 
            +
                "vpm.encoder.layers.19.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 910 | 
            +
                "vpm.encoder.layers.19.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 911 | 
            +
                "vpm.encoder.layers.19.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 912 | 
            +
                "vpm.encoder.layers.19.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 913 | 
            +
                "vpm.encoder.layers.19.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 914 | 
            +
                "vpm.encoder.layers.19.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 915 | 
            +
                "vpm.encoder.layers.19.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 916 | 
            +
                "vpm.encoder.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 917 | 
            +
                "vpm.encoder.layers.19.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 918 | 
            +
                "vpm.encoder.layers.19.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 919 | 
            +
                "vpm.encoder.layers.19.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 920 | 
            +
                "vpm.encoder.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 921 | 
            +
                "vpm.encoder.layers.19.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 922 | 
            +
                "vpm.encoder.layers.19.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 923 | 
            +
                "vpm.encoder.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 924 | 
            +
                "vpm.encoder.layers.2.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 925 | 
            +
                "vpm.encoder.layers.2.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 926 | 
            +
                "vpm.encoder.layers.2.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 927 | 
            +
                "vpm.encoder.layers.2.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 928 | 
            +
                "vpm.encoder.layers.2.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 929 | 
            +
                "vpm.encoder.layers.2.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 930 | 
            +
                "vpm.encoder.layers.2.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 931 | 
            +
                "vpm.encoder.layers.2.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 932 | 
            +
                "vpm.encoder.layers.2.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 933 | 
            +
                "vpm.encoder.layers.2.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 934 | 
            +
                "vpm.encoder.layers.2.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 935 | 
            +
                "vpm.encoder.layers.2.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 936 | 
            +
                "vpm.encoder.layers.2.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 937 | 
            +
                "vpm.encoder.layers.2.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 938 | 
            +
                "vpm.encoder.layers.2.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 939 | 
            +
                "vpm.encoder.layers.2.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 940 | 
            +
                "vpm.encoder.layers.20.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 941 | 
            +
                "vpm.encoder.layers.20.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 942 | 
            +
                "vpm.encoder.layers.20.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 943 | 
            +
                "vpm.encoder.layers.20.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 944 | 
            +
                "vpm.encoder.layers.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 945 | 
            +
                "vpm.encoder.layers.20.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 946 | 
            +
                "vpm.encoder.layers.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 947 | 
            +
                "vpm.encoder.layers.20.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 948 | 
            +
                "vpm.encoder.layers.20.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 949 | 
            +
                "vpm.encoder.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 950 | 
            +
                "vpm.encoder.layers.20.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 951 | 
            +
                "vpm.encoder.layers.20.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 952 | 
            +
                "vpm.encoder.layers.20.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 953 | 
            +
                "vpm.encoder.layers.20.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 954 | 
            +
                "vpm.encoder.layers.20.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 955 | 
            +
                "vpm.encoder.layers.20.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 956 | 
            +
                "vpm.encoder.layers.21.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 957 | 
            +
                "vpm.encoder.layers.21.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 958 | 
            +
                "vpm.encoder.layers.21.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 959 | 
            +
                "vpm.encoder.layers.21.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 960 | 
            +
                "vpm.encoder.layers.21.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 961 | 
            +
                "vpm.encoder.layers.21.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 962 | 
            +
                "vpm.encoder.layers.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 963 | 
            +
                "vpm.encoder.layers.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 964 | 
            +
                "vpm.encoder.layers.21.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 965 | 
            +
                "vpm.encoder.layers.21.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 966 | 
            +
                "vpm.encoder.layers.21.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 967 | 
            +
                "vpm.encoder.layers.21.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 968 | 
            +
                "vpm.encoder.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 969 | 
            +
                "vpm.encoder.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 970 | 
            +
                "vpm.encoder.layers.21.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 971 | 
            +
                "vpm.encoder.layers.21.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 972 | 
            +
                "vpm.encoder.layers.22.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 973 | 
            +
                "vpm.encoder.layers.22.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 974 | 
            +
                "vpm.encoder.layers.22.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 975 | 
            +
                "vpm.encoder.layers.22.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 976 | 
            +
                "vpm.encoder.layers.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 977 | 
            +
                "vpm.encoder.layers.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 978 | 
            +
                "vpm.encoder.layers.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 979 | 
            +
                "vpm.encoder.layers.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 980 | 
            +
                "vpm.encoder.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 981 | 
            +
                "vpm.encoder.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 982 | 
            +
                "vpm.encoder.layers.22.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 983 | 
            +
                "vpm.encoder.layers.22.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 984 | 
            +
                "vpm.encoder.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 985 | 
            +
                "vpm.encoder.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 986 | 
            +
                "vpm.encoder.layers.22.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 987 | 
            +
                "vpm.encoder.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 988 | 
            +
                "vpm.encoder.layers.23.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 989 | 
            +
                "vpm.encoder.layers.23.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 990 | 
            +
                "vpm.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 991 | 
            +
                "vpm.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 992 | 
            +
                "vpm.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 993 | 
            +
                "vpm.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 994 | 
            +
                "vpm.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 995 | 
            +
                "vpm.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 996 | 
            +
                "vpm.encoder.layers.23.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 997 | 
            +
                "vpm.encoder.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 998 | 
            +
                "vpm.encoder.layers.23.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 999 | 
            +
                "vpm.encoder.layers.23.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1000 | 
            +
                "vpm.encoder.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1001 | 
            +
                "vpm.encoder.layers.23.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1002 | 
            +
                "vpm.encoder.layers.23.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1003 | 
            +
                "vpm.encoder.layers.23.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1004 | 
            +
                "vpm.encoder.layers.24.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1005 | 
            +
                "vpm.encoder.layers.24.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1006 | 
            +
                "vpm.encoder.layers.24.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1007 | 
            +
                "vpm.encoder.layers.24.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1008 | 
            +
                "vpm.encoder.layers.24.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1009 | 
            +
                "vpm.encoder.layers.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1010 | 
            +
                "vpm.encoder.layers.24.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1011 | 
            +
                "vpm.encoder.layers.24.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1012 | 
            +
                "vpm.encoder.layers.24.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1013 | 
            +
                "vpm.encoder.layers.24.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1014 | 
            +
                "vpm.encoder.layers.24.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1015 | 
            +
                "vpm.encoder.layers.24.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1016 | 
            +
                "vpm.encoder.layers.24.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1017 | 
            +
                "vpm.encoder.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1018 | 
            +
                "vpm.encoder.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1019 | 
            +
                "vpm.encoder.layers.24.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1020 | 
            +
                "vpm.encoder.layers.25.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1021 | 
            +
                "vpm.encoder.layers.25.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1022 | 
            +
                "vpm.encoder.layers.25.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1023 | 
            +
                "vpm.encoder.layers.25.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1024 | 
            +
                "vpm.encoder.layers.25.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1025 | 
            +
                "vpm.encoder.layers.25.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1026 | 
            +
                "vpm.encoder.layers.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1027 | 
            +
                "vpm.encoder.layers.25.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1028 | 
            +
                "vpm.encoder.layers.25.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1029 | 
            +
                "vpm.encoder.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1030 | 
            +
                "vpm.encoder.layers.25.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1031 | 
            +
                "vpm.encoder.layers.25.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1032 | 
            +
                "vpm.encoder.layers.25.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1033 | 
            +
                "vpm.encoder.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1034 | 
            +
                "vpm.encoder.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1035 | 
            +
                "vpm.encoder.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1036 | 
            +
                "vpm.encoder.layers.26.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1037 | 
            +
                "vpm.encoder.layers.26.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1038 | 
            +
                "vpm.encoder.layers.26.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1039 | 
            +
                "vpm.encoder.layers.26.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1040 | 
            +
                "vpm.encoder.layers.26.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1041 | 
            +
                "vpm.encoder.layers.26.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1042 | 
            +
                "vpm.encoder.layers.26.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1043 | 
            +
                "vpm.encoder.layers.26.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1044 | 
            +
                "vpm.encoder.layers.26.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1045 | 
            +
                "vpm.encoder.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1046 | 
            +
                "vpm.encoder.layers.26.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1047 | 
            +
                "vpm.encoder.layers.26.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1048 | 
            +
                "vpm.encoder.layers.26.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1049 | 
            +
                "vpm.encoder.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1050 | 
            +
                "vpm.encoder.layers.26.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1051 | 
            +
                "vpm.encoder.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1052 | 
            +
                "vpm.encoder.layers.3.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1053 | 
            +
                "vpm.encoder.layers.3.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1054 | 
            +
                "vpm.encoder.layers.3.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1055 | 
            +
                "vpm.encoder.layers.3.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1056 | 
            +
                "vpm.encoder.layers.3.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1057 | 
            +
                "vpm.encoder.layers.3.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1058 | 
            +
                "vpm.encoder.layers.3.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1059 | 
            +
                "vpm.encoder.layers.3.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1060 | 
            +
                "vpm.encoder.layers.3.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1061 | 
            +
                "vpm.encoder.layers.3.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1062 | 
            +
                "vpm.encoder.layers.3.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1063 | 
            +
                "vpm.encoder.layers.3.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1064 | 
            +
                "vpm.encoder.layers.3.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1065 | 
            +
                "vpm.encoder.layers.3.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1066 | 
            +
                "vpm.encoder.layers.3.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1067 | 
            +
                "vpm.encoder.layers.3.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1068 | 
            +
                "vpm.encoder.layers.4.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1069 | 
            +
                "vpm.encoder.layers.4.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1070 | 
            +
                "vpm.encoder.layers.4.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1071 | 
            +
                "vpm.encoder.layers.4.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1072 | 
            +
                "vpm.encoder.layers.4.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1073 | 
            +
                "vpm.encoder.layers.4.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1074 | 
            +
                "vpm.encoder.layers.4.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1075 | 
            +
                "vpm.encoder.layers.4.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1076 | 
            +
                "vpm.encoder.layers.4.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1077 | 
            +
                "vpm.encoder.layers.4.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1078 | 
            +
                "vpm.encoder.layers.4.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1079 | 
            +
                "vpm.encoder.layers.4.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1080 | 
            +
                "vpm.encoder.layers.4.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1081 | 
            +
                "vpm.encoder.layers.4.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1082 | 
            +
                "vpm.encoder.layers.4.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1083 | 
            +
                "vpm.encoder.layers.4.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1084 | 
            +
                "vpm.encoder.layers.5.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1085 | 
            +
                "vpm.encoder.layers.5.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1086 | 
            +
                "vpm.encoder.layers.5.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1087 | 
            +
                "vpm.encoder.layers.5.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1088 | 
            +
                "vpm.encoder.layers.5.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1089 | 
            +
                "vpm.encoder.layers.5.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1090 | 
            +
                "vpm.encoder.layers.5.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1091 | 
            +
                "vpm.encoder.layers.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1092 | 
            +
                "vpm.encoder.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1093 | 
            +
                "vpm.encoder.layers.5.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1094 | 
            +
                "vpm.encoder.layers.5.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1095 | 
            +
                "vpm.encoder.layers.5.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1096 | 
            +
                "vpm.encoder.layers.5.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1097 | 
            +
                "vpm.encoder.layers.5.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1098 | 
            +
                "vpm.encoder.layers.5.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1099 | 
            +
                "vpm.encoder.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1100 | 
            +
                "vpm.encoder.layers.6.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1101 | 
            +
                "vpm.encoder.layers.6.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1102 | 
            +
                "vpm.encoder.layers.6.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1103 | 
            +
                "vpm.encoder.layers.6.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1104 | 
            +
                "vpm.encoder.layers.6.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1105 | 
            +
                "vpm.encoder.layers.6.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1106 | 
            +
                "vpm.encoder.layers.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1107 | 
            +
                "vpm.encoder.layers.6.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1108 | 
            +
                "vpm.encoder.layers.6.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1109 | 
            +
                "vpm.encoder.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1110 | 
            +
                "vpm.encoder.layers.6.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1111 | 
            +
                "vpm.encoder.layers.6.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1112 | 
            +
                "vpm.encoder.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1113 | 
            +
                "vpm.encoder.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1114 | 
            +
                "vpm.encoder.layers.6.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1115 | 
            +
                "vpm.encoder.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1116 | 
            +
                "vpm.encoder.layers.7.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1117 | 
            +
                "vpm.encoder.layers.7.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1118 | 
            +
                "vpm.encoder.layers.7.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1119 | 
            +
                "vpm.encoder.layers.7.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1120 | 
            +
                "vpm.encoder.layers.7.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1121 | 
            +
                "vpm.encoder.layers.7.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1122 | 
            +
                "vpm.encoder.layers.7.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1123 | 
            +
                "vpm.encoder.layers.7.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1124 | 
            +
                "vpm.encoder.layers.7.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1125 | 
            +
                "vpm.encoder.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1126 | 
            +
                "vpm.encoder.layers.7.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1127 | 
            +
                "vpm.encoder.layers.7.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1128 | 
            +
                "vpm.encoder.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1129 | 
            +
                "vpm.encoder.layers.7.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1130 | 
            +
                "vpm.encoder.layers.7.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1131 | 
            +
                "vpm.encoder.layers.7.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1132 | 
            +
                "vpm.encoder.layers.8.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1133 | 
            +
                "vpm.encoder.layers.8.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1134 | 
            +
                "vpm.encoder.layers.8.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1135 | 
            +
                "vpm.encoder.layers.8.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1136 | 
            +
                "vpm.encoder.layers.8.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1137 | 
            +
                "vpm.encoder.layers.8.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1138 | 
            +
                "vpm.encoder.layers.8.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1139 | 
            +
                "vpm.encoder.layers.8.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1140 | 
            +
                "vpm.encoder.layers.8.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1141 | 
            +
                "vpm.encoder.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1142 | 
            +
                "vpm.encoder.layers.8.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1143 | 
            +
                "vpm.encoder.layers.8.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1144 | 
            +
                "vpm.encoder.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1145 | 
            +
                "vpm.encoder.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1146 | 
            +
                "vpm.encoder.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1147 | 
            +
                "vpm.encoder.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1148 | 
            +
                "vpm.encoder.layers.9.layer_norm1.bias": "model-00004-of-00004.safetensors",
         | 
| 1149 | 
            +
                "vpm.encoder.layers.9.layer_norm1.weight": "model-00004-of-00004.safetensors",
         | 
| 1150 | 
            +
                "vpm.encoder.layers.9.layer_norm2.bias": "model-00004-of-00004.safetensors",
         | 
| 1151 | 
            +
                "vpm.encoder.layers.9.layer_norm2.weight": "model-00004-of-00004.safetensors",
         | 
| 1152 | 
            +
                "vpm.encoder.layers.9.mlp.fc1.bias": "model-00004-of-00004.safetensors",
         | 
| 1153 | 
            +
                "vpm.encoder.layers.9.mlp.fc1.weight": "model-00004-of-00004.safetensors",
         | 
| 1154 | 
            +
                "vpm.encoder.layers.9.mlp.fc2.bias": "model-00004-of-00004.safetensors",
         | 
| 1155 | 
            +
                "vpm.encoder.layers.9.mlp.fc2.weight": "model-00004-of-00004.safetensors",
         | 
| 1156 | 
            +
                "vpm.encoder.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1157 | 
            +
                "vpm.encoder.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1158 | 
            +
                "vpm.encoder.layers.9.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1159 | 
            +
                "vpm.encoder.layers.9.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1160 | 
            +
                "vpm.encoder.layers.9.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1161 | 
            +
                "vpm.encoder.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1162 | 
            +
                "vpm.encoder.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
         | 
| 1163 | 
            +
                "vpm.encoder.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
         | 
| 1164 | 
            +
                "vpm.post_layernorm.bias": "model-00004-of-00004.safetensors",
         | 
| 1165 | 
            +
                "vpm.post_layernorm.weight": "model-00004-of-00004.safetensors"
         | 
| 1166 | 
            +
              }
         | 
| 1167 | 
            +
            }
         | 
    	
        modeling_minicpmo.py
    ADDED
    
    | The diff for this file is too large to render. See raw diff | 
    	
        modeling_navit_siglip.py
    ADDED
    
    | @@ -0,0 +1,940 @@ | |
| 1 | 
            +
            # coding=utf-8
         | 
| 2 | 
            +
            # Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
            # You may obtain a copy of the License at
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
            #
         | 
| 10 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 14 | 
            +
            # limitations under the License.
         | 
| 15 | 
            +
            """ PyTorch Siglip model. """
         | 
| 16 | 
            +
            # Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes support added
         | 
| 17 | 
            +
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            import math
         | 
| 20 | 
            +
            import os
         | 
| 21 | 
            +
            import warnings
         | 
| 22 | 
            +
            from dataclasses import dataclass
         | 
| 23 | 
            +
            from typing import Optional
         | 
| 24 | 
            +
            from typing import Tuple
         | 
| 25 | 
            +
            from typing import Union
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            import numpy as np
         | 
| 28 | 
            +
            import torch
         | 
| 29 | 
            +
            import torch.nn.functional as F
         | 
| 30 | 
            +
            import torch.utils.checkpoint
         | 
| 31 | 
            +
            from torch import nn
         | 
| 32 | 
            +
            from torch.nn.init import _calculate_fan_in_and_fan_out
         | 
| 33 | 
            +
            from transformers.activations import ACT2FN
         | 
| 34 | 
            +
            from transformers.configuration_utils import PretrainedConfig
         | 
| 35 | 
            +
            from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
         | 
| 36 | 
            +
            from transformers.modeling_outputs import BaseModelOutput
         | 
| 37 | 
            +
            from transformers.modeling_outputs import BaseModelOutputWithPooling
         | 
| 38 | 
            +
            from transformers.modeling_utils import PreTrainedModel
         | 
| 39 | 
            +
            from transformers.utils import add_start_docstrings
         | 
| 40 | 
            +
            from transformers.utils import add_start_docstrings_to_model_forward
         | 
| 41 | 
            +
            from transformers.utils import is_flash_attn_2_available
         | 
| 42 | 
            +
            from transformers.utils import logging
         | 
| 43 | 
            +
            from transformers.utils import ModelOutput
         | 
| 44 | 
            +
            from transformers.utils import replace_return_docstrings
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            logger = logging.get_logger(__name__)
         | 
| 47 | 
            +
             | 
| 48 | 
            +
             | 
| 49 | 
            +
            class SiglipVisionConfig(PretrainedConfig):
         | 
| 50 | 
            +
                r"""
         | 
| 51 | 
            +
                This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
         | 
| 52 | 
            +
                Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
         | 
| 53 | 
            +
                configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
         | 
| 54 | 
            +
                [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
         | 
| 55 | 
            +
                Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
         | 
| 56 | 
            +
                documentation from [`PretrainedConfig`] for more information.
         | 
| 57 | 
            +
                Args:
         | 
| 58 | 
            +
                    hidden_size (`int`, *optional*, defaults to 768):
         | 
| 59 | 
            +
                        Dimensionality of the encoder layers and the pooler layer.
         | 
| 60 | 
            +
                    intermediate_size (`int`, *optional*, defaults to 3072):
         | 
| 61 | 
            +
                        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         | 
| 62 | 
            +
                    num_hidden_layers (`int`, *optional*, defaults to 12):
         | 
| 63 | 
            +
                        Number of hidden layers in the Transformer encoder.
         | 
| 64 | 
            +
                    num_attention_heads (`int`, *optional*, defaults to 12):
         | 
| 65 | 
            +
                        Number of attention heads for each attention layer in the Transformer encoder.
         | 
| 66 | 
            +
                    num_channels (`int`, *optional*, defaults to 3):
         | 
| 67 | 
            +
                        Number of channels in the input images.
         | 
| 68 | 
            +
                    image_size (`int`, *optional*, defaults to 224):
         | 
| 69 | 
            +
                        The size (resolution) of each image.
         | 
| 70 | 
            +
                    patch_size (`int`, *optional*, defaults to 16):
         | 
| 71 | 
            +
                        The size (resolution) of each patch.
         | 
| 72 | 
            +
                    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
         | 
| 73 | 
            +
                        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
         | 
| 74 | 
            +
                        `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
         | 
| 75 | 
            +
                    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
         | 
| 76 | 
            +
                        The epsilon used by the layer normalization layers.
         | 
| 77 | 
            +
                    attention_dropout (`float`, *optional*, defaults to 0.0):
         | 
| 78 | 
            +
                        The dropout ratio for the attention probabilities.
         | 
| 79 | 
            +
                Example:
         | 
| 80 | 
            +
                ```python
         | 
| 81 | 
            +
                >>> from transformers import SiglipVisionConfig, SiglipVisionModel
         | 
| 82 | 
            +
                >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
         | 
| 83 | 
            +
                >>> configuration = SiglipVisionConfig()
         | 
| 84 | 
            +
                >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
         | 
| 85 | 
            +
                >>> model = SiglipVisionModel(configuration)
         | 
| 86 | 
            +
                >>> # Accessing the model configuration
         | 
| 87 | 
            +
                >>> configuration = model.config
         | 
| 88 | 
            +
                ```"""
         | 
| 89 | 
            +
             | 
| 90 | 
            +
                model_type = "siglip_vision_model"
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                def __init__(
         | 
| 93 | 
            +
                    self,
         | 
| 94 | 
            +
                    hidden_size=768,
         | 
| 95 | 
            +
                    intermediate_size=3072,
         | 
| 96 | 
            +
                    num_hidden_layers=12,
         | 
| 97 | 
            +
                    num_attention_heads=12,
         | 
| 98 | 
            +
                    num_channels=3,
         | 
| 99 | 
            +
                    image_size=224,
         | 
| 100 | 
            +
                    patch_size=16,
         | 
| 101 | 
            +
                    hidden_act="gelu_pytorch_tanh",
         | 
| 102 | 
            +
                    layer_norm_eps=1e-6,
         | 
| 103 | 
            +
                    attention_dropout=0.0,
         | 
| 104 | 
            +
                    **kwargs,
         | 
| 105 | 
            +
                ):
         | 
| 106 | 
            +
                    super().__init__(**kwargs)
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                    self.hidden_size = hidden_size
         | 
| 109 | 
            +
                    self.intermediate_size = intermediate_size
         | 
| 110 | 
            +
                    self.num_hidden_layers = num_hidden_layers
         | 
| 111 | 
            +
                    self.num_attention_heads = num_attention_heads
         | 
| 112 | 
            +
                    self.num_channels = num_channels
         | 
| 113 | 
            +
                    self.patch_size = patch_size
         | 
| 114 | 
            +
                    self.image_size = image_size
         | 
| 115 | 
            +
                    self.attention_dropout = attention_dropout
         | 
| 116 | 
            +
                    self.layer_norm_eps = layer_norm_eps
         | 
| 117 | 
            +
                    self.hidden_act = hidden_act
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                @classmethod
         | 
| 120 | 
            +
                def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
         | 
| 121 | 
            +
                    cls._set_token_in_kwargs(kwargs)
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                    config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
         | 
| 124 | 
            +
             | 
| 125 | 
            +
                    # get the vision config dict if we are loading from SiglipConfig
         | 
| 126 | 
            +
                    if config_dict.get("model_type") == "siglip":
         | 
| 127 | 
            +
                        config_dict = config_dict["vision_config"]
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                    if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
         | 
| 130 | 
            +
                        logger.warning(
         | 
| 131 | 
            +
                            f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
         | 
| 132 | 
            +
                            f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
         | 
| 133 | 
            +
                        )
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                    return cls.from_dict(config_dict, **kwargs)
         | 
| 136 | 
            +
             | 
| 137 | 
            +
             | 
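For orientation, a minimal sketch of instantiating this config with a few of the documented arguments overridden; the values below are illustrative (they are not the settings stored in this checkpoint's config.json), and the import assumes the file above is importable from the repo directory:

```python
from modeling_navit_siglip import SiglipVisionConfig  # assumes this repo file is on the Python path

# Illustrative values only -- the checkpoint's real vision settings live in config.json.
vision_config = SiglipVisionConfig(
    hidden_size=768,        # encoder width (documented default)
    num_hidden_layers=12,   # transformer depth (documented default)
    image_size=448,         # override the 224 default resolution
    patch_size=16,
)
print(vision_config.image_size // vision_config.patch_size)  # 28 patches per side
```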
| 138 | 
            +
            _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
         | 
| 139 | 
            +
             | 
| 140 | 
            +
            SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
         | 
| 141 | 
            +
                "google/siglip-base-patch16-224",
         | 
| 142 | 
            +
                # See all SigLIP models at https://huggingface.co/models?filter=siglip
         | 
| 143 | 
            +
            ]
         | 
| 144 | 
            +
             | 
| 145 | 
            +
            if is_flash_attn_2_available():
         | 
| 146 | 
            +
                from flash_attn import flash_attn_func
         | 
| 147 | 
            +
                from flash_attn import flash_attn_varlen_func
         | 
| 148 | 
            +
                from flash_attn.bert_padding import index_first_axis  # noqa
         | 
| 149 | 
            +
                from flash_attn.bert_padding import pad_input
         | 
| 150 | 
            +
                from flash_attn.bert_padding import unpad_input
         | 
| 151 | 
            +
             | 
| 152 | 
            +
             | 
| 153 | 
            +
            # Copied from transformers.models.llama.modeling_llama._get_unpad_data
         | 
| 154 | 
            +
            def _get_unpad_data(attention_mask):
         | 
| 155 | 
            +
                seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
         | 
| 156 | 
            +
                indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
         | 
| 157 | 
            +
                max_seqlen_in_batch = seqlens_in_batch.max().item()
         | 
| 158 | 
            +
                cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
         | 
| 159 | 
            +
                return (
         | 
| 160 | 
            +
                    indices,
         | 
| 161 | 
            +
                    cu_seqlens,
         | 
| 162 | 
            +
                    max_seqlen_in_batch,
         | 
| 163 | 
            +
                )
         | 
| 164 | 
            +
             | 
| 165 | 
            +
             | 
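A quick sanity check of what `_get_unpad_data` returns for a toy right-padded mask (a sketch; it assumes the function above is importable from modeling_navit_siglip):

```python
import torch
from modeling_navit_siglip import _get_unpad_data  # assumes this repo file is on the Python path

# Toy mask: sample 0 has 2 real tokens, sample 1 has 3.
attention_mask = torch.tensor([[1, 1, 0],
                               [1, 1, 1]])

indices, cu_seqlens, max_seqlen = _get_unpad_data(attention_mask)
print(indices)     # tensor([0, 1, 3, 4, 5]) -- positions of real tokens in the flattened batch
print(cu_seqlens)  # tensor([0, 2, 5], dtype=torch.int32) -- cumulative lengths for flash_attn_varlen_func
print(max_seqlen)  # 3
```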
| 166 | 
            +
            def _trunc_normal_(tensor, mean, std, a, b):
         | 
| 167 | 
            +
                # Cut & paste from PyTorch official master until it's in a few official releases - RW
         | 
| 168 | 
            +
                # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
         | 
| 169 | 
            +
                def norm_cdf(x):
         | 
| 170 | 
            +
                    # Computes standard normal cumulative distribution function
         | 
| 171 | 
            +
                    return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                if (mean < a - 2 * std) or (mean > b + 2 * std):
         | 
| 174 | 
            +
                    warnings.warn(
         | 
| 175 | 
            +
                        "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
         | 
| 176 | 
            +
                        "The distribution of values may be incorrect.",
         | 
| 177 | 
            +
                        stacklevel=2,
         | 
| 178 | 
            +
                    )
         | 
| 179 | 
            +
             | 
| 180 | 
            +
                # Values are generated by using a truncated uniform distribution and
         | 
| 181 | 
            +
                # then using the inverse CDF for the normal distribution.
         | 
| 182 | 
            +
                # Get upper and lower cdf values
         | 
| 183 | 
            +
                l = norm_cdf((a - mean) / std)
         | 
| 184 | 
            +
                u = norm_cdf((b - mean) / std)
         | 
| 185 | 
            +
             | 
| 186 | 
            +
                # Uniformly fill tensor with values from [l, u], then translate to
         | 
| 187 | 
            +
                # [2l-1, 2u-1].
         | 
| 188 | 
            +
                tensor.uniform_(2 * l - 1, 2 * u - 1)
         | 
| 189 | 
            +
             | 
| 190 | 
            +
                # Use inverse cdf transform for normal distribution to get truncated
         | 
| 191 | 
            +
                # standard normal
         | 
| 192 | 
            +
                if tensor.dtype in [torch.float16, torch.bfloat16]:
         | 
| 193 | 
            +
                    # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
         | 
| 194 | 
            +
                    og_dtype = tensor.dtype
         | 
| 195 | 
            +
                    tensor = tensor.to(torch.float32)
         | 
| 196 | 
            +
                    tensor.erfinv_()
         | 
| 197 | 
            +
                    tensor = tensor.to(og_dtype)
         | 
| 198 | 
            +
                else:
         | 
| 199 | 
            +
                    tensor.erfinv_()
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                # Transform to proper mean, std
         | 
| 202 | 
            +
                tensor.mul_(std * math.sqrt(2.0))
         | 
| 203 | 
            +
                tensor.add_(mean)
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                # Clamp to ensure it's in the proper range
         | 
| 206 | 
            +
                if tensor.dtype == torch.float16:
         | 
| 207 | 
            +
                    # The `clamp_` op is not (yet?) defined in float16+cpu
         | 
| 208 | 
            +
                    tensor = tensor.to(torch.float32)
         | 
| 209 | 
            +
                    tensor.clamp_(min=a, max=b)
         | 
| 210 | 
            +
                    tensor = tensor.to(torch.float16)
         | 
| 211 | 
            +
                else:
         | 
| 212 | 
            +
                    tensor.clamp_(min=a, max=b)
         | 
| 213 | 
            +
             | 
| 214 | 
            +
             | 
| 215 | 
            +
            def trunc_normal_tf_(
         | 
| 216 | 
            +
                tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
         | 
| 217 | 
            +
            ) -> torch.Tensor:
         | 
| 218 | 
            +
                """Fills the input Tensor with values drawn from a truncated
         | 
| 219 | 
            +
                normal distribution. The values are effectively drawn from the
         | 
| 220 | 
            +
                normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
         | 
| 221 | 
            +
                with values outside :math:`[a, b]` redrawn until they are within
         | 
| 222 | 
            +
                the bounds. The method used for generating the random values works
         | 
| 223 | 
            +
                best when :math:`a \leq \text{mean} \leq b`.
         | 
| 224 | 
            +
                NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
         | 
| 225 | 
            +
                bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
         | 
| 226 | 
            +
                and the result is subsequently scaled and shifted by the mean and std args.
         | 
| 227 | 
            +
                Args:
         | 
| 228 | 
            +
                    tensor: an n-dimensional `torch.Tensor`
         | 
| 229 | 
            +
                    mean: the mean of the normal distribution
         | 
| 230 | 
            +
                    std: the standard deviation of the normal distribution
         | 
| 231 | 
            +
                    a: the minimum cutoff value
         | 
| 232 | 
            +
                    b: the maximum cutoff value
         | 
| 233 | 
            +
                """
         | 
| 234 | 
            +
                with torch.no_grad():
         | 
| 235 | 
            +
                    _trunc_normal_(tensor, 0, 1.0, a, b)
         | 
| 236 | 
            +
                    tensor.mul_(std).add_(mean)
         | 
| 237 | 
            +
             | 
| 238 | 
            +
             | 
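Because the [a, b] bounds are applied on the unit normal before rescaling, values produced by `trunc_normal_tf_` end up inside `[mean + a*std, mean + b*std]`. A small illustrative check under that reading of the code above (a sketch, not part of the checkpoint code):

```python
import torch
from modeling_navit_siglip import trunc_normal_tf_  # assumes this repo file is on the Python path

w = torch.empty(1024, 1024)
trunc_normal_tf_(w, mean=0.0, std=0.02, a=-2.0, b=2.0)

# Bounds are enforced on the unit normal, then scaled by std and shifted by mean,
# so every value should lie within mean +/- 2 * std.
assert w.abs().max() <= 2.0 * 0.02 + 1e-6
print(w.std())  # roughly 0.02 * 0.88 (std of a unit normal truncated to +/- 2 sigma)
```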
| 239 | 
            +
            def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
         | 
| 240 | 
            +
                fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
         | 
| 241 | 
            +
                if mode == "fan_in":
         | 
| 242 | 
            +
                    denom = fan_in
         | 
| 243 | 
            +
                elif mode == "fan_out":
         | 
| 244 | 
            +
                    denom = fan_out
         | 
| 245 | 
            +
                elif mode == "fan_avg":
         | 
| 246 | 
            +
                    denom = (fan_in + fan_out) / 2
         | 
| 247 | 
            +
             | 
| 248 | 
            +
                variance = scale / denom
         | 
| 249 | 
            +
             | 
| 250 | 
            +
                if distribution == "truncated_normal":
         | 
| 251 | 
            +
                    # constant is stddev of standard normal truncated to (-2, 2)
         | 
| 252 | 
            +
                    trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
         | 
| 253 | 
            +
                elif distribution == "normal":
         | 
| 254 | 
            +
                    with torch.no_grad():
         | 
| 255 | 
            +
                        tensor.normal_(std=math.sqrt(variance))
         | 
| 256 | 
            +
                elif distribution == "uniform":
         | 
| 257 | 
            +
                    bound = math.sqrt(3 * variance)
         | 
| 258 | 
            +
                    with torch.no_grad():
         | 
| 259 | 
            +
                        tensor.uniform_(-bound, bound)
         | 
| 260 | 
            +
                else:
         | 
| 261 | 
            +
                    raise ValueError(f"invalid distribution {distribution}")
         | 
| 262 | 
            +
             | 
| 263 | 
            +
             | 
| 264 | 
            +
            def lecun_normal_(tensor):
         | 
| 265 | 
            +
                variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
         | 
| 266 | 
            +
             | 
| 267 | 
            +
             | 
| 268 | 
            +
            def default_flax_embed_init(tensor):
         | 
| 269 | 
            +
                variance_scaling_(tensor, mode="fan_in", distribution="normal")
         | 
| 270 | 
            +
             | 
| 271 | 
            +
             | 
| 272 | 
            +
            @dataclass
         | 
| 273 | 
            +
            # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
         | 
| 274 | 
            +
            class SiglipVisionModelOutput(ModelOutput):
         | 
| 275 | 
            +
                """
         | 
| 276 | 
            +
                Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
         | 
| 277 | 
            +
                Args:
         | 
| 278 | 
            +
                    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
         | 
| 279 | 
            +
                        The image embeddings obtained by applying the projection layer to the pooler_output.
         | 
| 280 | 
            +
                    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
         | 
| 281 | 
            +
                        Sequence of hidden-states at the output of the last layer of the model.
         | 
| 282 | 
            +
                    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
         | 
| 283 | 
            +
                        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
         | 
| 284 | 
            +
                        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
         | 
| 285 | 
            +
                        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
         | 
| 286 | 
            +
                    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
         | 
| 287 | 
            +
                        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
         | 
| 288 | 
            +
                        sequence_length)`.
         | 
| 289 | 
            +
                        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
         | 
| 290 | 
            +
                        heads.
         | 
| 291 | 
            +
                """
         | 
| 292 | 
            +
             | 
| 293 | 
            +
                image_embeds: Optional[torch.FloatTensor] = None
         | 
| 294 | 
            +
                last_hidden_state: torch.FloatTensor = None
         | 
| 295 | 
            +
                hidden_states: Optional[Tuple[torch.FloatTensor]] = None
         | 
| 296 | 
            +
                attentions: Optional[Tuple[torch.FloatTensor]] = None
         | 
| 297 | 
            +
             | 
| 298 | 
            +
             | 
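`SiglipVisionModelOutput` behaves like any other `transformers` `ModelOutput`: fields are reachable by attribute or by tuple-style indexing, with `None` fields skipped. A tiny sketch with a hypothetical hidden-state shape (14 x 14 = 196 patches at the 224/16 defaults):

```python
import torch
from modeling_navit_siglip import SiglipVisionModelOutput  # assumes this repo file is on the Python path

out = SiglipVisionModelOutput(
    last_hidden_state=torch.zeros(1, 196, 768),  # hypothetical (batch, num_patches, hidden_size)
)
print(out.last_hidden_state.shape)  # attribute access
print(out[0].shape)                 # index access skips the fields left as None
```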
| 299 | 
            +
            class SiglipVisionEmbeddings(nn.Module):
         | 
| 300 | 
            +
                def __init__(self, config: SiglipVisionConfig):
         | 
| 301 | 
            +
                    super().__init__()
         | 
| 302 | 
            +
                    self.config = config
         | 
| 303 | 
            +
                    self.embed_dim = config.hidden_size
         | 
| 304 | 
            +
                    self.image_size = config.image_size
         | 
| 305 | 
            +
                    self.patch_size = config.patch_size
         | 
| 306 | 
            +
             | 
| 307 | 
            +
                    self.patch_embedding = nn.Conv2d(
         | 
| 308 | 
            +
                        in_channels=config.num_channels,
         | 
| 309 | 
            +
                        out_channels=self.embed_dim,
         | 
| 310 | 
            +
                        kernel_size=self.patch_size,
         | 
| 311 | 
            +
                        stride=self.patch_size,
         | 
| 312 | 
            +
                        padding="valid",
         | 
| 313 | 
            +
                    )
         | 
| 314 | 
            +
             | 
| 315 | 
            +
                    self.num_patches_per_side = self.image_size // self.patch_size
         | 
| 316 | 
            +
                    self.num_patches = self.num_patches_per_side**2
         | 
| 317 | 
            +
                    self.num_positions = self.num_patches
         | 
| 318 | 
            +
                    self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
         | 
| 319 | 
            +
             | 
| 320 | 
            +
                def forward(
         | 
| 321 | 
            +
                    self,
         | 
| 322 | 
            +
                    pixel_values: torch.FloatTensor,
         | 
| 323 | 
            +
                    patch_attention_mask: torch.BoolTensor,
         | 
| 324 | 
            +
                    tgt_sizes: Optional[torch.IntTensor] = None,
         | 
| 325 | 
            +
                ) -> torch.Tensor:
         | 
| 326 | 
            +
                    batch_size = pixel_values.size(0)
         | 
| 327 | 
            +
             | 
| 328 | 
            +
                    patch_embeds = self.patch_embedding(pixel_values)
         | 
| 329 | 
            +
                    embeddings = patch_embeds.flatten(2).transpose(1, 2)
         | 
| 330 | 
            +
             | 
| 331 | 
            +
                    max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
         | 
| 332 | 
            +
                    max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
         | 
| 333 | 
            +
                    boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
         | 
| 334 | 
            +
                    position_ids = torch.full(
         | 
| 335 | 
            +
                        size=(
         | 
| 336 | 
            +
                            batch_size,
         | 
| 337 | 
            +
                            max_nb_patches_h * max_nb_patches_w,
         | 
| 338 | 
            +
                        ),
         | 
| 339 | 
            +
                        fill_value=0,
         | 
| 340 | 
            +
                    )
         | 
| 341 | 
            +
             | 
| 342 | 
            +
                    for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
         | 
| 343 | 
            +
                        if tgt_sizes is not None:
         | 
| 344 | 
            +
                            nb_patches_h = tgt_sizes[batch_idx][0]
         | 
| 345 | 
            +
                            nb_patches_w = tgt_sizes[batch_idx][1]
         | 
| 346 | 
            +
                        else:
         | 
| 347 | 
            +
                            nb_patches_h = p_attn_mask[:, 0].sum()
         | 
| 348 | 
            +
                            nb_patches_w = p_attn_mask[0].sum()
         | 
| 349 | 
            +
             | 
| 350 | 
            +
                        fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
         | 
| 351 | 
            +
                        fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
         | 
| 352 | 
            +
             | 
| 353 | 
            +
                        bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
         | 
| 354 | 
            +
                        bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
         | 
| 355 | 
            +
             | 
| 356 | 
            +
                        pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
         | 
| 357 | 
            +
                        position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
         | 
| 358 | 
            +
             | 
| 359 | 
            +
                    position_ids = position_ids.to(self.position_embedding.weight.device)
         | 
| 360 | 
            +
             | 
| 361 | 
            +
                    embeddings = embeddings + self.position_embedding(position_ids)
         | 
| 362 | 
            +
                    return embeddings
         | 
| 363 | 
            +
             | 
| 364 | 
            +
             | 
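The position-id logic in `forward` above maps each image's variable patch grid onto the fixed grid of learned position embeddings with `torch.bucketize`. A standalone sketch of that interpolation for a hypothetical 3 x 5-patch image and a 70-per-side embedding grid:

```python
import torch

num_patches_per_side = 70          # hypothetical size of the learned position grid
nb_patches_h, nb_patches_w = 3, 5  # hypothetical image: 3 x 5 real patches

boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side)
frac_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)  # fractional row coords: 0, 1/3, 2/3
frac_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)  # fractional col coords: 0, 0.2, ..., 0.8

bucket_h = torch.bucketize(frac_h, boundaries, right=True)  # nearest rows on the 70-wide grid
bucket_w = torch.bucketize(frac_w, boundaries, right=True)

pos_ids = (bucket_h[:, None] * num_patches_per_side + bucket_w).flatten()
print(pos_ids.shape)  # torch.Size([15]) -- one learned-position index per real patch
```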
| 365 | 
            +
            class SiglipAttention(nn.Module):
         | 
| 366 | 
            +
                """Multi-headed attention from 'Attention Is All You Need' paper"""
         | 
| 367 | 
            +
             | 
| 368 | 
            +
                # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
         | 
| 369 | 
            +
                def __init__(self, config):
         | 
| 370 | 
            +
                    super().__init__()
         | 
| 371 | 
            +
                    self.config = config
         | 
| 372 | 
            +
                    self.embed_dim = config.hidden_size
         | 
| 373 | 
            +
                    self.num_heads = config.num_attention_heads
         | 
| 374 | 
            +
                    self.head_dim = self.embed_dim // self.num_heads
         | 
| 375 | 
            +
                    if self.head_dim * self.num_heads != self.embed_dim:
         | 
| 376 | 
            +
                        raise ValueError(
         | 
| 377 | 
            +
                            f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
         | 
| 378 | 
            +
                            f" {self.num_heads})."
         | 
| 379 | 
            +
                        )
         | 
| 380 | 
            +
                    self.scale = self.head_dim**-0.5
         | 
| 381 | 
            +
                    self.dropout = config.attention_dropout
         | 
| 382 | 
            +
             | 
| 383 | 
            +
                    self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
         | 
| 384 | 
            +
                    self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
         | 
| 385 | 
            +
                    self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
         | 
| 386 | 
            +
                    self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
         | 
| 387 | 
            +
             | 
| 388 | 
            +
                def forward(
         | 
| 389 | 
            +
                    self,
         | 
| 390 | 
            +
                    hidden_states: torch.Tensor,
         | 
| 391 | 
            +
                    attention_mask: Optional[torch.Tensor] = None,
         | 
| 392 | 
            +
                    output_attentions: Optional[bool] = False,
         | 
| 393 | 
            +
                ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         | 
| 394 | 
            +
                    """Input shape: Batch x Time x Channel"""
         | 
| 395 | 
            +
             | 
| 396 | 
            +
                    batch_size, q_len, _ = hidden_states.size()
         | 
| 397 | 
            +
             | 
| 398 | 
            +
                    query_states = self.q_proj(hidden_states)
         | 
| 399 | 
            +
                    key_states = self.k_proj(hidden_states)
         | 
| 400 | 
            +
                    value_states = self.v_proj(hidden_states)
         | 
| 401 | 
            +
             | 
| 402 | 
            +
                    query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         | 
| 403 | 
            +
                    key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         | 
| 404 | 
            +
                    value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         | 
| 405 | 
            +
             | 
| 406 | 
            +
                    k_v_seq_len = key_states.shape[-2]
         | 
| 407 | 
            +
                    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
         | 
| 408 | 
            +
             | 
| 409 | 
            +
                    if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
         | 
| 410 | 
            +
                        raise ValueError(
         | 
| 411 | 
            +
                            f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
         | 
| 412 | 
            +
                            f" {attn_weights.size()}"
         | 
| 413 | 
            +
                        )
         | 
| 414 | 
            +
             | 
| 415 | 
            +
                    if attention_mask is not None:
         | 
| 416 | 
            +
                        if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
         | 
| 417 | 
            +
                            raise ValueError(
         | 
| 418 | 
            +
                                f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
         | 
| 419 | 
            +
                            )
         | 
| 420 | 
            +
                        attn_weights = attn_weights + attention_mask
         | 
| 421 | 
            +
             | 
| 422 | 
            +
                    # upcast attention to fp32
         | 
| 423 | 
            +
                    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
         | 
| 424 | 
            +
                    attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
         | 
| 425 | 
            +
                    attn_output = torch.matmul(attn_weights, value_states)
         | 
| 426 | 
            +
             | 
| 427 | 
            +
                    if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
         | 
| 428 | 
            +
                        raise ValueError(
         | 
| 429 | 
            +
                            f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
         | 
| 430 | 
            +
                            f" {attn_output.size()}"
         | 
| 431 | 
            +
                        )
         | 
| 432 | 
            +
             | 
| 433 | 
            +
                    attn_output = attn_output.transpose(1, 2).contiguous()
         | 
| 434 | 
            +
                    attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
         | 
| 435 | 
            +
             | 
| 436 | 
            +
                    attn_output = self.out_proj(attn_output)
         | 
| 437 | 
            +
             | 
| 438 | 
            +
                    return attn_output, attn_weights
         | 
| 439 | 
            +
             | 
| 440 | 
            +
             | 
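The eager path above computes the standard `softmax(QK^T * scale + mask) V`; on PyTorch 2.x the same math can be cross-checked against `torch.nn.functional.scaled_dot_product_attention`, whose default scale is likewise `1/sqrt(head_dim)`. An illustrative equivalence sketch (not how this file dispatches attention):

```python
import torch
import torch.nn.functional as F

bsz, heads, seq, head_dim = 2, 12, 16, 64
q, k, v = (torch.randn(bsz, heads, seq, head_dim) for _ in range(3))

# Eager computation, mirroring SiglipAttention.forward (no mask, no dropout).
scale = head_dim ** -0.5
attn = torch.softmax(q @ k.transpose(-2, -1) * scale, dim=-1)
out_eager = attn @ v

# Fused reference, available in PyTorch >= 2.0.
out_sdpa = F.scaled_dot_product_attention(q, k, v)

print(torch.allclose(out_eager, out_sdpa, atol=1e-5))  # True, up to numerics
```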
| 441 | 
            +
            class SiglipFlashAttention2(SiglipAttention):
         | 
| 442 | 
            +
                """
         | 
| 443 | 
            +
                Siglip flash attention module. This module inherits from `SiglipAttention`, as the weights of the module stay
         | 
| 444 | 
            +
                untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
         | 
| 445 | 
            +
                flash attention and deal with padding tokens in case the input contains any of them.
         | 
| 446 | 
            +
                """
         | 
| 447 | 
            +
             | 
| 448 | 
            +
                def __init__(self, *args, **kwargs):
         | 
| 449 | 
            +
                    super().__init__(*args, **kwargs)
         | 
| 450 | 
            +
                    self.is_causal = False  # Hack to make sure we don't use a causal mask
         | 
| 451 | 
            +
             | 
| 452 | 
            +
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        # if past_key_value is not None:
        #     cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        #     key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (LlamaRMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                "The input hidden states seems to be silently casted in float32, this might be related to the fact"
                " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
        )

        attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights

    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`int`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """

        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
        causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )

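# Editorial note (not part of the uploaded file): a worked example of the varlen packing contract
# used by `_flash_attention_forward` / `_upad_input` above. Given a (batch_size, seq_len) padding
# mask such as
#     attention_mask = [[1, 1, 1, 0],
#                       [1, 1, 0, 0]]
# `_upad_input` packs the 5 non-padding tokens into tensors of shape (5, num_heads, head_dim) and
# derives cu_seqlens = [0, 3, 5] with max_seqlen_in_batch = 3; `flash_attn_varlen_func` then attends
# within each packed segment, and `pad_input` scatters the output back to
# (batch_size, seq_len, num_heads, head_dim) using `indices_q`.
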
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states

# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.self_attn = SiglipAttention(config) if not self._use_flash_attention_2 else SiglipFlashAttention2(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs

class SiglipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

SIGLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
    Parameters:
        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


SIGLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].
    Args:
        config: SiglipConfig
    """

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions)

@add_start_docstrings("""The vision model from SigLIP without any head or projection on top.""", SIGLIP_START_DOCSTRING)
class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True
    _no_split_modules = []

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
        tgt_sizes: Optional[torch.IntTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            patch_attention_mask = torch.ones(
                size=(
                    batch_size,
                    pixel_values.size(2) // self.config.patch_size,
                    pixel_values.size(3) // self.config.patch_size,
                ),
                dtype=torch.bool,
                device=pixel_values.device,
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes
        )

        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive
        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
        if not torch.any(~patch_attention_mask):
            attention_mask = None
        else:
            attention_mask = (
                _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
                if not self._use_flash_attention_2
                else patch_attention_mask
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        if not return_dict:
            return (last_hidden_state, None) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=None,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
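Editorial aside, not part of the uploaded files: a minimal sketch of how the `SiglipVisionTransformer` defined above can be exercised in isolation. It assumes `SiglipVisionConfig` and `SiglipVisionEmbeddings` from earlier in `modeling_navit_siglip.py` are importable, and the hyperparameters below are illustrative placeholders rather than the values shipped in this repository's `config.json`.

    import torch
    from modeling_navit_siglip import SiglipVisionConfig, SiglipVisionTransformer

    # Illustrative vision-tower hyperparameters (placeholders, not the checkpoint's actual config).
    config = SiglipVisionConfig(
        hidden_size=1152,
        intermediate_size=4304,
        num_hidden_layers=27,
        num_attention_heads=16,
        patch_size=14,
        image_size=448,
    )
    vision_tower = SiglipVisionTransformer(config).eval()

    pixel_values = torch.randn(1, 3, 448, 448)  # one 448x448 RGB image
    with torch.no_grad():
        # patch_attention_mask defaults to all ones, i.e. every patch is attended to
        outputs = vision_tower(pixel_values)
    print(outputs.last_hidden_state.shape)  # (1, num_patches, hidden_size)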
    	
        preprocessor_config.json
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
{
  "auto_map": {
    "AutoImageProcessor": "image_processing_minicpmv.MiniCPMVImageProcessor",
    "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor"
  },
  "chunk_length": 30,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "im_end": "</image>",
  "im_id_end": "</image_id>",
  "im_id_start": "<image_id>",
  "im_start": "<image>",
  "image_feature_size": 64,
  "image_processor_type": "MiniCPMVImageProcessor",
  "max_slice_nums": 9,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "norm_mean": [
    0.5,
    0.5,
    0.5
  ],
  "norm_std": [
    0.5,
    0.5,
    0.5
  ],
  "padding_side": "right",
  "padding_value": 0.0,
  "patch_size": 14,
  "processor_class": "MiniCPMOProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "scale_resolution": 448,
  "slice_end": "</slice>",
  "slice_mode": true,
  "slice_start": "<slice>",
  "unk": "<unk>",
  "use_image_id": true,
  "version": 2.6
}
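Editorial aside, not part of the uploaded files: with the `auto_map` entries above, the custom processing classes are resolved automatically when loading from the Hub with remote code enabled. `<repo_id>` is a placeholder for this repository's identifier.

    from transformers import AutoProcessor

    # Resolves to processing_minicpmo.MiniCPMOProcessor via the auto_map in preprocessor_config.json
    processor = AutoProcessor.from_pretrained("<repo_id>", trust_remote_code=True)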
    	
        processing_minicpmo.py
    ADDED
    
    | @@ -0,0 +1,505 @@ | |
# coding=utf-8
# Copyright 2025 The OpenBMB Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for MiniCPMO.
"""

import math
import re
from typing import List
from typing import Literal
from typing import Optional
from typing import Union

import numpy as np
import torch
import torchaudio
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput
from transformers.tokenization_utils_base import TextInput
from transformers.utils import TensorType

from .image_processing_minicpmv import MiniCPMOBatchFeature


class MiniCPMOProcessor(ProcessorMixin):
    r"""
    Constructs a MiniCPMV processor which wraps a MiniCPMV image processor and a MiniCPMV tokenizer into a single processor.

    [`MiniCPMVProcessor`] offers all the functionalities of [`MiniCPMVImageProcessor`] and [`LlamaTokenizerWrapper`]. See the
    [`~MiniCPMVProcessor.__call__`] and [`~MiniCPMVProcessor.decode`] for more information.

    Args:
        image_processor ([`MiniCPMVImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerWrapper`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, feature_extractor=None, tokenizer=None):
        super().__init__(image_processor, feature_extractor, tokenizer)
        self.version = image_processor.version

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        images: ImageInput = None,
        audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]] = None,
        audio_parts: Optional[list] = None,
        max_length: Optional[int] = None,
        do_pad: Optional[bool] = True,
        max_slice_nums: int = None,
        use_image_id: bool = True,
        chunk_input: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        sampling_rate: Optional[int] = 16000,
        **kwargs,
    ) -> MiniCPMOBatchFeature:
        if images is not None:
            image_inputs = self.image_processor(
                images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
            )
        else:
            image_inputs = None

        if audios is not None:
            audio_features, audio_feature_lens, audio_phs = self.audio_feature_extract(
                audios, audio_parts, chunk_input, sampling_rate
            )
        else:
            audio_features, audio_feature_lens, audio_phs = [], [], []

        model_inputs = self._convert_omni_to_inputs(
            image_inputs,
            audio_phs,
            text,
            max_slice_nums=max_slice_nums,
            use_image_id=use_image_id,
            max_length=max_length,
            **kwargs,
        )

        model_inputs["audio_features"] = audio_features
        model_inputs["audio_feature_lens"] = audio_feature_lens

        return MiniCPMOBatchFeature(data={**model_inputs})
         | 
| 104 | 
            +
             | 
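
A minimal usage sketch of the call path above, assuming a `processor` instance of this class has already been loaded (see the `AutoProcessor` note under `processor_config.json` below) and that `image` / `audio` stand for a PIL image and a 16 kHz mono float32 numpy waveform; the exact nesting of `images` and `audios` per text entry is an assumption inferred from the batch handling in `_convert_omni_to_inputs` and `audio_feature_extract`:

# `processor`, `image` and `audio` are placeholders, not objects defined in this repo.
text = "(<image>./</image>)\n(<audio>./</audio>)\nDescribe what you see and hear."
batch = processor(
    text=[text],
    images=[[image]],        # assumed: one list of images per text entry
    audios=[[audio]],        # assumed: one list of waveforms per text entry
    chunk_input=True,
    sampling_rate=16000,
)
# batch is a MiniCPMOBatchFeature holding input_ids, attention_mask, pixel_values,
# image_bound, audio_bounds, spk_bounds, audio_features and audio_feature_lens.
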
| 105 | 
            +
                def get_audio_placeholder(self, audio_lens, chunk_input, chunk_length):
         | 
| 106 | 
            +
                    pool_step = 2
         | 
| 107 | 
            +
                    feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                    feature_lens = (feature_lens - 1) // 2 + 1
         | 
| 110 | 
            +
                    output_lens = (feature_lens - pool_step) // pool_step + 1
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                    if chunk_input:
         | 
| 113 | 
            +
                        fbank_feat_in_chunk = int(chunk_length * 100)
         | 
| 114 | 
            +
                        cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
         | 
| 115 | 
            +
                        audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
         | 
| 116 | 
            +
                        num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                        place_holders = ""
         | 
| 119 | 
            +
                        total_unk_len = 0
         | 
| 120 | 
            +
                        for _ in range(num_audio_chunks):
         | 
| 121 | 
            +
                            unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
         | 
| 122 | 
            +
                            place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
         | 
| 123 | 
            +
                            total_unk_len += unk_len
         | 
| 124 | 
            +
                        audio_placeholder = place_holders
         | 
| 125 | 
            +
                    else:
         | 
| 126 | 
            +
                        audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                    return audio_placeholder
         | 
| 129 | 
            +
             | 
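
A quick sanity check of the placeholder arithmetic above, assuming the stock WhisperFeatureExtractor hop length of 160 samples at 16 kHz (i.e. 100 fbank frames per second); the numbers are illustrative only:

import math

hop_length = 160                 # assumed WhisperFeatureExtractor default (10 ms @ 16 kHz)
audio_lens = 5 * 16000           # a 5-second clip, measured in samples
pool_step = 2

feature_lens = math.ceil(audio_lens / hop_length)          # 500 fbank frames
feature_lens = (feature_lens - 1) // 2 + 1                 # 250 after the stride-2 halving
output_lens = (feature_lens - pool_step) // pool_step + 1  # 125 audio embeddings
# the non-chunked branch would therefore emit audio_start + "<unk>" * 125 + audio_end
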
| 130 | 
            +
                def audio_feature_extract(
         | 
| 131 | 
            +
                    self,
         | 
| 132 | 
            +
                    audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
         | 
| 133 | 
            +
                    audio_parts: Optional[list] = None,
         | 
| 134 | 
            +
                    chunk_input: Optional[bool] = False,
         | 
| 135 | 
            +
                    sampling_rate: Optional[int] = None,
         | 
| 136 | 
            +
                    chunk_length: Optional[int] = 1,
         | 
| 137 | 
            +
                    **kwargs,
         | 
| 138 | 
            +
                ):
         | 
| 139 | 
            +
                    if isinstance(audios, np.ndarray):
         | 
| 140 | 
            +
                        audios_list = [[audios]]
         | 
| 141 | 
            +
                    elif isinstance(audios[0], np.ndarray):
         | 
| 142 | 
            +
                        audios_list = [audios]
         | 
| 143 | 
            +
                    else:
         | 
| 144 | 
            +
                        audios_list = audios
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                    if audio_parts is not None:
         | 
| 147 | 
            +
                        assert len(audio_parts) == len(audios_list)
         | 
| 148 | 
            +
                        for parts, audios in zip(audio_parts, audios_list):
         | 
| 149 | 
            +
                            assert len(parts) == len(audios)
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                    audio_feature_lens_list = []
         | 
| 152 | 
            +
                    audio_ph_list = []
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                    audio_features_all = []
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                    # audio placeholder not dependent on audio_parts
         | 
| 157 | 
            +
                    for audios in audios_list:
         | 
| 158 | 
            +
                        if audios:
         | 
| 159 | 
            +
                            audio_ph_list.append([self.get_audio_placeholder(len(a), chunk_input, chunk_length) for a in audios])
         | 
| 160 | 
            +
                        else:
         | 
| 161 | 
            +
                            audio_ph_list.append([])
         | 
| 162 | 
            +
             | 
| 163 | 
            +
                    for idx, audios in enumerate(audios_list):
         | 
| 164 | 
            +
                        if audio_parts is not None:
         | 
| 165 | 
            +
                            # merge consecutive audio segments that belong to the same part
         | 
| 166 | 
            +
                            audio_part = audio_parts[idx]
         | 
| 167 | 
            +
                            merge_audio = []
         | 
| 168 | 
            +
                            cur_audio = []
         | 
| 169 | 
            +
                            for aid, (part, audio) in enumerate(zip(audio_part, audios)):
         | 
| 170 | 
            +
                                if aid == 0 or audio_part[aid] == audio_part[aid - 1]:
         | 
| 171 | 
            +
                                    cur_audio.append(audio)
         | 
| 172 | 
            +
                                else:
         | 
| 173 | 
            +
                                    merge_audio.append(np.hstack(cur_audio))
         | 
| 174 | 
            +
                                    cur_audio = [audio]
         | 
| 175 | 
            +
                            if cur_audio:
         | 
| 176 | 
            +
                                merge_audio.append(np.hstack(cur_audio))
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                        else:
         | 
| 179 | 
            +
                            merge_audio = audios
         | 
| 180 | 
            +
             | 
| 181 | 
            +
                        audio_feature_lens = []
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                    # If the audio exceeds 30 seconds, split it into 30-second chunks.
         | 
| 184 | 
            +
                        final_merge_audio = []
         | 
| 185 | 
            +
                        max_audio_inp_len = 30 * sampling_rate
         | 
| 186 | 
            +
                        for audio in merge_audio:
         | 
| 187 | 
            +
                            if len(audio) <= max_audio_inp_len:
         | 
| 188 | 
            +
                                final_merge_audio.append(audio)
         | 
| 189 | 
            +
                            else:
         | 
| 190 | 
            +
                                for i in range(math.ceil(len(audio) / max_audio_inp_len)):
         | 
| 191 | 
            +
                                    final_merge_audio.append(audio[i * max_audio_inp_len : (i + 1) * max_audio_inp_len])
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                        if audios:
         | 
| 194 | 
            +
                            audio_inputs = self.feature_extractor(
         | 
| 195 | 
            +
                                final_merge_audio,
         | 
| 196 | 
            +
                                sampling_rate=sampling_rate,
         | 
| 197 | 
            +
                                return_attention_mask=True,
         | 
| 198 | 
            +
                                padding="max_length",
         | 
| 199 | 
            +
                                return_tensors="pt",
         | 
| 200 | 
            +
                                **kwargs,
         | 
| 201 | 
            +
                            )
         | 
| 202 | 
            +
                            audio_feature = audio_inputs["input_features"]
         | 
| 203 | 
            +
                            actual_lens = audio_inputs["attention_mask"].sum(dim=1)
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                            for feat, lens in zip(audio_feature, actual_lens):
         | 
| 206 | 
            +
                                audio_features_all.append(feat[:, :lens])
         | 
| 207 | 
            +
                                audio_feature_lens.append(lens)
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                            audio_feature_lens = torch.hstack(audio_feature_lens)
         | 
| 210 | 
            +
                            audio_feature_lens_list.append(audio_feature_lens)
         | 
| 211 | 
            +
                        else:
         | 
| 212 | 
            +
                            audio_feature_lens_list.append([])
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                    if audio_features_all:
         | 
| 215 | 
            +
                        audio_features = [i.permute(1, 0) for i in audio_features_all]
         | 
| 216 | 
            +
                        audio_features = torch.nn.utils.rnn.pad_sequence(
         | 
| 217 | 
            +
                            audio_features, batch_first=True, padding_value=0.0
         | 
| 218 | 
            +
                        ).permute(0, 2, 1)
         | 
| 219 | 
            +
                    else:
         | 
| 220 | 
            +
                        audio_features = []
         | 
| 221 | 
            +
             | 
| 222 | 
            +
                    return audio_features, audio_feature_lens_list, audio_ph_list
         | 
| 223 | 
            +
             | 
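
To make the 30-second splitting above concrete, a small sketch with a synthetic 70-second waveform (the sampling rate and duration are arbitrary):

import math
import numpy as np

sampling_rate = 16000
audio = np.zeros(70 * sampling_rate, dtype=np.float32)   # 70 s of silence

max_audio_inp_len = 30 * sampling_rate
chunks = [
    audio[i * max_audio_inp_len : (i + 1) * max_audio_inp_len]
    for i in range(math.ceil(len(audio) / max_audio_inp_len))
]
# three chunks of 30 s, 30 s and 10 s, each passed to the Whisper feature extractor
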
| 224 | 
            +
                # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
         | 
| 225 | 
            +
                def batch_decode(self, *args, **kwargs):
         | 
| 226 | 
            +
                    """
         | 
| 227 | 
            +
                    This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
         | 
| 228 | 
            +
                    refer to the docstring of this method for more information.
         | 
| 229 | 
            +
                    """
         | 
| 230 | 
            +
                    output_ids = args[0]
         | 
| 231 | 
            +
                    result_text = []
         | 
| 232 | 
            +
                    for result in output_ids:
         | 
| 233 | 
            +
                        result = result[result != 0]
         | 
| 234 | 
            +
                        if result[0] == self.tokenizer.bos_id:
         | 
| 235 | 
            +
                            result = result[1:]
         | 
| 236 | 
            +
                        if result[-1] == self.tokenizer.eos_id:
         | 
| 237 | 
            +
                            result = result[:-1]
         | 
| 238 | 
            +
                        result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip())
         | 
| 239 | 
            +
                    return result_text
         | 
| 240 | 
            +
                    # return self.tokenizer.batch_decode(*args, **kwargs)
         | 
| 241 | 
            +
             | 
| 242 | 
            +
                # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
         | 
| 243 | 
            +
                def decode(self, *args, **kwargs):
         | 
| 244 | 
            +
                    """
         | 
| 245 | 
            +
                    This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
         | 
| 246 | 
            +
                    the docstring of this method for more information.
         | 
| 247 | 
            +
                    """
         | 
| 248 | 
            +
                    result = args[0]
         | 
| 249 | 
            +
                    result = result[result != 0]
         | 
| 250 | 
            +
                    if result[0] == self.tokenizer.bos_id:
         | 
| 251 | 
            +
                        result = result[1:]
         | 
| 252 | 
            +
                    if result[-1] == self.tokenizer.eos_id or (
         | 
| 253 | 
            +
                        hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id
         | 
| 254 | 
            +
                    ):
         | 
| 255 | 
            +
                        result = result[:-1]
         | 
| 256 | 
            +
                    return self.tokenizer.decode(result, *args[1:], **kwargs).strip()
         | 
| 257 | 
            +
             | 
| 258 | 
            +
                def _convert(self, input_str, max_inp_length: Optional[int] = None, **kwargs):
         | 
| 259 | 
            +
                    input_ids = self.tokenizer.encode(input_str, **kwargs)
         | 
| 260 | 
            +
                    if max_inp_length is not None:
         | 
| 261 | 
            +
                        input_ids = input_ids[:max_inp_length]
         | 
| 262 | 
            +
                    input_ids = torch.tensor(input_ids, dtype=torch.int32)
         | 
| 263 | 
            +
             | 
| 264 | 
            +
                    ## image bound
         | 
| 265 | 
            +
                    start_cond = (input_ids == self.tokenizer.im_start_id) | (input_ids == self.tokenizer.slice_start_id)
         | 
| 266 | 
            +
                    end_cond = (input_ids == self.tokenizer.im_end_id) | (input_ids == self.tokenizer.slice_end_id)
         | 
| 267 | 
            +
             | 
| 268 | 
            +
                    image_start_idx = torch.where(start_cond)[0]
         | 
| 269 | 
            +
                    image_start_idx += 1
         | 
| 270 | 
            +
                    image_end_idx = torch.where(end_cond)[0]
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                    valid_image_nums = max(len(image_start_idx), len(image_end_idx))
         | 
| 273 | 
            +
             | 
| 274 | 
            +
                    image_bounds = torch.hstack(
         | 
| 275 | 
            +
                        [
         | 
| 276 | 
            +
                            image_start_idx[:valid_image_nums].unsqueeze(-1),
         | 
| 277 | 
            +
                            image_end_idx[:valid_image_nums].unsqueeze(-1),
         | 
| 278 | 
            +
                        ]
         | 
| 279 | 
            +
                    )
         | 
| 280 | 
            +
             | 
| 281 | 
            +
                    ##  audio bound
         | 
| 282 | 
            +
                    audio_start_idx = torch.where(input_ids == self.tokenizer.audio_start_id)[0]
         | 
| 283 | 
            +
                    audio_end_idx = torch.where(input_ids == self.tokenizer.audio_end_id)[0]
         | 
| 284 | 
            +
                    assert len(audio_start_idx) == len(audio_end_idx)
         | 
| 285 | 
            +
                    audio_bounds = torch.hstack([(audio_start_idx + 1).unsqueeze(-1), audio_end_idx.unsqueeze(-1)])
         | 
| 286 | 
            +
             | 
| 287 | 
            +
                    spk_start_idx = torch.where(input_ids == self.tokenizer.spk_start_id)[0]
         | 
| 288 | 
            +
                    spk_end_idx = torch.where(input_ids == self.tokenizer.spk_end_id)[0]
         | 
| 289 | 
            +
                    assert len(spk_start_idx) == len(spk_end_idx)
         | 
| 290 | 
            +
                    spk_bounds = torch.hstack([(spk_start_idx + 1).unsqueeze(-1), spk_end_idx.unsqueeze(-1)])
         | 
| 291 | 
            +
             | 
| 292 | 
            +
                    return input_ids, image_bounds, audio_bounds, spk_bounds
         | 
| 293 | 
            +
             | 
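
The start/end pairing in `_convert` can be seen on a toy sequence; the special-token ids below are made up purely for illustration, only the indexing logic mirrors the code above:

import torch

im_start_id, im_end_id = 101, 102                         # hypothetical ids
input_ids = torch.tensor([7, 101, 5, 5, 5, 102, 9], dtype=torch.int32)

start_idx = torch.where(input_ids == im_start_id)[0] + 1  # first token inside the span
end_idx = torch.where(input_ids == im_end_id)[0]
bounds = torch.hstack([start_idx.unsqueeze(-1), end_idx.unsqueeze(-1)])
# bounds == tensor([[2, 5]]): positions 2..4 hold the placeholder tokens for this image
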
| 294 | 
            +
                def _convert_omni_to_inputs(
         | 
| 295 | 
            +
                    self,
         | 
| 296 | 
            +
                    images,
         | 
| 297 | 
            +
                    audio_phs,
         | 
| 298 | 
            +
                    texts: Union[str, List[str]],
         | 
| 299 | 
            +
                    truncation=None,
         | 
| 300 | 
            +
                    max_length=None,
         | 
| 301 | 
            +
                    max_slice_nums=None,
         | 
| 302 | 
            +
                    use_image_id=None,
         | 
| 303 | 
            +
                    return_tensors=None,
         | 
| 304 | 
            +
                    **kwargs,
         | 
| 305 | 
            +
                ):
         | 
| 306 | 
            +
                    if images is None and audio_phs is None:
         | 
| 307 | 
            +
                        model_inputs = self.tokenizer(
         | 
| 308 | 
            +
                            texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs
         | 
| 309 | 
            +
                        )
         | 
| 310 | 
            +
                        return MiniCPMOBatchFeature(data={**model_inputs})
         | 
| 311 | 
            +
             | 
| 312 | 
            +
                    image_tag = "(<image>./</image>)"
         | 
| 313 | 
            +
                    image_pattern = r"\(<image>./</image>\)"
         | 
| 314 | 
            +
                    audio_tag = "(<audio>./</audio>)"
         | 
| 315 | 
            +
                    audio_pattern = r"\(<audio>./</audio>\)"
         | 
| 316 | 
            +
                    split_pattern = f"({image_pattern}|{audio_pattern})"
         | 
| 317 | 
            +
             | 
| 318 | 
            +
                    if isinstance(texts, str):
         | 
| 319 | 
            +
                        texts = [texts]
         | 
| 320 | 
            +
             | 
| 321 | 
            +
                    bs = len(texts)
         | 
| 322 | 
            +
                    if images is not None:
         | 
| 323 | 
            +
                        images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
         | 
| 324 | 
            +
                    else:
         | 
| 325 | 
            +
                        images, image_sizes, tgt_sizes = [[]] * bs, [[]] * bs, [[]] * bs
         | 
| 326 | 
            +
             | 
| 327 | 
            +
                    input_ids_list = []
         | 
| 328 | 
            +
                    image_bounds_list = []
         | 
| 329 | 
            +
                    audio_bounds_list = []
         | 
| 330 | 
            +
                    spk_bounds_list = []
         | 
| 331 | 
            +
             | 
| 332 | 
            +
                    for index, text in enumerate(texts):
         | 
| 333 | 
            +
                        text_chunks = re.split(split_pattern, text)
         | 
| 334 | 
            +
             | 
| 335 | 
            +
                        image_tags = re.findall(image_pattern, text)
         | 
| 336 | 
            +
                        audio_tags = re.findall(audio_pattern, text)
         | 
| 337 | 
            +
             | 
| 338 | 
            +
                        if image_tags:
         | 
| 339 | 
            +
                            assert images is not None
         | 
| 340 | 
            +
                            assert len(image_tags) == len(image_sizes[index])
         | 
| 341 | 
            +
                        if audio_tags:
         | 
| 342 | 
            +
                            assert audio_phs is not None
         | 
| 343 | 
            +
                            assert len(audio_tags) == len(audio_phs[index])
         | 
| 344 | 
            +
             | 
| 345 | 
            +
                        image_id = 0
         | 
| 346 | 
            +
                        audio_id = 0
         | 
| 347 | 
            +
                        for i, chunk in enumerate(text_chunks):
         | 
| 348 | 
            +
                            if chunk == image_tag:
         | 
| 349 | 
            +
                                image_placeholder = self.image_processor.get_slice_image_placeholder(
         | 
| 350 | 
            +
                                    image_sizes[index][image_id], image_id, max_slice_nums, use_image_id
         | 
| 351 | 
            +
                                )
         | 
| 352 | 
            +
                                image_id += 1
         | 
| 353 | 
            +
                                text_chunks[i] = image_placeholder
         | 
| 354 | 
            +
                            elif chunk == audio_tag:
         | 
| 355 | 
            +
                                audio_placeholder = audio_phs[index][audio_id]
         | 
| 356 | 
            +
                                audio_id += 1
         | 
| 357 | 
            +
                                text_chunks[i] = audio_placeholder
         | 
| 358 | 
            +
             | 
| 359 | 
            +
                        final_text = "".join(text_chunks)
         | 
| 360 | 
            +
                        input_ids, image_bounds, audio_bounds, spk_bounds = self._convert(final_text, max_length, **kwargs)
         | 
| 361 | 
            +
             | 
| 362 | 
            +
                        input_ids_list.append(input_ids)
         | 
| 363 | 
            +
                        image_bounds_list.append(image_bounds)
         | 
| 364 | 
            +
                        audio_bounds_list.append(audio_bounds)
         | 
| 365 | 
            +
                        spk_bounds_list.append(spk_bounds)
         | 
| 366 | 
            +
             | 
| 367 | 
            +
                    padded_input_ids, padding_lengths = self.pad(input_ids_list, padding_side="left")
         | 
| 368 | 
            +
                    attention_mask = torch.ones_like(padded_input_ids, dtype=torch.bool)
         | 
| 369 | 
            +
                    for i, length in enumerate(padding_lengths):
         | 
| 370 | 
            +
                        image_bounds_list[i] = image_bounds_list[i] + length
         | 
| 371 | 
            +
                        audio_bounds_list[i] = audio_bounds_list[i] + length
         | 
| 372 | 
            +
                        spk_bounds_list[i] = spk_bounds_list[i] + length
         | 
| 373 | 
            +
                        attention_mask[i, :length] = False
         | 
| 374 | 
            +
             | 
| 375 | 
            +
                    data = {
         | 
| 376 | 
            +
                        "input_ids": padded_input_ids,
         | 
| 377 | 
            +
                        "attention_mask": attention_mask,
         | 
| 378 | 
            +
                        "pixel_values": images,
         | 
| 379 | 
            +
                        "image_sizes": image_sizes,
         | 
| 380 | 
            +
                        "image_bound": image_bounds_list,
         | 
| 381 | 
            +
                        "tgt_sizes": tgt_sizes,
         | 
| 382 | 
            +
                        "audio_bounds": audio_bounds_list,
         | 
| 383 | 
            +
                        "spk_bounds": spk_bounds_list,
         | 
| 384 | 
            +
                    }
         | 
| 385 | 
            +
             | 
| 386 | 
            +
                    return data
         | 
| 387 | 
            +
             | 
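
The tag handling at the top of `_convert_omni_to_inputs` relies on `re.split` with a capturing group, which keeps the matched tags in the output list; a short illustration:

import re

image_pattern = r"\(<image>./</image>\)"
audio_pattern = r"\(<audio>./</audio>\)"
split_pattern = f"({image_pattern}|{audio_pattern})"

text = "Look at (<image>./</image>) and listen to (<audio>./</audio>)."
chunks = re.split(split_pattern, text)
# ['Look at ', '(<image>./</image>)', ' and listen to ', '(<audio>./</audio>)', '.']
# chunks equal to the literal tags are then replaced by image or audio placeholders
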
| 388 | 
            +
                @property
         | 
| 389 | 
            +
                # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
         | 
| 390 | 
            +
                def model_input_names(self):
         | 
| 391 | 
            +
                    tokenizer_input_names = self.tokenizer.model_input_names
         | 
| 392 | 
            +
                    image_processor_input_names = self.image_processor.model_input_names
         | 
| 393 | 
            +
                    feature_extractor_input_names = self.feature_extractor.model_input_names
         | 
| 394 | 
            +
                    return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names + feature_extractor_input_names))
         | 
| 395 | 
            +
             | 
| 396 | 
            +
                def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
         | 
| 397 | 
            +
                    items = []
         | 
| 398 | 
            +
                    if isinstance(inputs[0], list):
         | 
| 399 | 
            +
                        assert isinstance(inputs[0][0], torch.Tensor)
         | 
| 400 | 
            +
                        for it in inputs:
         | 
| 401 | 
            +
                            for tr in it:
         | 
| 402 | 
            +
                                items.append(tr)
         | 
| 403 | 
            +
                    else:
         | 
| 404 | 
            +
                        assert isinstance(inputs[0], torch.Tensor)
         | 
| 405 | 
            +
                        items = inputs
         | 
| 406 | 
            +
             | 
| 407 | 
            +
                    batch_size = len(items)
         | 
| 408 | 
            +
                    shape = items[0].shape
         | 
| 409 | 
            +
                    dim = len(shape)
         | 
| 410 | 
            +
                    assert dim <= 2
         | 
| 411 | 
            +
                    if max_length is None:
         | 
| 412 | 
            +
                        max_length = 0
         | 
| 413 | 
            +
                    max_length = max(max_length, max(item.shape[-1] for item in items))
         | 
| 414 | 
            +
                    min_length = min(item.shape[-1] for item in items)
         | 
| 415 | 
            +
                    dtype = items[0].dtype
         | 
| 416 | 
            +
             | 
| 417 | 
            +
                    if dim == 0:
         | 
| 418 | 
            +
                        return torch.stack([item for item in items], dim=0), [0]
         | 
| 419 | 
            +
                    elif dim == 1:
         | 
| 420 | 
            +
                        if max_length == min_length:
         | 
| 421 | 
            +
                            return torch.stack([item for item in items], dim=0), [0] * batch_size
         | 
| 422 | 
            +
                        tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
         | 
| 423 | 
            +
                    else:
         | 
| 424 | 
            +
                        tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
         | 
| 425 | 
            +
             | 
| 426 | 
            +
                    padding_length = []
         | 
| 427 | 
            +
                    for i, item in enumerate(items):
         | 
| 428 | 
            +
                        if dim == 1:
         | 
| 429 | 
            +
                            if padding_side == "left":
         | 
| 430 | 
            +
                                tensor[i, -len(item) :] = item.clone()
         | 
| 431 | 
            +
                            else:
         | 
| 432 | 
            +
                                tensor[i, : len(item)] = item.clone()
         | 
| 433 | 
            +
                        elif dim == 2:
         | 
| 434 | 
            +
                            if padding_side == "left":
         | 
| 435 | 
            +
                                tensor[i, -len(item) :, :] = item.clone()
         | 
| 436 | 
            +
                            else:
         | 
| 437 | 
            +
                                tensor[i, : len(item), :] = item.clone()
         | 
| 438 | 
            +
                        padding_length.append(tensor.shape[-1] - len(item))
         | 
| 439 | 
            +
             | 
| 440 | 
            +
                    return tensor, padding_length
         | 
| 441 | 
            +
             | 
| 442 | 
            +
             | 
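
A minimal sketch of the left-padding helper above on two variable-length id tensors (the values are arbitrary):

import torch

a = torch.tensor([1, 2, 3], dtype=torch.int32)
b = torch.tensor([4, 5], dtype=torch.int32)

max_length = max(len(a), len(b))
padded = torch.zeros((2, max_length), dtype=torch.int32)       # padding_value=0
padded[0, -len(a):] = a                                        # padding_side="left"
padded[1, -len(b):] = b
padding_lengths = [max_length - len(a), max_length - len(b)]   # [0, 1]
# the caller shifts the image/audio/spk bounds by these offsets and masks the pad positions
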
| 443 | 
            +
            class MelSpectrogramFeatures(torch.nn.Module):
         | 
| 444 | 
            +
                def __init__(
         | 
| 445 | 
            +
                    self,
         | 
| 446 | 
            +
                    sample_rate=24000,
         | 
| 447 | 
            +
                    n_fft=1024,
         | 
| 448 | 
            +
                    hop_length=256,
         | 
| 449 | 
            +
                    n_mels=100,
         | 
| 450 | 
            +
                    padding: Literal["center", "same"] = "center",
         | 
| 451 | 
            +
                ):
         | 
| 452 | 
            +
                    super().__init__()
         | 
| 453 | 
            +
                    if padding not in ["center", "same"]:
         | 
| 454 | 
            +
                        raise ValueError("Padding must be 'center' or 'same'.")
         | 
| 455 | 
            +
                    self.padding = padding
         | 
| 456 | 
            +
                    self.mel_spec = torchaudio.transforms.MelSpectrogram(
         | 
| 457 | 
            +
                        sample_rate=sample_rate,
         | 
| 458 | 
            +
                        n_fft=n_fft,
         | 
| 459 | 
            +
                        hop_length=hop_length,
         | 
| 460 | 
            +
                        n_mels=n_mels,
         | 
| 461 | 
            +
                        center=padding == "center",
         | 
| 462 | 
            +
                        power=1,
         | 
| 463 | 
            +
                    )
         | 
| 464 | 
            +
             | 
| 465 | 
            +
                def __call__(self, audio: torch.Tensor) -> torch.Tensor:
         | 
| 466 | 
            +
                    """
         | 
| 467 | 
            +
                    audio: Tensor([num_channels, num_samples])
         | 
| 468 | 
            +
                    """
         | 
| 469 | 
            +
                    return super().__call__(audio)
         | 
| 470 | 
            +
             | 
| 471 | 
            +
                def forward(self, audio: torch.Tensor) -> torch.Tensor:
         | 
| 472 | 
            +
                    """
         | 
| 473 | 
            +
                    audio: Tensor([num_channels, num_samples])
         | 
| 474 | 
            +
                    """
         | 
| 475 | 
            +
                    mel: torch.Tensor = self.mel_spec(audio)
         | 
| 476 | 
            +
                    features = torch.log(torch.clip(mel, min=1e-5))
         | 
| 477 | 
            +
                    return features
         | 
| 478 | 
            +
             | 
| 479 | 
            +
             | 
| 480 | 
            +
            class ChatTTSProcessor:
         | 
| 481 | 
            +
                def __init__(self, text_tokenizer):
         | 
| 482 | 
            +
                    self.audio_processor = MelSpectrogramFeatures()
         | 
| 483 | 
            +
                    self.text_tokenizer = text_tokenizer
         | 
| 484 | 
            +
             | 
| 485 | 
            +
                def __call__(self, text_list, audio_list):
         | 
| 486 | 
            +
                    assert len(text_list) == len(audio_list)
         | 
| 487 | 
            +
                    input_ids_varlen = []
         | 
| 488 | 
            +
                    for text in text_list:
         | 
| 489 | 
            +
                        input_ids_ = self.text_tokenizer.encode(text, return_tensors="pt", add_special_tokens=False)  # [1, seq_len]
         | 
| 490 | 
            +
                        input_ids_ = input_ids_.squeeze(0)  # [seq_len]
         | 
| 491 | 
            +
                        input_ids_varlen.append(input_ids_)
         | 
| 492 | 
            +
             | 
| 493 | 
            +
                    audio_features_varlen = []
         | 
| 494 | 
            +
                    for audio in audio_list:
         | 
| 495 | 
            +
                        assert audio.dim() == 1  # [seq_len]
         | 
| 496 | 
            +
                        try:
         | 
| 497 | 
            +
                            mel = self.audio_processor(audio)  # [100(num_mel_bins), seq_len_mel]
         | 
| 498 | 
            +
                        except Exception as e:
         | 
| 499 | 
            +
                            raise e
         | 
| 500 | 
            +
                        audio_features_varlen.append(mel)
         | 
| 501 | 
            +
             | 
| 502 | 
            +
                    return {
         | 
| 503 | 
            +
                        "tts_input_ids_varlen": input_ids_varlen,  # return List[Tensor]
         | 
| 504 | 
            +
                        "tts_input_features_varlen": audio_features_varlen,  # return List[Tensor]
         | 
| 505 | 
            +
                    }
         | 
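
A short sketch of the TTS-side feature path defined above, assuming `MelSpectrogramFeatures` is importable from this module and using a 24 kHz mono waveform; with hop_length=256 and center padding, one second of audio yields 24000 // 256 + 1 = 94 mel frames:

import torch

mel_fn = MelSpectrogramFeatures()     # defaults: 24 kHz, n_fft=1024, hop_length=256, 100 mels
wav = torch.randn(1, 24000)           # [num_channels, num_samples], 1 s of noise
mel = mel_fn(wav)                     # log-mel features of shape [1, 100, 94]
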
    	
        processor_config.json
    ADDED
    
    | @@ -0,0 +1,6 @@ | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "auto_map": {
         | 
| 3 | 
            +
                "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor"
         | 
| 4 | 
            +
              },
         | 
| 5 | 
            +
              "processor_class": "MiniCPMOProcessor"
         | 
| 6 | 
            +
            }
         | 
    	
        resampler.py
    ADDED
    
    | @@ -0,0 +1,864 @@ | |
| 1 | 
            +
            # coding=utf-8
         | 
| 2 | 
            +
            # Copyright 2025 The OpenBMB Team. All rights reserved.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
            # You may obtain a copy of the License at
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
            #
         | 
| 10 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 14 | 
            +
            # limitations under the License.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            import warnings
         | 
| 17 | 
            +
            from functools import partial
         | 
| 18 | 
            +
            from typing import Optional
         | 
| 19 | 
            +
            from typing import Tuple
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            import numpy as np
         | 
| 22 | 
            +
            import torch
         | 
| 23 | 
            +
            import torch.nn.functional as F
         | 
| 24 | 
            +
            from torch import nn
         | 
| 25 | 
            +
            from torch import Tensor
         | 
| 26 | 
            +
            from torch.nn.functional import *
         | 
| 27 | 
            +
            from torch.nn.init import trunc_normal_
         | 
| 28 | 
            +
            from torch.nn.modules.activation import *
         | 
| 29 | 
            +
            from transformers.integrations import is_deepspeed_zero3_enabled
         | 
| 30 | 
            +
             | 
| 31 | 
            +
             | 
| 32 | 
            +
            def get_2d_sincos_pos_embed(embed_dim, image_size):
         | 
| 33 | 
            +
                """
         | 
| 34 | 
            +
                image_size: image_size or (image_height, image_width)
         | 
| 35 | 
            +
                return:
         | 
| 36 | 
            +
                pos_embed: [image_height, image_width, embed_dim]
         | 
| 37 | 
            +
                """
         | 
| 38 | 
            +
                if isinstance(image_size, int):
         | 
| 39 | 
            +
                    grid_h_size, grid_w_size = image_size, image_size
         | 
| 40 | 
            +
                else:
         | 
| 41 | 
            +
                    grid_h_size, grid_w_size = image_size[0], image_size[1]
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                grid_h = np.arange(grid_h_size, dtype=np.float32)
         | 
| 44 | 
            +
                grid_w = np.arange(grid_w_size, dtype=np.float32)
         | 
| 45 | 
            +
                grid = np.meshgrid(grid_w, grid_h)  # here w goes first
         | 
| 46 | 
            +
                grid = np.stack(grid, axis=0)
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
         | 
| 49 | 
            +
                return pos_embed
         | 
| 50 | 
            +
             | 
| 51 | 
            +
             | 
| 52 | 
            +
            def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
         | 
| 53 | 
            +
                assert embed_dim % 2 == 0
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                # use half of dimensions to encode grid_h
         | 
| 56 | 
            +
                emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0])  # (H, W, D/2)
         | 
| 57 | 
            +
                emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1])  # (H, W, D/2)
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
         | 
| 60 | 
            +
                return emb
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
            def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos):
         | 
| 64 | 
            +
                """
         | 
| 65 | 
            +
                embed_dim: output dimension for each position
         | 
| 66 | 
            +
                pos: a list of positions to be encoded: size (H, W)
         | 
| 67 | 
            +
                out: (H, W, D)
         | 
| 68 | 
            +
                """
         | 
| 69 | 
            +
                assert embed_dim % 2 == 0
         | 
| 70 | 
            +
                omega = np.arange(embed_dim // 2, dtype=np.float32)
         | 
| 71 | 
            +
                omega /= embed_dim / 2.0
         | 
| 72 | 
            +
                omega = 1.0 / 10000**omega  # (D/2,)
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                emb_sin = np.sin(out)  # (H, W, D/2)
         | 
| 77 | 
            +
                emb_cos = np.cos(out)  # (H, W, D/2)
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
         | 
| 80 | 
            +
                return emb
         | 
| 81 | 
            +
             | 
| 82 | 
            +
             | 
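
A quick shape check for the sincos helpers above, assuming the functions just defined are in scope (purely illustrative):

pos = get_2d_sincos_pos_embed(embed_dim=64, image_size=(3, 5))
assert pos.shape == (3, 5, 64)   # [image_height, image_width, embed_dim]
# half of the channels encode one grid axis and half the other;
# each half is a concatenation of sin and cos terms for that axis
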
| 83 | 
            +
            class Resampler(nn.Module):
         | 
| 84 | 
            +
                """
         | 
| 85 | 
            +
                A 2D perceiver-resampler network with one cross-attention layer,
         | 
| 86 | 
            +
                   given learnable queries and a 2D sincos position embedding.
         | 
| 87 | 
            +
                Outputs:
         | 
| 88 | 
            +
                    A tensor with the shape of (batch_size, num_queries, embed_dim)
         | 
| 89 | 
            +
                """
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                def __init__(
         | 
| 92 | 
            +
                    self,
         | 
| 93 | 
            +
                    num_queries,
         | 
| 94 | 
            +
                    embed_dim,
         | 
| 95 | 
            +
                    num_heads,
         | 
| 96 | 
            +
                    kv_dim=None,
         | 
| 97 | 
            +
                    norm_layer=partial(nn.LayerNorm, eps=1e-6),
         | 
| 98 | 
            +
                    adaptive=False,
         | 
| 99 | 
            +
                    max_size=(70, 70),
         | 
| 100 | 
            +
                ):
         | 
| 101 | 
            +
                    super().__init__()
         | 
| 102 | 
            +
                    self.num_queries = num_queries
         | 
| 103 | 
            +
                    self.embed_dim = embed_dim
         | 
| 104 | 
            +
                    self.num_heads = num_heads
         | 
| 105 | 
            +
                    self.adaptive = adaptive
         | 
| 106 | 
            +
                    self.max_size = max_size
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                    self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                    if kv_dim is not None and kv_dim != embed_dim:
         | 
| 111 | 
            +
                        self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
         | 
| 112 | 
            +
                    else:
         | 
| 113 | 
            +
                        self.kv_proj = nn.Identity()
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                    self.attn = MultiheadAttention(embed_dim, num_heads)
         | 
| 116 | 
            +
                    self.ln_q = norm_layer(embed_dim)
         | 
| 117 | 
            +
                    self.ln_kv = norm_layer(embed_dim)
         | 
| 118 | 
            +
             | 
| 119 | 
            +
                    self.ln_post = norm_layer(embed_dim)
         | 
| 120 | 
            +
                    self.proj = nn.Parameter((embed_dim**-0.5) * torch.randn(embed_dim, embed_dim))
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                    self._set_2d_pos_cache(self.max_size)
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                def _set_2d_pos_cache(self, max_size, device="cpu"):
         | 
| 125 | 
            +
                    if is_deepspeed_zero3_enabled():
         | 
| 126 | 
            +
                        device = "cuda"
         | 
| 127 | 
            +
                    pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float().to(device)
         | 
| 128 | 
            +
                    self.register_buffer("pos_embed", pos_embed, persistent=False)
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                def _adjust_pos_cache(self, tgt_sizes, device):
         | 
| 131 | 
            +
                    max_h = torch.max(tgt_sizes[:, 0])
         | 
| 132 | 
            +
                    max_w = torch.max(tgt_sizes[:, 1])
         | 
| 133 | 
            +
                    if max_h > self.max_size[0] or max_w > self.max_size[1]:
         | 
| 134 | 
            +
                        self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])]
         | 
| 135 | 
            +
                        self._set_2d_pos_cache(self.max_size, device)
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                def _init_weights(self, m):
         | 
| 138 | 
            +
                    if isinstance(m, nn.Linear):
         | 
| 139 | 
            +
                        trunc_normal_(m.weight, std=0.02)
         | 
| 140 | 
            +
                        if isinstance(m, nn.Linear) and m.bias is not None:
         | 
| 141 | 
            +
                            nn.init.constant_(m.bias, 0)
         | 
| 142 | 
            +
                    elif isinstance(m, nn.LayerNorm):
         | 
| 143 | 
            +
                        nn.init.constant_(m.bias, 0)
         | 
| 144 | 
            +
                        nn.init.constant_(m.weight, 1.0)
         | 
| 145 | 
            +
             | 
| 146 | 
            +
                def forward(self, x, tgt_sizes=None):
         | 
| 147 | 
            +
                    assert x.shape[0] == tgt_sizes.shape[0]
         | 
| 148 | 
            +
                    bs = x.shape[0]
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                    device = x.device
         | 
| 151 | 
            +
                    dtype = x.dtype
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                    patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
         | 
| 154 | 
            +
             | 
| 155 | 
            +
                    self._adjust_pos_cache(tgt_sizes, device=device)
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                    max_patch_len = torch.max(patch_len)
         | 
| 158 | 
            +
                    key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool, device=device)
         | 
| 159 | 
            +
             | 
| 160 | 
            +
                    pos_embed = []
         | 
| 161 | 
            +
                    for i in range(bs):
         | 
| 162 | 
            +
                        tgt_h, tgt_w = tgt_sizes[i]
         | 
| 163 | 
            +
                        pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype))  # patches * D
         | 
| 164 | 
            +
                        key_padding_mask[i, patch_len[i] :] = True
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                    pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
         | 
| 167 | 
            +
                        1, 0, 2
         | 
| 168 | 
            +
                    )  # BLD => L * B * D
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                    x = self.kv_proj(x)  # B * L * D
         | 
| 171 | 
            +
                    x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                    q = self.ln_q(self.query)  # Q * D
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                    out = self.attn(
         | 
| 176 | 
            +
                        self._repeat(q, bs),  # Q * B * D
         | 
| 177 | 
            +
                        x + pos_embed,  # L * B * D +  L * B * D
         | 
| 178 | 
            +
                        x,
         | 
| 179 | 
            +
                        key_padding_mask=key_padding_mask,
         | 
| 180 | 
            +
                    )[0]
         | 
| 181 | 
            +
                    #  out: Q * B * D
         | 
| 182 | 
            +
                    x = out.permute(1, 0, 2)  # B * Q * D
         | 
| 183 | 
            +
             | 
| 184 | 
            +
                    x = self.ln_post(x)
         | 
| 185 | 
            +
                    x = x @ self.proj
         | 
| 186 | 
            +
                    return x
         | 
| 187 | 
            +
             | 
| 188 | 
            +
                def _repeat(self, query, N: int):
         | 
| 189 | 
            +
                    return query.unsqueeze(1).repeat(1, N, 1)
         | 
| 190 | 
            +
             | 
| 191 | 
            +
             | 
| 192 | 
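            # Illustrative sketch (hypothetical helper, assuming the module-level torch import): how
            # Resampler.forward handles a batch of variable-sized patch grids. Each image contributes
            # h * w patch tokens (tgt_sizes[i] == (h, w)); shorter sequences are right-padded and the
            # padded positions are marked in key_padding_mask so the cross-attention ignores them.
            # Note that tgt_sizes is required at call time despite its None default, and the output
            # always has a fixed shape of (batch, number of learned queries, embed_dim).
            def _sketch_key_padding_mask(tgt_sizes: torch.Tensor) -> torch.Tensor:
                # tgt_sizes: (B, 2) integer tensor of per-image (h, w) grid sizes, e.g. [[24, 32], [16, 16]]
                patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]  # tokens contributed per image
                key_padding_mask = torch.zeros((tgt_sizes.shape[0], int(patch_len.max())), dtype=torch.bool)
                for i in range(tgt_sizes.shape[0]):
                    key_padding_mask[i, patch_len[i]:] = True  # True marks padded (ignored) positions
                return key_padding_mask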
            +
            class MultiheadAttention(nn.MultiheadAttention):
         | 
| 193 | 
            +
                def __init__(
         | 
| 194 | 
            +
                    self,
         | 
| 195 | 
            +
                    embed_dim,
         | 
| 196 | 
            +
                    num_heads,
         | 
| 197 | 
            +
                    dropout=0.0,
         | 
| 198 | 
            +
                    bias=True,
         | 
| 199 | 
            +
                    add_bias_kv=False,
         | 
| 200 | 
            +
                    add_zero_attn=False,
         | 
| 201 | 
            +
                    kdim=None,
         | 
| 202 | 
            +
                    vdim=None,
         | 
| 203 | 
            +
                    batch_first=False,
         | 
| 204 | 
            +
                    device=None,
         | 
| 205 | 
            +
                    dtype=None,
         | 
| 206 | 
            +
                ):
         | 
| 207 | 
            +
                    super().__init__(
         | 
| 208 | 
            +
                        embed_dim, num_heads, dropout, bias, add_bias_kv, add_zero_attn, kdim, vdim, batch_first, device, dtype
         | 
| 209 | 
            +
                    )
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                     # rewrite the out_proj layer with a plain nn.Linear
         | 
| 212 | 
            +
                    self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                def forward(
         | 
| 215 | 
            +
                    self,
         | 
| 216 | 
            +
                    query: Tensor,
         | 
| 217 | 
            +
                    key: Tensor,
         | 
| 218 | 
            +
                    value: Tensor,
         | 
| 219 | 
            +
                    key_padding_mask: Optional[Tensor] = None,
         | 
| 220 | 
            +
                    need_weights: bool = True,
         | 
| 221 | 
            +
                    attn_mask: Optional[Tensor] = None,
         | 
| 222 | 
            +
                    average_attn_weights: bool = True,
         | 
| 223 | 
            +
                    is_causal: bool = False,
         | 
| 224 | 
            +
                ) -> Tuple[Tensor, Optional[Tensor]]:
         | 
| 225 | 
            +
                    why_not_fast_path = ""
         | 
| 226 | 
            +
                    if (
         | 
| 227 | 
            +
                        (attn_mask is not None and torch.is_floating_point(attn_mask))
         | 
| 228 | 
            +
                        or (key_padding_mask is not None)
         | 
| 229 | 
            +
                        and torch.is_floating_point(key_padding_mask)
         | 
| 230 | 
            +
                    ):
         | 
| 231 | 
            +
                        why_not_fast_path = "floating-point masks are not supported for fast path."
         | 
| 232 | 
            +
             | 
| 233 | 
            +
                    is_batched = query.dim() == 3
         | 
| 234 | 
            +
             | 
| 235 | 
            +
                    key_padding_mask = _canonical_mask(
         | 
| 236 | 
            +
                        mask=key_padding_mask,
         | 
| 237 | 
            +
                        mask_name="key_padding_mask",
         | 
| 238 | 
            +
                        other_type=F._none_or_dtype(attn_mask),
         | 
| 239 | 
            +
                        other_name="attn_mask",
         | 
| 240 | 
            +
                        target_type=query.dtype,
         | 
| 241 | 
            +
                    )
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                    attn_mask = _canonical_mask(
         | 
| 244 | 
            +
                        mask=attn_mask,
         | 
| 245 | 
            +
                        mask_name="attn_mask",
         | 
| 246 | 
            +
                        other_type=None,
         | 
| 247 | 
            +
                        other_name="",
         | 
| 248 | 
            +
                        target_type=query.dtype,
         | 
| 249 | 
            +
                        check_other=False,
         | 
| 250 | 
            +
                    )
         | 
| 251 | 
            +
             | 
| 252 | 
            +
                    if not is_batched:
         | 
| 253 | 
            +
                        why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
         | 
| 254 | 
            +
                    elif query is not key or key is not value:
         | 
| 255 | 
            +
                        # When lifting this restriction, don't forget to either
         | 
| 256 | 
            +
                        # enforce that the dtypes all match or test cases where
         | 
| 257 | 
            +
                        # they don't!
         | 
| 258 | 
            +
                        why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
         | 
| 259 | 
            +
                    elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
         | 
| 260 | 
            +
                        why_not_fast_path = (
         | 
| 261 | 
            +
                            f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
         | 
| 262 | 
            +
                        )
         | 
| 263 | 
            +
                    elif self.in_proj_weight is None:
         | 
| 264 | 
            +
                        why_not_fast_path = "in_proj_weight was None"
         | 
| 265 | 
            +
                    elif query.dtype != self.in_proj_weight.dtype:
         | 
| 266 | 
            +
                        # this case will fail anyway, but at least they'll get a useful error message.
         | 
| 267 | 
            +
                        why_not_fast_path = (
         | 
| 268 | 
            +
                            f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
         | 
| 269 | 
            +
                        )
         | 
| 270 | 
            +
                    elif self.training:
         | 
| 271 | 
            +
                        why_not_fast_path = "training is enabled"
         | 
| 272 | 
            +
                    elif (self.num_heads % 2) != 0:
         | 
| 273 | 
            +
                        why_not_fast_path = "self.num_heads is not even"
         | 
| 274 | 
            +
                    elif not self.batch_first:
         | 
| 275 | 
            +
                        why_not_fast_path = "batch_first was not True"
         | 
| 276 | 
            +
                    elif self.bias_k is not None:
         | 
| 277 | 
            +
                        why_not_fast_path = "self.bias_k was not None"
         | 
| 278 | 
            +
                    elif self.bias_v is not None:
         | 
| 279 | 
            +
                        why_not_fast_path = "self.bias_v was not None"
         | 
| 280 | 
            +
                    elif self.add_zero_attn:
         | 
| 281 | 
            +
                        why_not_fast_path = "add_zero_attn was enabled"
         | 
| 282 | 
            +
                    elif not self._qkv_same_embed_dim:
         | 
| 283 | 
            +
                        why_not_fast_path = "_qkv_same_embed_dim was not True"
         | 
| 284 | 
            +
                    elif query.is_nested and (key_padding_mask is not None or attn_mask is not None):
         | 
| 285 | 
            +
                        why_not_fast_path = "supplying both src_key_padding_mask and src_mask at the same time \
         | 
| 286 | 
            +
                                             is not supported with NestedTensor input"
         | 
| 287 | 
            +
                    elif torch.is_autocast_enabled():
         | 
| 288 | 
            +
                        why_not_fast_path = "autocast is enabled"
         | 
| 289 | 
            +
             | 
| 290 | 
            +
                    if not why_not_fast_path:
         | 
| 291 | 
            +
                        tensor_args = (
         | 
| 292 | 
            +
                            query,
         | 
| 293 | 
            +
                            key,
         | 
| 294 | 
            +
                            value,
         | 
| 295 | 
            +
                            self.in_proj_weight,
         | 
| 296 | 
            +
                            self.in_proj_bias,
         | 
| 297 | 
            +
                            self.out_proj.weight,
         | 
| 298 | 
            +
                            self.out_proj.bias,
         | 
| 299 | 
            +
                        )
         | 
| 300 | 
            +
                        # We have to use list comprehensions below because TorchScript does not support
         | 
| 301 | 
            +
                        # generator expressions.
         | 
| 302 | 
            +
                        if torch.overrides.has_torch_function(tensor_args):
         | 
| 303 | 
            +
                            why_not_fast_path = "some Tensor argument has_torch_function"
         | 
| 304 | 
            +
                        elif _is_make_fx_tracing():
         | 
| 305 | 
            +
                            why_not_fast_path = "we are running make_fx tracing"
         | 
| 306 | 
            +
                        elif not all(_check_arg_device(x) for x in tensor_args):
         | 
| 307 | 
            +
                            why_not_fast_path = (
         | 
| 308 | 
            +
                                "some Tensor argument's device is neither one of "
         | 
| 309 | 
            +
                                f"cpu, cuda or {torch.utils.backend_registration._privateuse1_backend_name}"
         | 
| 310 | 
            +
                            )
         | 
| 311 | 
            +
                        elif torch.is_grad_enabled() and any(_arg_requires_grad(x) for x in tensor_args):
         | 
| 312 | 
            +
                            why_not_fast_path = (
         | 
| 313 | 
            +
                                "grad is enabled and at least one of query or the "
         | 
| 314 | 
            +
                                "input/output projection weights or biases requires_grad"
         | 
| 315 | 
            +
                            )
         | 
| 316 | 
            +
                        if not why_not_fast_path:
         | 
| 317 | 
            +
                            merged_mask, mask_type = self.merge_masks(attn_mask, key_padding_mask, query)
         | 
| 318 | 
            +
             | 
| 319 | 
            +
                            if self.in_proj_bias is not None and self.in_proj_weight is not None:
         | 
| 320 | 
            +
                                return torch._native_multi_head_attention(
         | 
| 321 | 
            +
                                    query,
         | 
| 322 | 
            +
                                    key,
         | 
| 323 | 
            +
                                    value,
         | 
| 324 | 
            +
                                    self.embed_dim,
         | 
| 325 | 
            +
                                    self.num_heads,
         | 
| 326 | 
            +
                                    self.in_proj_weight,
         | 
| 327 | 
            +
                                    self.in_proj_bias,
         | 
| 328 | 
            +
                                    self.out_proj.weight,
         | 
| 329 | 
            +
                                    self.out_proj.bias,
         | 
| 330 | 
            +
                                    merged_mask,
         | 
| 331 | 
            +
                                    need_weights,
         | 
| 332 | 
            +
                                    average_attn_weights,
         | 
| 333 | 
            +
                                    mask_type,
         | 
| 334 | 
            +
                                )
         | 
| 335 | 
            +
             | 
| 336 | 
            +
                    any_nested = query.is_nested or key.is_nested or value.is_nested
         | 
| 337 | 
            +
                    assert not any_nested, (
         | 
| 338 | 
            +
                        "MultiheadAttention does not support NestedTensor outside of its fast path. "
         | 
| 339 | 
            +
                        + f"The fast path was not hit because {why_not_fast_path}"
         | 
| 340 | 
            +
                    )
         | 
| 341 | 
            +
             | 
| 342 | 
            +
                    if self.batch_first and is_batched:
         | 
| 343 | 
            +
                        # make sure that the transpose op does not affect the "is" property
         | 
| 344 | 
            +
                        if key is value:
         | 
| 345 | 
            +
                            if query is key:
         | 
| 346 | 
            +
                                query = key = value = query.transpose(1, 0)
         | 
| 347 | 
            +
                            else:
         | 
| 348 | 
            +
                                query, key = (x.transpose(1, 0) for x in (query, key))
         | 
| 349 | 
            +
                                value = key
         | 
| 350 | 
            +
                        else:
         | 
| 351 | 
            +
                            query, key, value = (x.transpose(1, 0) for x in (query, key, value))
         | 
| 352 | 
            +
             | 
| 353 | 
            +
                    if not self._qkv_same_embed_dim:
         | 
| 354 | 
            +
                        attn_output, attn_output_weights = self.multi_head_attention_forward(
         | 
| 355 | 
            +
                            query,
         | 
| 356 | 
            +
                            key,
         | 
| 357 | 
            +
                            value,
         | 
| 358 | 
            +
                            self.embed_dim,
         | 
| 359 | 
            +
                            self.num_heads,
         | 
| 360 | 
            +
                            self.in_proj_weight,
         | 
| 361 | 
            +
                            self.in_proj_bias,
         | 
| 362 | 
            +
                            self.bias_k,
         | 
| 363 | 
            +
                            self.bias_v,
         | 
| 364 | 
            +
                            self.add_zero_attn,
         | 
| 365 | 
            +
                            self.dropout,
         | 
| 366 | 
            +
                            self.out_proj.weight,
         | 
| 367 | 
            +
                            self.out_proj.bias,
         | 
| 368 | 
            +
                            training=self.training,
         | 
| 369 | 
            +
                            key_padding_mask=key_padding_mask,
         | 
| 370 | 
            +
                            need_weights=need_weights,
         | 
| 371 | 
            +
                            attn_mask=attn_mask,
         | 
| 372 | 
            +
                            use_separate_proj_weight=True,
         | 
| 373 | 
            +
                            q_proj_weight=self.q_proj_weight,
         | 
| 374 | 
            +
                            k_proj_weight=self.k_proj_weight,
         | 
| 375 | 
            +
                            v_proj_weight=self.v_proj_weight,
         | 
| 376 | 
            +
                            average_attn_weights=average_attn_weights,
         | 
| 377 | 
            +
                            is_causal=is_causal,
         | 
| 378 | 
            +
                        )
         | 
| 379 | 
            +
                    else:
         | 
| 380 | 
            +
                        attn_output, attn_output_weights = self.multi_head_attention_forward(
         | 
| 381 | 
            +
                            query,
         | 
| 382 | 
            +
                            key,
         | 
| 383 | 
            +
                            value,
         | 
| 384 | 
            +
                            self.embed_dim,
         | 
| 385 | 
            +
                            self.num_heads,
         | 
| 386 | 
            +
                            self.in_proj_weight,
         | 
| 387 | 
            +
                            self.in_proj_bias,
         | 
| 388 | 
            +
                            self.bias_k,
         | 
| 389 | 
            +
                            self.bias_v,
         | 
| 390 | 
            +
                            self.add_zero_attn,
         | 
| 391 | 
            +
                            self.dropout,
         | 
| 392 | 
            +
                            self.out_proj.weight,
         | 
| 393 | 
            +
                            self.out_proj.bias,
         | 
| 394 | 
            +
                            training=self.training,
         | 
| 395 | 
            +
                            key_padding_mask=key_padding_mask,
         | 
| 396 | 
            +
                            need_weights=need_weights,
         | 
| 397 | 
            +
                            attn_mask=attn_mask,
         | 
| 398 | 
            +
                            average_attn_weights=average_attn_weights,
         | 
| 399 | 
            +
                            is_causal=is_causal,
         | 
| 400 | 
            +
                        )
         | 
| 401 | 
            +
                    if self.batch_first and is_batched:
         | 
| 402 | 
            +
                        return attn_output.transpose(1, 0), attn_output_weights
         | 
| 403 | 
            +
                    else:
         | 
| 404 | 
            +
                        return attn_output, attn_output_weights
         | 
| 405 | 
            +
             | 
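                 # The method below closely follows torch.nn.functional.multi_head_attention_forward; the
                 # practical difference is that the output projection is routed through self.out_proj (the
                 # nn.Linear installed in __init__) instead of a functional linear on out_proj_weight/bias.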
| 406 | 
            +
                def multi_head_attention_forward(
         | 
| 407 | 
            +
                    self,
         | 
| 408 | 
            +
                    query: Tensor,
         | 
| 409 | 
            +
                    key: Tensor,
         | 
| 410 | 
            +
                    value: Tensor,
         | 
| 411 | 
            +
                    embed_dim_to_check: int,
         | 
| 412 | 
            +
                    num_heads: int,
         | 
| 413 | 
            +
                    in_proj_weight: Optional[Tensor],
         | 
| 414 | 
            +
                    in_proj_bias: Optional[Tensor],
         | 
| 415 | 
            +
                    bias_k: Optional[Tensor],
         | 
| 416 | 
            +
                    bias_v: Optional[Tensor],
         | 
| 417 | 
            +
                    add_zero_attn: bool,
         | 
| 418 | 
            +
                    dropout_p: float,
         | 
| 419 | 
            +
                    out_proj_weight: Tensor,
         | 
| 420 | 
            +
                    out_proj_bias: Optional[Tensor],
         | 
| 421 | 
            +
                    training: bool = True,
         | 
| 422 | 
            +
                    key_padding_mask: Optional[Tensor] = None,
         | 
| 423 | 
            +
                    need_weights: bool = True,
         | 
| 424 | 
            +
                    attn_mask: Optional[Tensor] = None,
         | 
| 425 | 
            +
                    use_separate_proj_weight: bool = False,
         | 
| 426 | 
            +
                    q_proj_weight: Optional[Tensor] = None,
         | 
| 427 | 
            +
                    k_proj_weight: Optional[Tensor] = None,
         | 
| 428 | 
            +
                    v_proj_weight: Optional[Tensor] = None,
         | 
| 429 | 
            +
                    static_k: Optional[Tensor] = None,
         | 
| 430 | 
            +
                    static_v: Optional[Tensor] = None,
         | 
| 431 | 
            +
                    average_attn_weights: bool = True,
         | 
| 432 | 
            +
                    is_causal: bool = False,
         | 
| 433 | 
            +
                ) -> Tuple[Tensor, Optional[Tensor]]:
         | 
| 434 | 
            +
                    tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
         | 
| 435 | 
            +
             | 
| 436 | 
            +
                    is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
         | 
| 437 | 
            +
             | 
| 438 | 
            +
                    # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
         | 
| 439 | 
            +
                     # is batched, run the computation, and before returning squeeze the
         | 
| 440 | 
            +
                    # batch dimension so that the output doesn't carry this temporary batch dimension.
         | 
| 441 | 
            +
                    if not is_batched:
         | 
| 442 | 
            +
                        # unsqueeze if the input is unbatched
         | 
| 443 | 
            +
                        query = query.unsqueeze(1)
         | 
| 444 | 
            +
                        key = key.unsqueeze(1)
         | 
| 445 | 
            +
                        value = value.unsqueeze(1)
         | 
| 446 | 
            +
                        if key_padding_mask is not None:
         | 
| 447 | 
            +
                            key_padding_mask = key_padding_mask.unsqueeze(0)
         | 
| 448 | 
            +
             | 
| 449 | 
            +
                    # set up shape vars
         | 
| 450 | 
            +
                    tgt_len, bsz, embed_dim = query.shape
         | 
| 451 | 
            +
                    src_len, _, _ = key.shape
         | 
| 452 | 
            +
             | 
| 453 | 
            +
                    key_padding_mask = _canonical_mask(
         | 
| 454 | 
            +
                        mask=key_padding_mask,
         | 
| 455 | 
            +
                        mask_name="key_padding_mask",
         | 
| 456 | 
            +
                        other_type=F._none_or_dtype(attn_mask),
         | 
| 457 | 
            +
                        other_name="attn_mask",
         | 
| 458 | 
            +
                        target_type=query.dtype,
         | 
| 459 | 
            +
                    )
         | 
| 460 | 
            +
             | 
| 461 | 
            +
                    if is_causal and attn_mask is None:
         | 
| 462 | 
            +
                        raise RuntimeError(
         | 
| 463 | 
            +
                            "Need attn_mask if specifying the is_causal hint. "
         | 
| 464 | 
            +
                            "You may use the Transformer module method "
         | 
| 465 | 
            +
                            "`generate_square_subsequent_mask` to create this mask."
         | 
| 466 | 
            +
                        )
         | 
| 467 | 
            +
             | 
| 468 | 
            +
                    if is_causal and key_padding_mask is None and not need_weights:
         | 
| 469 | 
            +
                     # when we have a key_padding_mask or need weights, we need attn_mask
         | 
| 470 | 
            +
                     # Otherwise, we pass the is_causal hint through as the is_causal
         | 
| 471 | 
            +
                        # indicator to SDPA.
         | 
| 472 | 
            +
                        attn_mask = None
         | 
| 473 | 
            +
                    else:
         | 
| 474 | 
            +
                        attn_mask = _canonical_mask(
         | 
| 475 | 
            +
                            mask=attn_mask,
         | 
| 476 | 
            +
                            mask_name="attn_mask",
         | 
| 477 | 
            +
                            other_type=None,
         | 
| 478 | 
            +
                            other_name="",
         | 
| 479 | 
            +
                            target_type=query.dtype,
         | 
| 480 | 
            +
                            check_other=False,
         | 
| 481 | 
            +
                        )
         | 
| 482 | 
            +
             | 
| 483 | 
            +
                        if key_padding_mask is not None:
         | 
| 484 | 
            +
                            # We have the attn_mask, and use that to merge kpm into it.
         | 
| 485 | 
            +
                            # Turn off use of is_causal hint, as the merged mask is no
         | 
| 486 | 
            +
                            # longer causal.
         | 
| 487 | 
            +
                            is_causal = False
         | 
| 488 | 
            +
             | 
| 489 | 
            +
                    assert (
         | 
| 490 | 
            +
                        embed_dim == embed_dim_to_check
         | 
| 491 | 
            +
                    ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
         | 
| 492 | 
            +
                    if isinstance(embed_dim, torch.Tensor):
         | 
| 493 | 
            +
                        # embed_dim can be a tensor when JIT tracing
         | 
| 494 | 
            +
                        head_dim = embed_dim.div(num_heads, rounding_mode="trunc")
         | 
| 495 | 
            +
                    else:
         | 
| 496 | 
            +
                        head_dim = embed_dim // num_heads
         | 
| 497 | 
            +
                    assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
         | 
| 498 | 
            +
                    if use_separate_proj_weight:
         | 
| 499 | 
            +
                        # allow MHA to have different embedding dimensions when separate projection weights are used
         | 
| 500 | 
            +
                        assert (
         | 
| 501 | 
            +
                            key.shape[:2] == value.shape[:2]
         | 
| 502 | 
            +
                        ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
         | 
| 503 | 
            +
                    else:
         | 
| 504 | 
            +
                        assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
         | 
| 505 | 
            +
             | 
| 506 | 
            +
                    #
         | 
| 507 | 
            +
                    # compute in-projection
         | 
| 508 | 
            +
                    #
         | 
| 509 | 
            +
                    if not use_separate_proj_weight:
         | 
| 510 | 
            +
                        assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
         | 
| 511 | 
            +
                        q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
         | 
| 512 | 
            +
                    else:
         | 
| 513 | 
            +
                        assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
         | 
| 514 | 
            +
                        assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
         | 
| 515 | 
            +
                        assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
         | 
| 516 | 
            +
                        if in_proj_bias is None:
         | 
| 517 | 
            +
                            b_q = b_k = b_v = None
         | 
| 518 | 
            +
                        else:
         | 
| 519 | 
            +
                            b_q, b_k, b_v = in_proj_bias.chunk(3)
         | 
| 520 | 
            +
                        q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
         | 
| 521 | 
            +
             | 
| 522 | 
            +
                    # prep attention mask
         | 
| 523 | 
            +
             | 
| 524 | 
            +
                    if attn_mask is not None:
         | 
| 525 | 
            +
                        # ensure attn_mask's dim is 3
         | 
| 526 | 
            +
                        if attn_mask.dim() == 2:
         | 
| 527 | 
            +
                            correct_2d_size = (tgt_len, src_len)
         | 
| 528 | 
            +
                            if attn_mask.shape != correct_2d_size:
         | 
| 529 | 
            +
                                raise RuntimeError(
         | 
| 530 | 
            +
                                    f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}."
         | 
| 531 | 
            +
                                )
         | 
| 532 | 
            +
                            attn_mask = attn_mask.unsqueeze(0)
         | 
| 533 | 
            +
                        elif attn_mask.dim() == 3:
         | 
| 534 | 
            +
                            correct_3d_size = (bsz * num_heads, tgt_len, src_len)
         | 
| 535 | 
            +
                            if attn_mask.shape != correct_3d_size:
         | 
| 536 | 
            +
                                raise RuntimeError(
         | 
| 537 | 
            +
                                    f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
         | 
| 538 | 
            +
                                )
         | 
| 539 | 
            +
                        else:
         | 
| 540 | 
            +
                            raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
         | 
| 541 | 
            +
             | 
| 542 | 
            +
                    # add bias along batch dimension (currently second)
         | 
| 543 | 
            +
                    if bias_k is not None and bias_v is not None:
         | 
| 544 | 
            +
                        assert static_k is None, "bias cannot be added to static key."
         | 
| 545 | 
            +
                        assert static_v is None, "bias cannot be added to static value."
         | 
| 546 | 
            +
                        k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
         | 
| 547 | 
            +
                        v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
         | 
| 548 | 
            +
                        if attn_mask is not None:
         | 
| 549 | 
            +
                            attn_mask = pad(attn_mask, (0, 1))
         | 
| 550 | 
            +
                        if key_padding_mask is not None:
         | 
| 551 | 
            +
                            key_padding_mask = pad(key_padding_mask, (0, 1))
         | 
| 552 | 
            +
                    else:
         | 
| 553 | 
            +
                        assert bias_k is None
         | 
| 554 | 
            +
                        assert bias_v is None
         | 
| 555 | 
            +
             | 
| 556 | 
            +
                    #
         | 
| 557 | 
            +
                     # reshape q, k, v for multi-head attention and make them batch first
         | 
| 558 | 
            +
                    #
         | 
| 559 | 
            +
                    q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
         | 
| 560 | 
            +
                    if static_k is None:
         | 
| 561 | 
            +
                        k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
         | 
| 562 | 
            +
                    else:
         | 
| 563 | 
            +
                        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
         | 
| 564 | 
            +
                        assert (
         | 
| 565 | 
            +
                            static_k.size(0) == bsz * num_heads
         | 
| 566 | 
            +
                        ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
         | 
| 567 | 
            +
                        assert static_k.size(2) == head_dim, f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
         | 
| 568 | 
            +
                        k = static_k
         | 
| 569 | 
            +
                    if static_v is None:
         | 
| 570 | 
            +
                        v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
         | 
| 571 | 
            +
                    else:
         | 
| 572 | 
            +
                        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
         | 
| 573 | 
            +
                        assert (
         | 
| 574 | 
            +
                            static_v.size(0) == bsz * num_heads
         | 
| 575 | 
            +
                        ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
         | 
| 576 | 
            +
                        assert static_v.size(2) == head_dim, f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
         | 
| 577 | 
            +
                        v = static_v
         | 
| 578 | 
            +
             | 
| 579 | 
            +
                    # add zero attention along batch dimension (now first)
         | 
| 580 | 
            +
                    if add_zero_attn:
         | 
| 581 | 
            +
                        zero_attn_shape = (bsz * num_heads, 1, head_dim)
         | 
| 582 | 
            +
                        k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
         | 
| 583 | 
            +
                        v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
         | 
| 584 | 
            +
                        if attn_mask is not None:
         | 
| 585 | 
            +
                            attn_mask = pad(attn_mask, (0, 1))
         | 
| 586 | 
            +
                        if key_padding_mask is not None:
         | 
| 587 | 
            +
                            key_padding_mask = pad(key_padding_mask, (0, 1))
         | 
| 588 | 
            +
             | 
| 589 | 
            +
                    # update source sequence length after adjustments
         | 
| 590 | 
            +
                    src_len = k.size(1)
         | 
| 591 | 
            +
             | 
| 592 | 
            +
                    # merge key padding and attention masks
         | 
| 593 | 
            +
                    if key_padding_mask is not None:
         | 
| 594 | 
            +
                        assert key_padding_mask.shape == (
         | 
| 595 | 
            +
                            bsz,
         | 
| 596 | 
            +
                            src_len,
         | 
| 597 | 
            +
                        ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
         | 
| 598 | 
            +
                        key_padding_mask = (
         | 
| 599 | 
            +
                            key_padding_mask.view(bsz, 1, 1, src_len)
         | 
| 600 | 
            +
                            .expand(-1, num_heads, -1, -1)
         | 
| 601 | 
            +
                            .reshape(bsz * num_heads, 1, src_len)
         | 
| 602 | 
            +
                        )
         | 
| 603 | 
            +
                        if attn_mask is None:
         | 
| 604 | 
            +
                            attn_mask = key_padding_mask
         | 
| 605 | 
            +
                        else:
         | 
| 606 | 
            +
                            attn_mask = attn_mask + key_padding_mask
         | 
| 607 | 
            +
             | 
| 608 | 
            +
                    # adjust dropout probability
         | 
| 609 | 
            +
                    if not training:
         | 
| 610 | 
            +
                        dropout_p = 0.0
         | 
| 611 | 
            +
             | 
| 612 | 
            +
                    #
         | 
| 613 | 
            +
                    # (deep breath) calculate attention and out projection
         | 
| 614 | 
            +
                    #
         | 
| 615 | 
            +
             | 
| 616 | 
            +
                    if need_weights:
         | 
| 617 | 
            +
                        B, Nt, E = q.shape
         | 
| 618 | 
            +
                        q_scaled = q / math.sqrt(E)
         | 
| 619 | 
            +
             | 
| 620 | 
            +
                        assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
         | 
| 621 | 
            +
             | 
| 622 | 
            +
                        if attn_mask is not None:
         | 
| 623 | 
            +
                            attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
         | 
| 624 | 
            +
                        else:
         | 
| 625 | 
            +
                            attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
         | 
| 626 | 
            +
                        attn_output_weights = softmax(attn_output_weights, dim=-1)
         | 
| 627 | 
            +
                        if dropout_p > 0.0:
         | 
| 628 | 
            +
                            attn_output_weights = dropout(attn_output_weights, p=dropout_p)
         | 
| 629 | 
            +
             | 
| 630 | 
            +
                        attn_output = torch.bmm(attn_output_weights, v)
         | 
| 631 | 
            +
             | 
| 632 | 
            +
                        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
         | 
| 633 | 
            +
                        attn_output = self.out_proj(attn_output)
         | 
| 634 | 
            +
                        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
         | 
| 635 | 
            +
             | 
| 636 | 
            +
                        # optionally average attention weights over heads
         | 
| 637 | 
            +
                        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
         | 
| 638 | 
            +
                        if average_attn_weights:
         | 
| 639 | 
            +
                            attn_output_weights = attn_output_weights.mean(dim=1)
         | 
| 640 | 
            +
             | 
| 641 | 
            +
                        if not is_batched:
         | 
| 642 | 
            +
                            # squeeze the output if input was unbatched
         | 
| 643 | 
            +
                            attn_output = attn_output.squeeze(1)
         | 
| 644 | 
            +
                            attn_output_weights = attn_output_weights.squeeze(0)
         | 
| 645 | 
            +
                        return attn_output, attn_output_weights
         | 
| 646 | 
            +
                    else:
         | 
| 647 | 
            +
                        # attn_mask can be either (L,S) or (N*num_heads, L, S)
         | 
| 648 | 
            +
                        # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S)
         | 
| 649 | 
            +
                        # in order to match the input for SDPA of (N, num_heads, L, S)
         | 
| 650 | 
            +
                        if attn_mask is not None:
         | 
| 651 | 
            +
                            if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
         | 
| 652 | 
            +
                                attn_mask = attn_mask.unsqueeze(0)
         | 
| 653 | 
            +
                            else:
         | 
| 654 | 
            +
                                attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
         | 
| 655 | 
            +
             | 
| 656 | 
            +
                        q = q.view(bsz, num_heads, tgt_len, head_dim)
         | 
| 657 | 
            +
                        k = k.view(bsz, num_heads, src_len, head_dim)
         | 
| 658 | 
            +
                        v = v.view(bsz, num_heads, src_len, head_dim)
         | 
| 659 | 
            +
             | 
| 660 | 
            +
                        attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
         | 
| 661 | 
            +
                        attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
         | 
| 662 | 
            +
             | 
| 663 | 
            +
                        attn_output = self.out_proj(attn_output)
         | 
| 664 | 
            +
                        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
         | 
| 665 | 
            +
                        if not is_batched:
         | 
| 666 | 
            +
                            # squeeze the output if input was unbatched
         | 
| 667 | 
            +
                            attn_output = attn_output.squeeze(1)
         | 
| 668 | 
            +
                        return attn_output, None
         | 
| 669 | 
            +
             | 
| 670 | 
            +
             | 
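            # Illustrative sketch (hypothetical helper, assuming the module-level torch import): in eval
            # mode with batch_first=True, self-attention inputs and gradients disabled, the overridden
            # forward can take the fused fast path; otherwise it falls back to multi_head_attention_forward,
            # which materialises the attention matrix when need_weights=True and uses
            # F.scaled_dot_product_attention (returning None for the weights) when need_weights=False.
            def _sketch_attention_paths():
                mha = MultiheadAttention(embed_dim=32, num_heads=4, batch_first=True)
                x = torch.randn(2, 6, 32)
                mha.eval()
                with torch.no_grad():
                    out_fast, _ = mha(x, x, x)  # fast-path eligible
                mha.train()  # "training is enabled" -> slow path below
                out_explicit, weights = mha(x, x, x, need_weights=True)   # weights: (B, L, L), head-averaged
                out_sdpa, no_weights = mha(x, x, x, need_weights=False)   # no_weights is None
                return out_fast, weights, no_weights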
| 671 | 
            +
            def _mha_shape_check(
         | 
| 672 | 
            +
                query: Tensor,
         | 
| 673 | 
            +
                key: Tensor,
         | 
| 674 | 
            +
                value: Tensor,
         | 
| 675 | 
            +
                key_padding_mask: Optional[Tensor],
         | 
| 676 | 
            +
                attn_mask: Optional[Tensor],
         | 
| 677 | 
            +
                num_heads: int,
         | 
| 678 | 
            +
            ):
         | 
| 679 | 
            +
                # Verifies the expected shape for `query, `key`, `value`, `key_padding_mask` and `attn_mask`
         | 
| 680 | 
            +
                # and returns if the input is batched or not.
         | 
| 681 | 
            +
                # Raises an error if `query` is not 2-D (unbatched) or 3-D (batched) tensor.
         | 
| 682 | 
            +
             | 
| 683 | 
            +
                # Shape check.
         | 
| 684 | 
            +
                if query.dim() == 3:
         | 
| 685 | 
            +
                    # Batched Inputs
         | 
| 686 | 
            +
                    is_batched = True
         | 
| 687 | 
            +
                    assert key.dim() == 3 and value.dim() == 3, (
         | 
| 688 | 
            +
                        "For batched (3-D) `query`, expected `key` and `value` to be 3-D"
         | 
| 689 | 
            +
                        f" but found {key.dim()}-D and {value.dim()}-D tensors respectively"
         | 
| 690 | 
            +
                    )
         | 
| 691 | 
            +
                    if key_padding_mask is not None:
         | 
| 692 | 
            +
                        assert key_padding_mask.dim() == 2, (
         | 
| 693 | 
            +
                            "For batched (3-D) `query`, expected `key_padding_mask` to be `None` or 2-D"
         | 
| 694 | 
            +
                            f" but found {key_padding_mask.dim()}-D tensor instead"
         | 
| 695 | 
            +
                        )
         | 
| 696 | 
            +
                    if attn_mask is not None:
         | 
| 697 | 
            +
                        assert attn_mask.dim() in (2, 3), (
         | 
| 698 | 
            +
                            "For batched (3-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
         | 
| 699 | 
            +
                            f" but found {attn_mask.dim()}-D tensor instead"
         | 
| 700 | 
            +
                        )
         | 
| 701 | 
            +
                elif query.dim() == 2:
         | 
| 702 | 
            +
                    # Unbatched Inputs
         | 
| 703 | 
            +
                    is_batched = False
         | 
| 704 | 
            +
                    assert key.dim() == 2 and value.dim() == 2, (
         | 
| 705 | 
            +
                        "For unbatched (2-D) `query`, expected `key` and `value` to be 2-D"
         | 
| 706 | 
            +
                        f" but found {key.dim()}-D and {value.dim()}-D tensors respectively"
         | 
| 707 | 
            +
                    )
         | 
| 708 | 
            +
             | 
| 709 | 
            +
                    if key_padding_mask is not None:
         | 
| 710 | 
            +
                        assert key_padding_mask.dim() == 1, (
         | 
| 711 | 
            +
                            "For unbatched (2-D) `query`, expected `key_padding_mask` to be `None` or 1-D"
         | 
| 712 | 
            +
                            f" but found {key_padding_mask.dim()}-D tensor instead"
         | 
| 713 | 
            +
                        )
         | 
| 714 | 
            +
             | 
| 715 | 
            +
                    if attn_mask is not None:
         | 
| 716 | 
            +
                        assert attn_mask.dim() in (2, 3), (
         | 
| 717 | 
            +
                            "For unbatched (2-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
         | 
| 718 | 
            +
                            f" but found {attn_mask.dim()}-D tensor instead"
         | 
| 719 | 
            +
                        )
         | 
| 720 | 
            +
                        if attn_mask.dim() == 3:
         | 
| 721 | 
            +
                            expected_shape = (num_heads, query.shape[0], key.shape[0])
         | 
| 722 | 
            +
                            assert (
         | 
| 723 | 
            +
                                attn_mask.shape == expected_shape
         | 
| 724 | 
            +
                            ), f"Expected `attn_mask` shape to be {expected_shape} but got {attn_mask.shape}"
         | 
| 725 | 
            +
                else:
         | 
| 726 | 
            +
                    raise AssertionError(
         | 
| 727 | 
            +
                        f"query should be unbatched 2D or batched 3D tensor but received {query.dim()}-D query tensor"
         | 
| 728 | 
            +
                    )
         | 
| 729 | 
            +
             | 
| 730 | 
            +
                return is_batched
         | 
| 731 | 
            +
             | 
| 732 | 
            +
             | 
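            # Illustrative sketch (hypothetical helper, assuming the module-level torch import):
            # _mha_shape_check only reports whether the inputs are batched (3-D query) or unbatched
            # (2-D query); any shape mismatch raises an AssertionError instead of being returned.
            def _sketch_shape_check() -> None:
                q3 = torch.randn(4, 2, 16)  # (L, B, E) -> batched
                q2 = torch.randn(4, 16)     # (L, E)    -> unbatched
                assert _mha_shape_check(q3, q3, q3, None, None, num_heads=4) is True
                assert _mha_shape_check(q2, q2, q2, None, None, num_heads=4) is False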
| 733 | 
            +
            def _canonical_mask(
         | 
| 734 | 
            +
                mask: Optional[Tensor],
         | 
| 735 | 
            +
                mask_name: str,
         | 
| 736 | 
            +
                other_type: Optional[DType],
         | 
| 737 | 
            +
                other_name: str,
         | 
| 738 | 
            +
                target_type: DType,
         | 
| 739 | 
            +
                check_other: bool = True,
         | 
| 740 | 
            +
            ) -> Optional[Tensor]:
         | 
| 741 | 
            +
             | 
| 742 | 
            +
                if mask is not None:
         | 
| 743 | 
            +
                    _mask_dtype = mask.dtype
         | 
| 744 | 
            +
                    _mask_is_float = torch.is_floating_point(mask)
         | 
| 745 | 
            +
                    if _mask_dtype != torch.bool and not _mask_is_float:
         | 
| 746 | 
            +
                        raise AssertionError(f"only bool and floating types of {mask_name} are supported")
         | 
| 747 | 
            +
                    if check_other and other_type is not None:
         | 
| 748 | 
            +
                        if _mask_dtype != other_type:
         | 
| 749 | 
            +
                            warnings.warn(
         | 
| 750 | 
            +
                                f"Support for mismatched {mask_name} and {other_name} "
         | 
| 751 | 
            +
                                "is deprecated. Use same type for both instead."
         | 
| 752 | 
            +
                            )
         | 
| 753 | 
            +
                    if not _mask_is_float:
         | 
| 754 | 
            +
                        mask = torch.zeros_like(mask, dtype=target_type).masked_fill_(mask, float("-inf"))
         | 
| 755 | 
            +
                return mask
         | 
| 756 | 
            +
             | 
| 757 | 
            +
             | 
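            # Illustrative sketch (hypothetical helper, assuming the module-level torch import):
            # _canonical_mask turns a boolean mask into an additive float mask of target_type, mapping
            # True (masked-out) positions to -inf so they vanish after softmax; float masks pass through.
            def _sketch_canonical_mask() -> torch.Tensor:
                bool_mask = torch.tensor([[False, False, True]])
                return _canonical_mask(
                    mask=bool_mask,
                    mask_name="key_padding_mask",
                    other_type=None,
                    other_name="",
                    target_type=torch.float32,
                )  # -> tensor([[0., 0., -inf]])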
| 758 | 
            +
            def _in_projection_packed(
         | 
| 759 | 
            +
                q: Tensor,
         | 
| 760 | 
            +
                k: Tensor,
         | 
| 761 | 
            +
                v: Tensor,
         | 
| 762 | 
            +
                w: Tensor,
         | 
| 763 | 
            +
                b: Optional[Tensor] = None,
         | 
| 764 | 
            +
            ) -> List[Tensor]:
         | 
| 765 | 
            +
                r"""
         | 
| 766 | 
            +
                Performs the in-projection step of the attention operation, using packed weights.
         | 
| 767 | 
            +
                Output is a triple containing projection tensors for query, key and value.
         | 
| 768 | 
            +
                Args:
         | 
| 769 | 
            +
                    q, k, v: query, key and value tensors to be projected. For self-attention,
         | 
| 770 | 
            +
                        these are typically the same tensor; for encoder-decoder attention,
         | 
| 771 | 
            +
                        k and v are typically the same tensor. (We take advantage of these
         | 
| 772 | 
            +
                        identities for performance if they are present.) Regardless, q, k and v
         | 
| 773 | 
            +
                        must share a common embedding dimension; otherwise their shapes may vary.
         | 
| 774 | 
            +
                    w: projection weights for q, k and v, packed into a single tensor. Weights
         | 
| 775 | 
            +
                        are packed along dimension 0, in q, k, v order.
         | 
| 776 | 
            +
                    b: optional projection biases for q, k and v, packed into a single tensor
         | 
| 777 | 
            +
                        in q, k, v order.
         | 
| 778 | 
            +
                Shape:
         | 
| 779 | 
            +
                    Inputs:
         | 
| 780 | 
            +
                    - q: :math:`(..., E)` where E is the embedding dimension
         | 
| 781 | 
            +
                    - k: :math:`(..., E)` where E is the embedding dimension
         | 
| 782 | 
            +
                    - v: :math:`(..., E)` where E is the embedding dimension
         | 
| 783 | 
            +
                    - w: :math:`(E * 3, E)` where E is the embedding dimension
         | 
| 784 | 
            +
                    - b: :math:`E * 3` where E is the embedding dimension
         | 
| 785 | 
            +
                    Output:
         | 
| 786 | 
            +
                    - in output list :math:`[q', k', v']`, each output tensor will have the
         | 
| 787 | 
            +
                        same shape as the corresponding input tensor.
         | 
| 788 | 
            +
                """
         | 
| 789 | 
            +
                E = q.size(-1)
         | 
| 790 | 
            +
                if k is v:
         | 
| 791 | 
            +
                    if q is k:
         | 
| 792 | 
            +
                        # self-attention
         | 
| 793 | 
            +
                        proj = linear(q, w, b)
         | 
| 794 | 
            +
                         # reshaping to (3, E) rather than (E, 3) is deliberate for better memory coalescing and to keep the same order as chunk()
         | 
| 795 | 
            +
                        proj = proj.unflatten(-1, (3, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
         | 
| 796 | 
            +
                        return proj[0], proj[1], proj[2]
         | 
| 797 | 
            +
                    else:
         | 
| 798 | 
            +
                        # encoder-decoder attention
         | 
| 799 | 
            +
                        w_q, w_kv = w.split([E, E * 2])
         | 
| 800 | 
            +
                        if b is None:
         | 
| 801 | 
            +
                            b_q = b_kv = None
         | 
| 802 | 
            +
                        else:
         | 
| 803 | 
            +
                            b_q, b_kv = b.split([E, E * 2])
         | 
| 804 | 
            +
                        q_proj = linear(q, w_q, b_q)
         | 
| 805 | 
            +
                        kv_proj = linear(k, w_kv, b_kv)
         | 
| 806 | 
            +
                        # reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk()
         | 
| 807 | 
            +
                        kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
         | 
| 808 | 
            +
                        return (q_proj, kv_proj[0], kv_proj[1])
         | 
| 809 | 
            +
                else:
         | 
| 810 | 
            +
                    w_q, w_k, w_v = w.chunk(3)
         | 
| 811 | 
            +
                    if b is None:
         | 
| 812 | 
            +
                        b_q = b_k = b_v = None
         | 
| 813 | 
            +
                    else:
         | 
| 814 | 
            +
                        b_q, b_k, b_v = b.chunk(3)
         | 
| 815 | 
            +
                    return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
         | 
| 816 | 
            +
             | 
| 817 | 
            +
             | 
| 818 | 
            +
            def _in_projection(
         | 
| 819 | 
            +
                q: Tensor,
         | 
| 820 | 
            +
                k: Tensor,
         | 
| 821 | 
            +
                v: Tensor,
         | 
| 822 | 
            +
                w_q: Tensor,
         | 
| 823 | 
            +
                w_k: Tensor,
         | 
| 824 | 
            +
                w_v: Tensor,
         | 
| 825 | 
            +
                b_q: Optional[Tensor] = None,
         | 
| 826 | 
            +
                b_k: Optional[Tensor] = None,
         | 
| 827 | 
            +
                b_v: Optional[Tensor] = None,
         | 
| 828 | 
            +
            ) -> Tuple[Tensor, Tensor, Tensor]:
         | 
| 829 | 
            +
                r"""
         | 
| 830 | 
            +
                Performs the in-projection step of the attention operation. This is simply
         | 
| 831 | 
            +
                a triple of linear projections, with shape constraints on the weights which
         | 
| 832 | 
            +
                ensure embedding dimension uniformity in the projected outputs.
         | 
| 833 | 
            +
                Output is a triple containing projection tensors for query, key and value.
         | 
| 834 | 
            +
                Args:
         | 
| 835 | 
            +
                    q, k, v: query, key and value tensors to be projected.
         | 
| 836 | 
            +
                    w_q, w_k, w_v: weights for q, k and v, respectively.
         | 
| 837 | 
            +
                    b_q, b_k, b_v: optional biases for q, k and v, respectively.
         | 
| 838 | 
            +
                Shape:
         | 
| 839 | 
            +
                    Inputs:
         | 
| 840 | 
            +
                    - q: :math:`(Qdims..., Eq)` where Eq is the query embedding dimension and Qdims are any
         | 
| 841 | 
            +
                        number of leading dimensions.
         | 
| 842 | 
            +
                    - k: :math:`(Kdims..., Ek)` where Ek is the key embedding dimension and Kdims are any
         | 
| 843 | 
            +
                        number of leading dimensions.
         | 
| 844 | 
            +
                    - v: :math:`(Vdims..., Ev)` where Ev is the value embedding dimension and Vdims are any
         | 
| 845 | 
            +
                        number of leading dimensions.
         | 
| 846 | 
            +
                    - w_q: :math:`(Eq, Eq)`
         | 
| 847 | 
            +
                    - w_k: :math:`(Eq, Ek)`
         | 
| 848 | 
            +
                    - w_v: :math:`(Eq, Ev)`
         | 
| 849 | 
            +
                    - b_q: :math:`(Eq)`
         | 
| 850 | 
            +
                    - b_k: :math:`(Eq)`
         | 
| 851 | 
            +
                    - b_v: :math:`(Eq)`
         | 
| 852 | 
            +
                    Output: in output triple :math:`(q', k', v')`,
         | 
| 853 | 
            +
                     - q': :math:`[Qdims..., Eq]`
         | 
| 854 | 
            +
                     - k': :math:`[Kdims..., Eq]`
         | 
| 855 | 
            +
                     - v': :math:`[Vdims..., Eq]`
         | 
| 856 | 
            +
                """
         | 
| 857 | 
            +
                Eq, Ek, Ev = q.size(-1), k.size(-1), v.size(-1)
         | 
| 858 | 
            +
                assert w_q.shape == (Eq, Eq), f"expecting query weights shape of {(Eq, Eq)}, but got {w_q.shape}"
         | 
| 859 | 
            +
                assert w_k.shape == (Eq, Ek), f"expecting key weights shape of {(Eq, Ek)}, but got {w_k.shape}"
         | 
| 860 | 
            +
                assert w_v.shape == (Eq, Ev), f"expecting value weights shape of {(Eq, Ev)}, but got {w_v.shape}"
         | 
| 861 | 
            +
                assert b_q is None or b_q.shape == (Eq,), f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
         | 
| 862 | 
            +
                assert b_k is None or b_k.shape == (Eq,), f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
         | 
| 863 | 
            +
                assert b_v is None or b_v.shape == (Eq,), f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
         | 
| 864 | 
            +
                return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
         | 
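            A minimal sketch of what the packed self-attention branch above computes (not part of the uploaded files; the tensor sizes are arbitrary assumptions): one linear over the packed weights followed by a split into q', k', v' with the documented shapes. It uses `unbind` instead of the transpose dance in the source, which is functionally equivalent for illustrating the shapes.

            import torch
            import torch.nn.functional as F

            E = 8                                 # hypothetical embedding dimension
            x = torch.randn(2, 5, E)              # q = k = v for self-attention: (batch, seq, E)
            w = torch.randn(3 * E, E)             # q/k/v weights packed along dim 0
            b = torch.randn(3 * E)                # q/k/v biases packed the same way

            proj = F.linear(x, w, b)              # (..., 3*E): one matmul instead of three
            q, k, v = proj.unflatten(-1, (3, E)).unbind(dim=-2)   # same split order as chunk()
            assert q.shape == k.shape == v.shape == x.shape       # each output matches its input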
    	
        special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,74 @@ | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "additional_special_tokens": [
         | 
| 3 | 
            +
                "<image>",
         | 
| 4 | 
            +
                "</image>",
         | 
| 5 | 
            +
                "<ref>",
         | 
| 6 | 
            +
                "</ref>",
         | 
| 7 | 
            +
                "<box>",
         | 
| 8 | 
            +
                "</box>",
         | 
| 9 | 
            +
                "<quad>",
         | 
| 10 | 
            +
                "</quad>",
         | 
| 11 | 
            +
                "<point>",
         | 
| 12 | 
            +
                "</point>",
         | 
| 13 | 
            +
                "<slice>",
         | 
| 14 | 
            +
                "</slice>",
         | 
| 15 | 
            +
                "<image_id>",
         | 
| 16 | 
            +
                "</image_id>",
         | 
| 17 | 
            +
                "<unit>",
         | 
| 18 | 
            +
                "</unit>",
         | 
| 19 | 
            +
                "<asr>",
         | 
| 20 | 
            +
                "</asr>",
         | 
| 21 | 
            +
                "<query>",
         | 
| 22 | 
            +
                "</query>",
         | 
| 23 | 
            +
                "<|audio_start|>",
         | 
| 24 | 
            +
                "<|audio|>",
         | 
| 25 | 
            +
                "<|audio_end|>",
         | 
| 26 | 
            +
                "<|spk_bos|>",
         | 
| 27 | 
            +
                "<|spk|>",
         | 
| 28 | 
            +
                "<|spk_eos|>",
         | 
| 29 | 
            +
                "<|tts_bos|>",
         | 
| 30 | 
            +
                "<|tts_eos|>",
         | 
| 31 | 
            +
                "<|listen|>",
         | 
| 32 | 
            +
                "<|speak|>",
         | 
| 33 | 
            +
                "<|interrupt|>",
         | 
| 34 | 
            +
                "<|vad_start|>",
         | 
| 35 | 
            +
                "<|vad_end|>",
         | 
| 36 | 
            +
                "<reserved_43>",
         | 
| 37 | 
            +
                "<reserved_53>",
         | 
| 38 | 
            +
                {
         | 
| 39 | 
            +
                  "content": "<|im_end|>",
         | 
| 40 | 
            +
                  "lstrip": false,
         | 
| 41 | 
            +
                  "normalized": false,
         | 
| 42 | 
            +
                  "rstrip": false,
         | 
| 43 | 
            +
                  "single_word": false
         | 
| 44 | 
            +
                }
         | 
| 45 | 
            +
              ],
         | 
| 46 | 
            +
              "bos_token": {
         | 
| 47 | 
            +
                "content": "<|im_start|>",
         | 
| 48 | 
            +
                "lstrip": false,
         | 
| 49 | 
            +
                "normalized": false,
         | 
| 50 | 
            +
                "rstrip": false,
         | 
| 51 | 
            +
                "single_word": false
         | 
| 52 | 
            +
              },
         | 
| 53 | 
            +
              "eos_token": {
         | 
| 54 | 
            +
                "content": "<|im_end|>",
         | 
| 55 | 
            +
                "lstrip": false,
         | 
| 56 | 
            +
                "normalized": false,
         | 
| 57 | 
            +
                "rstrip": false,
         | 
| 58 | 
            +
                "single_word": false
         | 
| 59 | 
            +
              },
         | 
| 60 | 
            +
              "pad_token": {
         | 
| 61 | 
            +
                "content": "<|endoftext|>",
         | 
| 62 | 
            +
                "lstrip": false,
         | 
| 63 | 
            +
                "normalized": false,
         | 
| 64 | 
            +
                "rstrip": false,
         | 
| 65 | 
            +
                "single_word": false
         | 
| 66 | 
            +
              },
         | 
| 67 | 
            +
              "unk_token": {
         | 
| 68 | 
            +
                "content": "<unk>",
         | 
| 69 | 
            +
                "lstrip": false,
         | 
| 70 | 
            +
                "normalized": false,
         | 
| 71 | 
            +
                "rstrip": false,
         | 
| 72 | 
            +
                "single_word": false
         | 
| 73 | 
            +
              }
         | 
| 74 | 
            +
            }
         | 
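            A short sketch of how this map reads once the repo is checked out locally (the relative path is an assumption; not part of the upload): bos/eos/pad/unk each carry a full token spec, and additional_special_tokens mixes bare strings with one AddedToken-style dict for <|im_end|>.

            import json

            with open("special_tokens_map.json") as f:   # assumes a local checkout of this repo
                stm = json.load(f)

            print(stm["bos_token"]["content"], stm["eos_token"]["content"])   # <|im_start|> <|im_end|>
            print(stm["pad_token"]["content"], stm["unk_token"]["content"])   # <|endoftext|> <unk>

            extras = stm["additional_special_tokens"]
            print(len(extras), [t for t in extras if isinstance(t, dict)])    # dict entry is <|im_end|>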
    	
        tokenization_minicpmo_fast.py
    ADDED
    
    | @@ -0,0 +1,110 @@ | |
| 1 | 
            +
            # coding=utf-8
         | 
| 2 | 
            +
            # Copyright 2025 The OpenBMB Team. All rights reserved.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
            # You may obtain a copy of the License at
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
            #
         | 
| 10 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 14 | 
            +
            # limitations under the License.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            from transformers import Qwen2TokenizerFast
         | 
| 17 | 
            +
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            class MiniCPMOTokenizerFast(Qwen2TokenizerFast):
         | 
| 20 | 
            +
                def __init__(self, **kwargs):
         | 
| 21 | 
            +
                    super().__init__(**kwargs)
         | 
| 22 | 
            +
                    # image
         | 
| 23 | 
            +
                    self.im_start = "<image>"
         | 
| 24 | 
            +
                    self.im_end = "</image>"
         | 
| 25 | 
            +
                    self.ref_start = "<ref>"
         | 
| 26 | 
            +
                    self.ref_end = "</ref>"
         | 
| 27 | 
            +
                    self.box_start = "<box>"
         | 
| 28 | 
            +
                    self.box_end = "</box>"
         | 
| 29 | 
            +
                    self.quad_start = "<quad>"
         | 
| 30 | 
            +
                    self.quad_end = "</quad>"
         | 
| 31 | 
            +
                    self.slice_start = "<slice>"
         | 
| 32 | 
            +
                    self.slice_end = "</slice>"
         | 
| 33 | 
            +
                    self.im_id_start = "<image_id>"
         | 
| 34 | 
            +
                    self.im_id_end = "</image_id>"
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                    # audio
         | 
| 37 | 
            +
                    self.audio_start = "<|audio_start|>"
         | 
| 38 | 
            +
                    self.audio_end = "<|audio_end|>"
         | 
| 39 | 
            +
                    self.spk_start = "<|spk_bos|>"
         | 
| 40 | 
            +
                    self.spk_end = "<|spk_eos|>"
         | 
| 41 | 
            +
                    self.tts_start = "<|tts_bos|>"
         | 
| 42 | 
            +
                    self.tts_end = "<|tts_eos|>"
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                @property
         | 
| 45 | 
            +
                def eos_id(self):
         | 
| 46 | 
            +
                    return self.eos_token_id
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                @property
         | 
| 49 | 
            +
                def bos_id(self):
         | 
| 50 | 
            +
                    return self.bos_token_id
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                @property
         | 
| 53 | 
            +
                def unk_id(self):
         | 
| 54 | 
            +
                    return self.unk_token_id
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                @property
         | 
| 57 | 
            +
                def im_start_id(self):
         | 
| 58 | 
            +
                    return self.convert_tokens_to_ids(self.im_start)
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                @property
         | 
| 61 | 
            +
                def im_end_id(self):
         | 
| 62 | 
            +
                    return self.convert_tokens_to_ids(self.im_end)
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                @property
         | 
| 65 | 
            +
                def slice_start_id(self):
         | 
| 66 | 
            +
                    return self.convert_tokens_to_ids(self.slice_start)
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                @property
         | 
| 69 | 
            +
                def slice_end_id(self):
         | 
| 70 | 
            +
                    return self.convert_tokens_to_ids(self.slice_end)
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                @property
         | 
| 73 | 
            +
                def im_id_start_id(self):
         | 
| 74 | 
            +
                    return self.convert_tokens_to_ids(self.im_id_start)
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                @property
         | 
| 77 | 
            +
                def im_id_end_id(self):
         | 
| 78 | 
            +
                    return self.convert_tokens_to_ids(self.im_id_end)
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                @property
         | 
| 81 | 
            +
                def audio_start_id(self):
         | 
| 82 | 
            +
                    return self.convert_tokens_to_ids(self.audio_start)
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                @property
         | 
| 85 | 
            +
                def audio_end_id(self):
         | 
| 86 | 
            +
                    return self.convert_tokens_to_ids(self.audio_end)
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                @property
         | 
| 89 | 
            +
                def spk_start_id(self):
         | 
| 90 | 
            +
                    return self.convert_tokens_to_ids(self.spk_start)
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                @property
         | 
| 93 | 
            +
                def spk_end_id(self):
         | 
| 94 | 
            +
                    return self.convert_tokens_to_ids(self.spk_end)
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                @property
         | 
| 97 | 
            +
                def tts_start_id(self):
         | 
| 98 | 
            +
                    return self.convert_tokens_to_ids(self.tts_start)
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                @property
         | 
| 101 | 
            +
                def tts_end_id(self):
         | 
| 102 | 
            +
                    return self.convert_tokens_to_ids(self.tts_end)
         | 
| 103 | 
            +
             | 
| 104 | 
            +
                @staticmethod
         | 
| 105 | 
            +
                def escape(text: str) -> str:
         | 
| 106 | 
            +
                    return text
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                @staticmethod
         | 
| 109 | 
            +
                def unescape(text: str) -> str:
         | 
| 110 | 
            +
                    return text
         | 
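            A usage sketch (the "." path is a placeholder for a local checkout of this checkpoint; not part of the upload): loading with trust_remote_code routes AutoTokenizer to MiniCPMOTokenizerFast via the auto_map in tokenizer_config.json below, so the convenience properties resolve the multimodal marker ids directly.

            from transformers import AutoTokenizer

            # trust_remote_code pulls in tokenization_minicpmo_fast.MiniCPMOTokenizerFast
            tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)

            print(tok.im_start_id, tok.im_end_id)          # ids of <image> / </image>
            print(tok.slice_start_id, tok.slice_end_id)    # ids of <slice> / </slice>
            print(tok.audio_start_id, tok.audio_end_id)    # ids of <|audio_start|> / <|audio_end|>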
    	
        tokenizer.json
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:e43057e8380937b8edf2c9ea6ad34a3875d89c390e97024dfac307f3a4a11321
         | 
| 3 | 
            +
            size 11428583
         | 
    	
        tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,527 @@ | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "add_bos_token": false,
         | 
| 3 | 
            +
              "add_prefix_space": false,
         | 
| 4 | 
            +
              "added_tokens_decoder": {
         | 
| 5 | 
            +
                "128244": {
         | 
| 6 | 
            +
                  "content": "<unk>",
         | 
| 7 | 
            +
                  "lstrip": false,
         | 
| 8 | 
            +
                  "normalized": false,
         | 
| 9 | 
            +
                  "rstrip": false,
         | 
| 10 | 
            +
                  "single_word": false,
         | 
| 11 | 
            +
                  "special": true
         | 
| 12 | 
            +
                },
         | 
| 13 | 
            +
                "151643": {
         | 
| 14 | 
            +
                  "content": "<|endoftext|>",
         | 
| 15 | 
            +
                  "lstrip": false,
         | 
| 16 | 
            +
                  "normalized": false,
         | 
| 17 | 
            +
                  "rstrip": false,
         | 
| 18 | 
            +
                  "single_word": false,
         | 
| 19 | 
            +
                  "special": true
         | 
| 20 | 
            +
                },
         | 
| 21 | 
            +
                "151644": {
         | 
| 22 | 
            +
                  "content": "<|im_start|>",
         | 
| 23 | 
            +
                  "lstrip": false,
         | 
| 24 | 
            +
                  "normalized": false,
         | 
| 25 | 
            +
                  "rstrip": false,
         | 
| 26 | 
            +
                  "single_word": false,
         | 
| 27 | 
            +
                  "special": true
         | 
| 28 | 
            +
                },
         | 
| 29 | 
            +
                "151645": {
         | 
| 30 | 
            +
                  "content": "<|im_end|>",
         | 
| 31 | 
            +
                  "lstrip": false,
         | 
| 32 | 
            +
                  "normalized": false,
         | 
| 33 | 
            +
                  "rstrip": false,
         | 
| 34 | 
            +
                  "single_word": false,
         | 
| 35 | 
            +
                  "special": true
         | 
| 36 | 
            +
                },
         | 
| 37 | 
            +
                "151646": {
         | 
| 38 | 
            +
                  "content": "<|object_ref_start|>",
         | 
| 39 | 
            +
                  "lstrip": false,
         | 
| 40 | 
            +
                  "normalized": false,
         | 
| 41 | 
            +
                  "rstrip": false,
         | 
| 42 | 
            +
                  "single_word": false,
         | 
| 43 | 
            +
                  "special": true
         | 
| 44 | 
            +
                },
         | 
| 45 | 
            +
                "151647": {
         | 
| 46 | 
            +
                  "content": "<|object_ref_end|>",
         | 
| 47 | 
            +
                  "lstrip": false,
         | 
| 48 | 
            +
                  "normalized": false,
         | 
| 49 | 
            +
                  "rstrip": false,
         | 
| 50 | 
            +
                  "single_word": false,
         | 
| 51 | 
            +
                  "special": true
         | 
| 52 | 
            +
                },
         | 
| 53 | 
            +
                "151648": {
         | 
| 54 | 
            +
                  "content": "<|box_start|>",
         | 
| 55 | 
            +
                  "lstrip": false,
         | 
| 56 | 
            +
                  "normalized": false,
         | 
| 57 | 
            +
                  "rstrip": false,
         | 
| 58 | 
            +
                  "single_word": false,
         | 
| 59 | 
            +
                  "special": true
         | 
| 60 | 
            +
                },
         | 
| 61 | 
            +
                "151649": {
         | 
| 62 | 
            +
                  "content": "<|box_end|>",
         | 
| 63 | 
            +
                  "lstrip": false,
         | 
| 64 | 
            +
                  "normalized": false,
         | 
| 65 | 
            +
                  "rstrip": false,
         | 
| 66 | 
            +
                  "single_word": false,
         | 
| 67 | 
            +
                  "special": true
         | 
| 68 | 
            +
                },
         | 
| 69 | 
            +
                "151650": {
         | 
| 70 | 
            +
                  "content": "<|quad_start|>",
         | 
| 71 | 
            +
                  "lstrip": false,
         | 
| 72 | 
            +
                  "normalized": false,
         | 
| 73 | 
            +
                  "rstrip": false,
         | 
| 74 | 
            +
                  "single_word": false,
         | 
| 75 | 
            +
                  "special": true
         | 
| 76 | 
            +
                },
         | 
| 77 | 
            +
                "151651": {
         | 
| 78 | 
            +
                  "content": "<|quad_end|>",
         | 
| 79 | 
            +
                  "lstrip": false,
         | 
| 80 | 
            +
                  "normalized": false,
         | 
| 81 | 
            +
                  "rstrip": false,
         | 
| 82 | 
            +
                  "single_word": false,
         | 
| 83 | 
            +
                  "special": true
         | 
| 84 | 
            +
                },
         | 
| 85 | 
            +
                "151652": {
         | 
| 86 | 
            +
                  "content": "<|vision_start|>",
         | 
| 87 | 
            +
                  "lstrip": false,
         | 
| 88 | 
            +
                  "normalized": false,
         | 
| 89 | 
            +
                  "rstrip": false,
         | 
| 90 | 
            +
                  "single_word": false,
         | 
| 91 | 
            +
                  "special": true
         | 
| 92 | 
            +
                },
         | 
| 93 | 
            +
                "151653": {
         | 
| 94 | 
            +
                  "content": "<|vision_end|>",
         | 
| 95 | 
            +
                  "lstrip": false,
         | 
| 96 | 
            +
                  "normalized": false,
         | 
| 97 | 
            +
                  "rstrip": false,
         | 
| 98 | 
            +
                  "single_word": false,
         | 
| 99 | 
            +
                  "special": true
         | 
| 100 | 
            +
                },
         | 
| 101 | 
            +
                "151654": {
         | 
| 102 | 
            +
                  "content": "<|vision_pad|>",
         | 
| 103 | 
            +
                  "lstrip": false,
         | 
| 104 | 
            +
                  "normalized": false,
         | 
| 105 | 
            +
                  "rstrip": false,
         | 
| 106 | 
            +
                  "single_word": false,
         | 
| 107 | 
            +
                  "special": true
         | 
| 108 | 
            +
                },
         | 
| 109 | 
            +
                "151655": {
         | 
| 110 | 
            +
                  "content": "<|image_pad|>",
         | 
| 111 | 
            +
                  "lstrip": false,
         | 
| 112 | 
            +
                  "normalized": false,
         | 
| 113 | 
            +
                  "rstrip": false,
         | 
| 114 | 
            +
                  "single_word": false,
         | 
| 115 | 
            +
                  "special": true
         | 
| 116 | 
            +
                },
         | 
| 117 | 
            +
                "151656": {
         | 
| 118 | 
            +
                  "content": "<|video_pad|>",
         | 
| 119 | 
            +
                  "lstrip": false,
         | 
| 120 | 
            +
                  "normalized": false,
         | 
| 121 | 
            +
                  "rstrip": false,
         | 
| 122 | 
            +
                  "single_word": false,
         | 
| 123 | 
            +
                  "special": true
         | 
| 124 | 
            +
                },
         | 
| 125 | 
            +
                "151657": {
         | 
| 126 | 
            +
                  "content": "<tool_call>",
         | 
| 127 | 
            +
                  "lstrip": false,
         | 
| 128 | 
            +
                  "normalized": false,
         | 
| 129 | 
            +
                  "rstrip": false,
         | 
| 130 | 
            +
                  "single_word": false,
         | 
| 131 | 
            +
                  "special": false
         | 
| 132 | 
            +
                },
         | 
| 133 | 
            +
                "151658": {
         | 
| 134 | 
            +
                  "content": "</tool_call>",
         | 
| 135 | 
            +
                  "lstrip": false,
         | 
| 136 | 
            +
                  "normalized": false,
         | 
| 137 | 
            +
                  "rstrip": false,
         | 
| 138 | 
            +
                  "single_word": false,
         | 
| 139 | 
            +
                  "special": false
         | 
| 140 | 
            +
                },
         | 
| 141 | 
            +
                "151659": {
         | 
| 142 | 
            +
                  "content": "<|fim_prefix|>",
         | 
| 143 | 
            +
                  "lstrip": false,
         | 
| 144 | 
            +
                  "normalized": false,
         | 
| 145 | 
            +
                  "rstrip": false,
         | 
| 146 | 
            +
                  "single_word": false,
         | 
| 147 | 
            +
                  "special": false
         | 
| 148 | 
            +
                },
         | 
| 149 | 
            +
                "151660": {
         | 
| 150 | 
            +
                  "content": "<|fim_middle|>",
         | 
| 151 | 
            +
                  "lstrip": false,
         | 
| 152 | 
            +
                  "normalized": false,
         | 
| 153 | 
            +
                  "rstrip": false,
         | 
| 154 | 
            +
                  "single_word": false,
         | 
| 155 | 
            +
                  "special": false
         | 
| 156 | 
            +
                },
         | 
| 157 | 
            +
                "151661": {
         | 
| 158 | 
            +
                  "content": "<|fim_suffix|>",
         | 
| 159 | 
            +
                  "lstrip": false,
         | 
| 160 | 
            +
                  "normalized": false,
         | 
| 161 | 
            +
                  "rstrip": false,
         | 
| 162 | 
            +
                  "single_word": false,
         | 
| 163 | 
            +
                  "special": false
         | 
| 164 | 
            +
                },
         | 
| 165 | 
            +
                "151662": {
         | 
| 166 | 
            +
                  "content": "<|fim_pad|>",
         | 
| 167 | 
            +
                  "lstrip": false,
         | 
| 168 | 
            +
                  "normalized": false,
         | 
| 169 | 
            +
                  "rstrip": false,
         | 
| 170 | 
            +
                  "single_word": false,
         | 
| 171 | 
            +
                  "special": false
         | 
| 172 | 
            +
                },
         | 
| 173 | 
            +
                "151663": {
         | 
| 174 | 
            +
                  "content": "<|repo_name|>",
         | 
| 175 | 
            +
                  "lstrip": false,
         | 
| 176 | 
            +
                  "normalized": false,
         | 
| 177 | 
            +
                  "rstrip": false,
         | 
| 178 | 
            +
                  "single_word": false,
         | 
| 179 | 
            +
                  "special": false
         | 
| 180 | 
            +
                },
         | 
| 181 | 
            +
                "151664": {
         | 
| 182 | 
            +
                  "content": "<|file_sep|>",
         | 
| 183 | 
            +
                  "lstrip": false,
         | 
| 184 | 
            +
                  "normalized": false,
         | 
| 185 | 
            +
                  "rstrip": false,
         | 
| 186 | 
            +
                  "single_word": false,
         | 
| 187 | 
            +
                  "special": false
         | 
| 188 | 
            +
                },
         | 
| 189 | 
            +
                "151665": {
         | 
| 190 | 
            +
                  "content": "<image>",
         | 
| 191 | 
            +
                  "lstrip": false,
         | 
| 192 | 
            +
                  "normalized": false,
         | 
| 193 | 
            +
                  "rstrip": false,
         | 
| 194 | 
            +
                  "single_word": false,
         | 
| 195 | 
            +
                  "special": true
         | 
| 196 | 
            +
                },
         | 
| 197 | 
            +
                "151666": {
         | 
| 198 | 
            +
                  "content": "</image>",
         | 
| 199 | 
            +
                  "lstrip": false,
         | 
| 200 | 
            +
                  "normalized": false,
         | 
| 201 | 
            +
                  "rstrip": false,
         | 
| 202 | 
            +
                  "single_word": false,
         | 
| 203 | 
            +
                  "special": true
         | 
| 204 | 
            +
                },
         | 
| 205 | 
            +
                "151667": {
         | 
| 206 | 
            +
                  "content": "<ref>",
         | 
| 207 | 
            +
                  "lstrip": false,
         | 
| 208 | 
            +
                  "normalized": false,
         | 
| 209 | 
            +
                  "rstrip": false,
         | 
| 210 | 
            +
                  "single_word": false,
         | 
| 211 | 
            +
                  "special": true
         | 
| 212 | 
            +
                },
         | 
| 213 | 
            +
                "151668": {
         | 
| 214 | 
            +
                  "content": "</ref>",
         | 
| 215 | 
            +
                  "lstrip": false,
         | 
| 216 | 
            +
                  "normalized": false,
         | 
| 217 | 
            +
                  "rstrip": false,
         | 
| 218 | 
            +
                  "single_word": false,
         | 
| 219 | 
            +
                  "special": true
         | 
| 220 | 
            +
                },
         | 
| 221 | 
            +
                "151669": {
         | 
| 222 | 
            +
                  "content": "<box>",
         | 
| 223 | 
            +
                  "lstrip": false,
         | 
| 224 | 
            +
                  "normalized": false,
         | 
| 225 | 
            +
                  "rstrip": false,
         | 
| 226 | 
            +
                  "single_word": false,
         | 
| 227 | 
            +
                  "special": true
         | 
| 228 | 
            +
                },
         | 
| 229 | 
            +
                "151670": {
         | 
| 230 | 
            +
                  "content": "</box>",
         | 
| 231 | 
            +
                  "lstrip": false,
         | 
| 232 | 
            +
                  "normalized": false,
         | 
| 233 | 
            +
                  "rstrip": false,
         | 
| 234 | 
            +
                  "single_word": false,
         | 
| 235 | 
            +
                  "special": true
         | 
| 236 | 
            +
                },
         | 
| 237 | 
            +
                "151671": {
         | 
| 238 | 
            +
                  "content": "<quad>",
         | 
| 239 | 
            +
                  "lstrip": false,
         | 
| 240 | 
            +
                  "normalized": false,
         | 
| 241 | 
            +
                  "rstrip": false,
         | 
| 242 | 
            +
                  "single_word": false,
         | 
| 243 | 
            +
                  "special": true
         | 
| 244 | 
            +
                },
         | 
| 245 | 
            +
                "151672": {
         | 
| 246 | 
            +
                  "content": "</quad>",
         | 
| 247 | 
            +
                  "lstrip": false,
         | 
| 248 | 
            +
                  "normalized": false,
         | 
| 249 | 
            +
                  "rstrip": false,
         | 
| 250 | 
            +
                  "single_word": false,
         | 
| 251 | 
            +
                  "special": true
         | 
| 252 | 
            +
                },
         | 
| 253 | 
            +
                "151673": {
         | 
| 254 | 
            +
                  "content": "<point>",
         | 
| 255 | 
            +
                  "lstrip": false,
         | 
| 256 | 
            +
                  "normalized": false,
         | 
| 257 | 
            +
                  "rstrip": false,
         | 
| 258 | 
            +
                  "single_word": false,
         | 
| 259 | 
            +
                  "special": true
         | 
| 260 | 
            +
                },
         | 
| 261 | 
            +
                "151674": {
         | 
| 262 | 
            +
                  "content": "</point>",
         | 
| 263 | 
            +
                  "lstrip": false,
         | 
| 264 | 
            +
                  "normalized": false,
         | 
| 265 | 
            +
                  "rstrip": false,
         | 
| 266 | 
            +
                  "single_word": false,
         | 
| 267 | 
            +
                  "special": true
         | 
| 268 | 
            +
                },
         | 
| 269 | 
            +
                "151675": {
         | 
| 270 | 
            +
                  "content": "<slice>",
         | 
| 271 | 
            +
                  "lstrip": false,
         | 
| 272 | 
            +
                  "normalized": false,
         | 
| 273 | 
            +
                  "rstrip": false,
         | 
| 274 | 
            +
                  "single_word": false,
         | 
| 275 | 
            +
                  "special": true
         | 
| 276 | 
            +
                },
         | 
| 277 | 
            +
                "151676": {
         | 
| 278 | 
            +
                  "content": "</slice>",
         | 
| 279 | 
            +
                  "lstrip": false,
         | 
| 280 | 
            +
                  "normalized": false,
         | 
| 281 | 
            +
                  "rstrip": false,
         | 
| 282 | 
            +
                  "single_word": false,
         | 
| 283 | 
            +
                  "special": true
         | 
| 284 | 
            +
                },
         | 
| 285 | 
            +
                "151677": {
         | 
| 286 | 
            +
                  "content": "<image_id>",
         | 
| 287 | 
            +
                  "lstrip": false,
         | 
| 288 | 
            +
                  "normalized": false,
         | 
| 289 | 
            +
                  "rstrip": false,
         | 
| 290 | 
            +
                  "single_word": false,
         | 
| 291 | 
            +
                  "special": true
         | 
| 292 | 
            +
                },
         | 
| 293 | 
            +
                "151678": {
         | 
| 294 | 
            +
                  "content": "</image_id>",
         | 
| 295 | 
            +
                  "lstrip": false,
         | 
| 296 | 
            +
                  "normalized": false,
         | 
| 297 | 
            +
                  "rstrip": false,
         | 
| 298 | 
            +
                  "single_word": false,
         | 
| 299 | 
            +
                  "special": true
         | 
| 300 | 
            +
                },
         | 
| 301 | 
            +
                "151679": {
         | 
| 302 | 
            +
                  "content": "<unit>",
         | 
| 303 | 
            +
                  "lstrip": false,
         | 
| 304 | 
            +
                  "normalized": false,
         | 
| 305 | 
            +
                  "rstrip": false,
         | 
| 306 | 
            +
                  "single_word": false,
         | 
| 307 | 
            +
                  "special": true
         | 
| 308 | 
            +
                },
         | 
| 309 | 
            +
                "151680": {
         | 
| 310 | 
            +
                  "content": "</unit>",
         | 
| 311 | 
            +
                  "lstrip": false,
         | 
| 312 | 
            +
                  "normalized": false,
         | 
| 313 | 
            +
                  "rstrip": false,
         | 
| 314 | 
            +
                  "single_word": false,
         | 
| 315 | 
            +
                  "special": true
         | 
| 316 | 
            +
                },
         | 
| 317 | 
            +
                "151681": {
         | 
| 318 | 
            +
                  "content": "<asr>",
         | 
| 319 | 
            +
                  "lstrip": false,
         | 
| 320 | 
            +
                  "normalized": false,
         | 
| 321 | 
            +
                  "rstrip": false,
         | 
| 322 | 
            +
                  "single_word": false,
         | 
| 323 | 
            +
                  "special": true
         | 
| 324 | 
            +
                },
         | 
| 325 | 
            +
                "151682": {
         | 
| 326 | 
            +
                  "content": "</asr>",
         | 
| 327 | 
            +
                  "lstrip": false,
         | 
| 328 | 
            +
                  "normalized": false,
         | 
| 329 | 
            +
                  "rstrip": false,
         | 
| 330 | 
            +
                  "single_word": false,
         | 
| 331 | 
            +
                  "special": true
         | 
| 332 | 
            +
                },
         | 
| 333 | 
            +
                "151683": {
         | 
| 334 | 
            +
                  "content": "<query>",
         | 
| 335 | 
            +
                  "lstrip": false,
         | 
| 336 | 
            +
                  "normalized": false,
         | 
| 337 | 
            +
                  "rstrip": false,
         | 
| 338 | 
            +
                  "single_word": false,
         | 
| 339 | 
            +
                  "special": true
         | 
| 340 | 
            +
                },
         | 
| 341 | 
            +
                "151684": {
         | 
| 342 | 
            +
                  "content": "</query>",
         | 
| 343 | 
            +
                  "lstrip": false,
         | 
| 344 | 
            +
                  "normalized": false,
         | 
| 345 | 
            +
                  "rstrip": false,
         | 
| 346 | 
            +
                  "single_word": false,
         | 
| 347 | 
            +
                  "special": true
         | 
| 348 | 
            +
                },
         | 
| 349 | 
            +
                "151685": {
         | 
| 350 | 
            +
                  "content": "<|audio_start|>",
         | 
| 351 | 
            +
                  "lstrip": false,
         | 
| 352 | 
            +
                  "normalized": false,
         | 
| 353 | 
            +
                  "rstrip": false,
         | 
| 354 | 
            +
                  "single_word": false,
         | 
| 355 | 
            +
                  "special": true
         | 
| 356 | 
            +
                },
         | 
| 357 | 
            +
                "151686": {
         | 
| 358 | 
            +
                  "content": "<|audio|>",
         | 
| 359 | 
            +
                  "lstrip": false,
         | 
| 360 | 
            +
                  "normalized": false,
         | 
| 361 | 
            +
                  "rstrip": false,
         | 
| 362 | 
            +
                  "single_word": false,
         | 
| 363 | 
            +
                  "special": true
         | 
| 364 | 
            +
                },
         | 
| 365 | 
            +
                "151687": {
         | 
| 366 | 
            +
                  "content": "<|audio_end|>",
         | 
| 367 | 
            +
                  "lstrip": false,
         | 
| 368 | 
            +
                  "normalized": false,
         | 
| 369 | 
            +
                  "rstrip": false,
         | 
| 370 | 
            +
                  "single_word": false,
         | 
| 371 | 
            +
                  "special": true
         | 
| 372 | 
            +
                },
         | 
| 373 | 
            +
                "151688": {
         | 
| 374 | 
            +
                  "content": "<|spk_bos|>",
         | 
| 375 | 
            +
                  "lstrip": false,
         | 
| 376 | 
            +
                  "normalized": false,
         | 
| 377 | 
            +
                  "rstrip": false,
         | 
| 378 | 
            +
                  "single_word": false,
         | 
| 379 | 
            +
                  "special": true
         | 
| 380 | 
            +
                },
         | 
| 381 | 
            +
                "151689": {
         | 
| 382 | 
            +
                  "content": "<|spk|>",
         | 
| 383 | 
            +
                  "lstrip": false,
         | 
| 384 | 
            +
                  "normalized": false,
         | 
| 385 | 
            +
                  "rstrip": false,
         | 
| 386 | 
            +
                  "single_word": false,
         | 
| 387 | 
            +
                  "special": true
         | 
| 388 | 
            +
                },
         | 
| 389 | 
            +
                "151690": {
         | 
| 390 | 
            +
                  "content": "<|spk_eos|>",
         | 
| 391 | 
            +
                  "lstrip": false,
         | 
| 392 | 
            +
                  "normalized": false,
         | 
| 393 | 
            +
                  "rstrip": false,
         | 
| 394 | 
            +
                  "single_word": false,
         | 
| 395 | 
            +
                  "special": true
         | 
| 396 | 
            +
                },
         | 
| 397 | 
            +
                "151691": {
         | 
| 398 | 
            +
                  "content": "<|tts_bos|>",
         | 
| 399 | 
            +
                  "lstrip": false,
         | 
| 400 | 
            +
                  "normalized": false,
         | 
| 401 | 
            +
                  "rstrip": false,
         | 
| 402 | 
            +
                  "single_word": false,
         | 
| 403 | 
            +
                  "special": true
         | 
| 404 | 
            +
                },
         | 
| 405 | 
            +
                "151692": {
         | 
| 406 | 
            +
                  "content": "<|tts_eos|>",
         | 
| 407 | 
            +
                  "lstrip": false,
         | 
| 408 | 
            +
                  "normalized": false,
         | 
| 409 | 
            +
                  "rstrip": false,
         | 
| 410 | 
            +
                  "single_word": false,
         | 
| 411 | 
            +
                  "special": true
         | 
| 412 | 
            +
                },
         | 
| 413 | 
            +
                "151693": {
         | 
| 414 | 
            +
                  "content": "<|listen|>",
         | 
| 415 | 
            +
                  "lstrip": false,
         | 
| 416 | 
            +
                  "normalized": false,
         | 
| 417 | 
            +
                  "rstrip": false,
         | 
| 418 | 
            +
                  "single_word": false,
         | 
| 419 | 
            +
                  "special": true
         | 
| 420 | 
                },
                "151694": {
                  "content": "<|speak|>",
                  "lstrip": false,
                  "normalized": false,
                  "rstrip": false,
                  "single_word": false,
                  "special": true
                },
                "151695": {
                  "content": "<|interrupt|>",
                  "lstrip": false,
                  "normalized": false,
                  "rstrip": false,
                  "single_word": false,
                  "special": true
                },
                "151696": {
                  "content": "<|vad_start|>",
                  "lstrip": false,
                  "normalized": false,
                  "rstrip": false,
                  "single_word": false,
                  "special": true
                },
                "151697": {
                  "content": "<|vad_end|>",
                  "lstrip": false,
                  "normalized": false,
                  "rstrip": false,
                  "single_word": false,
                  "special": true
                },
                "151698": {
                  "content": "<reserved_43>",
                  "lstrip": false,
                  "normalized": false,
                  "rstrip": false,
                  "single_word": false,
                  "special": true
                },
                "151699": {
                  "content": "<reserved_53>",
                  "lstrip": false,
                  "normalized": false,
                  "rstrip": false,
                  "single_word": false,
                  "special": true
                }
              },
              "additional_special_tokens": [
                "<image>",
                "</image>",
                "<ref>",
                "</ref>",
                "<box>",
                "</box>",
                "<quad>",
                "</quad>",
                "<point>",
                "</point>",
                "<slice>",
                "</slice>",
                "<image_id>",
                "</image_id>",
                "<unit>",
                "</unit>",
                "<asr>",
                "</asr>",
                "<query>",
                "</query>",
                "<|audio_start|>",
                "<|audio|>",
                "<|audio_end|>",
                "<|spk_bos|>",
                "<|spk|>",
                "<|spk_eos|>",
                "<|tts_bos|>",
                "<|tts_eos|>",
                "<|listen|>",
                "<|speak|>",
                "<|interrupt|>",
                "<|vad_start|>",
                "<|vad_end|>",
                "<reserved_43>",
                "<reserved_53>",
                "<|im_end|>"
              ],
              "auto_map": {
                "AutoProcessor": "processing_minicpmo.MiniCPMOProcessor",
                "AutoTokenizer": [
                  "tokenization_qwen2.Qwen2Tokenizer",
                  "tokenization_minicpmo_fast.MiniCPMOTokenizerFast"
                ]
              },
              "bos_token": "<|im_start|>",
              "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
              "clean_up_tokenization_spaces": false,
              "eos_token": "<|im_end|>",
              "errors": "replace",
              "model_max_length": 18000,
              "pad_token": "<|endoftext|>",
              "padding_side": "right",
              "processor_class": "MiniCPMOProcessor",
              "split_special_tokens": false,
              "tokenizer_class": "MiniCPMOTokenizer",
              "unk_token": "<unk>"
            }
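
The auto_map above routes AutoProcessor and AutoTokenizer to the custom MiniCPM-o classes shipped with this checkpoint, and the chat_template is the Qwen2-style ChatML template. A minimal sketch of how this tokenizer config is typically consumed; the repository path is a placeholder, not part of this upload:

    from transformers import AutoTokenizer

    # placeholder path; point this at the actual checkpoint directory or repo id
    tokenizer = AutoTokenizer.from_pretrained("path/to/OmniRewardModel", trust_remote_code=True)

    messages = [{"role": "user", "content": "Score this candidate response."}]
    # Renders the ChatML prompt defined by chat_template; with add_generation_prompt=True
    # the string ends with "<|im_start|>assistant\n"
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
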
    	
        trainer_state.json
    ADDED
    
    The diff for this file is too large to render. See raw diff
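
trainer_state.json is too large to render inline. Assuming it follows the standard Hugging Face Trainer layout (a "global_step" counter plus a "log_history" list of per-step metric dicts), it can be inspected offline with a short script such as:

    import json

    # Assumption: standard Hugging Face Trainer state layout with "global_step" and "log_history"
    with open("trainer_state.json") as f:
        state = json.load(f)

    print("steps trained:", state["global_step"])
    print("last logged metrics:", state["log_history"][-1])
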
    	
        training_args.bin
    ADDED
    
    @@ -0,0 +1,3 @@
            version https://git-lfs.github.com/spec/v1
            oid sha256:4ea01878c433d946e8c158c183f3035895fb212f056d60e316165607a72a24fc
            size 6840
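
training_args.bin is stored here only as an LFS pointer. If it is the usual Hugging Face Trainer artifact, i.e. a pickled TrainingArguments object (an assumption this view cannot confirm), it can be deserialized for inspection like so:

    import torch

    # weights_only=False is needed on recent torch releases because this file would hold a
    # pickled TrainingArguments object rather than plain tensors (assumed Trainer artifact)
    args = torch.load("training_args.bin", weights_only=False)
    print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
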
    	
        utils.py
    ADDED
    
    @@ -0,0 +1,203 @@
            # coding=utf-8
            # Copyright 2025 The OpenBMB Team. All rights reserved.
            #
            # Licensed under the Apache License, Version 2.0 (the "License");
            # you may not use this file except in compliance with the License.
            # You may obtain a copy of the License at
            #
            #     http://www.apache.org/licenses/LICENSE-2.0
            #
            # Unless required by applicable law or agreed to in writing, software
            # distributed under the License is distributed on an "AS IS" BASIS,
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
            # See the License for the specific language governing permissions and
            # limitations under the License.

            import logging
            import re

            import librosa
            import numpy as np

            logger = logging.getLogger(__name__)


            def is_silent(data):
                if np.abs(data).max() < 3e-3:
                    return True
                else:
                    return False


            def sentence_end(txt):
                for c in [".", "。", "!", "?", "!", "?"]:
                    if c in txt:
                        if c == ".":  # make sure the "." does not follow a digit (e.g. "1.")
                            idx = txt.find(c)
                            if idx > 0:
                                if txt[idx - 1].isdigit():
                                    continue
                        return c
                return ""


            class NumberToTextConverter:
                r"""
                A helper class to ensure text-to-speech (TTS) systems read numeric digits
                in the desired language (Chinese or English) digit by digit. It forcibly
                replaces all numeric substrings in text with their language-specific
                textual representations, thereby reducing the likelihood of TTS mistakes
                on numbers.
                Note: MiniCPM-o 2.6 only uses this in streaming mode.

                Attributes:
                    num_to_chinese (dict):
                        Mapping from digit (str) to its Chinese textual form (str).
                    num_to_english (dict):
                        Mapping from digit (str) to its English textual form (str).

                Example:
                    >>> converter = NumberToTextConverter()
                    >>> converter.replace_numbers_with_text("我有2个苹果", language="chinese")
                    '我有二个苹果'
                    >>> converter.replace_numbers_with_text("I have 23 books", language="english")
                    'I have two three books'
                """

                def __init__(self):
                    self.num_to_chinese = {
                        "0": "零",
                        "1": "一",
                        "2": "二",
                        "3": "三",
                        "4": "四",
                        "5": "五",
                        "6": "六",
                        "7": "七",
                        "8": "八",
                        "9": "九",
                    }
                    self.num_to_english = {
                        "0": "zero",
                        "1": "one",
                        "2": "two",
                        "3": "three",
                        "4": "four",
                        "5": "five",
                        "6": "six",
                        "7": "seven",
                        "8": "eight",
                        "9": "nine",
                    }

                def number_to_chinese_digit_by_digit(self, num_str):
                    result = ""
                    for char in num_str:
                        if char in self.num_to_chinese:
                            result += self.num_to_chinese[char]
                    return result

                def number_to_english_digit_by_digit(self, num_str):
                    result = []
                    for char in num_str:
                        if char in self.num_to_english:
                            result.append(self.num_to_english[char])
                    return " ".join(result)

                def detect_language(self, text):
                    chinese_count = len(re.findall(r"[\u4e00-\u9fff]", text))
                    english_count = len(re.findall(r"[a-zA-Z]", text))
                    return "chinese" if chinese_count >= english_count else "english"

                def replace_numbers_with_text(self, text, language=None):
                    if language is None:
                        language = self.detect_language(text)
                    numbers = re.findall(r"\d+", text)

                    for num in numbers:
                        if language == "chinese":
                            replacement = self.number_to_chinese_digit_by_digit(num)
                        else:
                            replacement = self.number_to_english_digit_by_digit(num)
                        text = text.replace(num, replacement, 1)

                    return text


            class VoiceChecker:
                r"""
                A simple utility class to detect silence or low variation in consecutive audio chunks by comparing
                mel-spectrogram distances. It keeps track of consecutive zero-distance and low-distance chunks
                to decide whether the audio is considered "bad" (e.g., overly silent or not changing enough).

                Attributes:
                    previous_mel (`np.ndarray` or `None`):
                        Holds the previously observed mel-spectrogram in decibel scale. Used to compute
                        the next distance; reset via :meth:`reset`.
                    consecutive_zeros (`int`):
                        The number of consecutive chunks that were detected as silent (distance = 0).
                    consecutive_low_distance (`int`):
                        The number of consecutive chunks whose distance was below the threshold.

                Example:
                    >>> checker = VoiceChecker()
                    >>> # Suppose we have audio_wav (list or np.ndarray) and mel_spec (np.ndarray)
                    >>> # We split them into chunks and call checker.is_bad(...)
                    >>> is_audio_bad = checker.is_bad(audio_wav, mel_spec, chunk_size=2560, thresh=100.0)
                    >>> if is_audio_bad:
                    ...     print("Audio deemed bad!")
                    >>> # Reset states if needed
                    >>> checker.reset()
                """

                def __init__(self):
                    self.previous_mel = None
                    self.consecutive_zeros = 0
                    self.consecutive_low_distance = 0

                def compute_distance(self, audio_chunk, mel_spec):
                    if is_silent(audio_chunk):
                        return 0.0  # silent chunk; treat the distance as zero

                    mel_db = librosa.power_to_db(mel_spec)
                    if self.previous_mel is None:
                        self.previous_mel = mel_db
                        return -1.0

                    distance = np.linalg.norm(np.mean(mel_db, axis=1) - np.mean(self.previous_mel, axis=1))
                    self.previous_mel = mel_db
                    return distance

                def is_bad(self, audio_wav, mel_spec, chunk_size=2560, thresh=100.0):
                    num_chunks = len(audio_wav) // chunk_size
                    mel_chunk_size = mel_spec.shape[-1] // num_chunks
                    for i in range(num_chunks):
                        audio_chunk = audio_wav[i * chunk_size : (i + 1) * chunk_size]
                        mel_spec_chunk = mel_spec[:, i * mel_chunk_size : (i + 1) * mel_chunk_size]

                        distance = self.compute_distance(audio_chunk, mel_spec_chunk)
                        logger.warning(
                            f"mel dist: {distance:.1f}, zero: {self.consecutive_zeros}, low: {self.consecutive_low_distance}"
                        )
                        if distance == 0:
                            self.consecutive_low_distance = 0  # reset
                            self.consecutive_zeros += 1
                            if self.consecutive_zeros >= 12:
                                logger.warning("VoiceChecker detected 1.2 s of silence. Marking as failed.")
                                return True
                        elif distance < thresh:
                            self.consecutive_zeros = 0
                            self.consecutive_low_distance += 1
                            if self.consecutive_low_distance >= 5:
                                logger.warning("VoiceChecker detected 5 consecutive low-distance chunks. Marking as failed.")
                                return True
                        else:
                            self.consecutive_low_distance = 0
                            self.consecutive_zeros = 0

                    return False

                def reset(self):
                    self.previous_mel = None
                    self.consecutive_zeros = 0
                    self.consecutive_low_distance = 0
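
A hedged usage sketch of the two helpers above; the 24 kHz sample rate and the 80-bin mel settings are illustrative assumptions and are not specified in utils.py itself:

    import numpy as np
    import librosa
    from utils import NumberToTextConverter, VoiceChecker

    converter = NumberToTextConverter()
    print(converter.replace_numbers_with_text("Call me at 911"))  # -> "Call me at nine one one"

    sr = 24000                                                    # assumed TTS output rate
    wav = 0.1 * np.random.randn(2 * sr).astype(np.float32)        # stand-in for generated speech
    mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=80) # power mel, shape (80, T)

    checker = VoiceChecker()
    if checker.is_bad(wav, mel, chunk_size=2560, thresh=100.0):
        print("audio chunk rejected")
    checker.reset()
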
    	
        value_head.safetensors
    ADDED
    
    @@ -0,0 +1,3 @@
            version https://git-lfs.github.com/spec/v1
            oid sha256:e4d81cf5e36d199137637ac21511852648076fbf13a695a53edfc2aed6256ef7
            size 7370
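
value_head.safetensors is only a few kilobytes and presumably holds the reward/value projection trained on top of the MiniCPM-o backbone; its tensor names are not documented in this upload, so a quick way to inspect it is to enumerate its contents:

    from safetensors.torch import load_file

    # Enumerate whatever tensors the value-head file contains (names are not documented here)
    state = load_file("value_head.safetensors")
    for name, tensor in state.items():
        print(name, tuple(tensor.shape), tensor.dtype)
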
    	
        vocab.json
    ADDED
    
    The diff for this file is too large to render. See raw diff