File size: 35,790 Bytes
8fb7827
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
from collections.abc import Callable
from typing import Optional
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from transformers import PreTrainedModel, Qwen2ForCausalLM
from transformers.activations import ACT2FN
from transformers.generation import GenerationMixin
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import BaseModelOutput
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.utils import auto_docstring
from .configuration_uas_audio import UASAudioConfig, UASAudioEncoderConfig, UASAudioEncoderOnlyConfig


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head axis.

    Produces the same values as `torch.repeat_interleave(hidden_states, repeats=n_rep, dim=1)`: an input of shape
    (batch, num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    When `n_rep` is 1 the input tensor itself is returned.
    """
    # Unpack before the fast path so that a non-4D input is rejected even when `n_rep == 1`.
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)


def _get_feat_extract_output_lengths(input_lengths):
    """
    Computes the output length of the convolutional layers and the output length of the audio encoder
    """

    input_lengths_leave = input_lengths % 100
    feat_lengths = (input_lengths_leave - 1) // 2 + 1
    output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
    return output_lengths


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """
    Scaled dot-product attention written with plain PyTorch ops.

    Args:
        module: Attention module; `num_key_value_groups` and `training` are read from it.
        query, key, value: 4D tensors laid out as (batch, heads, seqlen, head_dim), the layout `repeat_kv` expects.
        attention_mask: Optional additive mask, cropped on its last axis to the key length and added to the
            scores before the softmax.
        scaling: Factor multiplied into the raw `query @ key^T` scores.
        dropout: Dropout probability applied to the attention probabilities (active only in training mode).
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        `(attn_output, attn_weights)`: the attended values with the head and sequence axes swapped (made
        contiguous), and the post-dropout attention probabilities.
    """
    n_groups = module.num_key_value_groups
    keys = repeat_kv(key, n_groups)
    values = repeat_kv(value, n_groups)

    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling
    if attention_mask is not None:
        # Keep only the mask columns that correspond to existing key positions.
        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]

    # Softmax is computed in float32 and cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)

    context = torch.matmul(probs, values).transpose(1, 2).contiguous()
    return context, probs


class SinusoidsPositionEmbedding(nn.Module):
    """
    Fixed sinusoidal position table of shape `(length, channels)`.

    The first `channels // 2` columns hold sines and the remaining columns hold cosines of the position scaled
    by geometrically spaced inverse timescales. The table is registered as a non-persistent buffer, so it is
    not part of the state dict.
    """

    def __init__(self, length, channels, max_timescale=10000):
        super().__init__()
        if channels % 2 != 0:
            raise ValueError("SinusoidsPositionEmbedding needs even channels input")
        half = channels // 2
        log_timescale_increment = np.log(max_timescale) / (half - 1)
        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(half).float())
        # Outer product of positions and inverse timescales -> (length, half).
        angles = torch.arange(length).unsqueeze(1) * inv_timescales.unsqueeze(0)
        table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
        self.register_buffer("positional_embedding", table, persistent=False)

    def forward(self, seqlen: int):
        """Return the first `seqlen` rows of the table."""
        return self.positional_embedding[:seqlen, :]


class UASAudioAttention(nn.Module):
    """
    Multi-headed self-attention over a packed sequence of audio frames.

    The input carries no batch dimension: `forward` receives a 2D `(seq_length, embed_dim)` tensor and
    `cu_seqlens` holds the cumulative boundaries of the attention windows inside it.
    """

    def __init__(self, config):
        super().__init__()
        self.embed_dim = config.d_model
        self.num_heads = config.encoder_attention_heads
        # NOTE: `config.attention_dropout` is only stored here. `forward` uses `self.attention_dropout`,
        # which is fixed to 0.0 below.
        self.dropout = config.attention_dropout
        self.head_dim = self.embed_dim // self.num_heads
        self.num_key_value_groups = 1  # needed for eager attention
        self.config = config

        if (self.head_dim * self.num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = 0.0
        self.is_decoder = False
        self.is_causal = False
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states: Packed frames of shape `(seq_length, embed_dim)`.
            cu_seqlens: Cumulative window boundaries, forwarded to the attention backend as
                `cu_seq_lens_q` / `cu_seq_lens_k`. When omitted, the whole sequence is treated as one window.
            attention_mask: Optional additive mask forwarded to the attention backend.
            **kwargs: Extra keyword arguments forwarded to the attention backend.

        Returns:
            Tensor of shape `(seq_length, embed_dim)`: the output projection of the attended values.
        """
        seq_length, _ = hidden_states.size()

        if cu_seqlens is None:
            # The parameter defaults to None; fall back to a single window spanning the full sequence
            # instead of failing on the slicing below.
            cu_seqlens = torch.tensor([0, seq_length], dtype=torch.int32, device=hidden_states.device)

        query_states = self.q_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
        key_states = self.k_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
        value_states = self.v_proj(hidden_states).reshape(seq_length, self.num_heads, -1)

        # (seq, heads, head_dim) -> (1, heads, seq, head_dim): add a dummy batch axis for the backends.
        query_states = query_states.transpose(0, 1).unsqueeze(0)
        key_states = key_states.transpose(0, 1).unsqueeze(0)
        value_states = value_states.transpose(0, 1).unsqueeze(0)
        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, _ = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask=attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            cu_seq_lens_q=cu_seqlens,  # pass cu seq lens for FA2
            cu_seq_lens_k=cu_seqlens,
            max_length_q=max_seqlen,
            max_length_k=max_seqlen,
            is_causal=False,
            **kwargs,
        )

        attn_output = attn_output.reshape(seq_length, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output


class UASAudioEncoderLayer(GradientCheckpointingLayer):
    """
    Pre-norm transformer encoder block.

    Applies LayerNorm -> self-attention -> residual add, followed by
    LayerNorm -> fc1 -> activation -> fc2 -> residual add.
    """

    def __init__(self, config: UASAudioEncoderConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = UASAudioAttention(config)
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # `dropout` and `activation_dropout` are stored from the config; `forward` below does not apply them.
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): packed input to the layer of shape `(seq_len, embed_dim)`
            cu_seqlens (`torch.Tensor`): cumulative window boundaries, passed through to the self-attention
            attention_mask (`torch.FloatTensor`, *optional*): additive attention mask passed through to the
                self-attention, where masked positions are indicated by very large negative values.
            **kwargs: extra keyword arguments passed through to the self-attention

        Returns:
            A 1-tuple `(hidden_states,)` holding the layer output, same shape as the input.
        """
        # Self-attention sub-block (pre-norm + residual).
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            cu_seqlens=cu_seqlens,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Feed-forward sub-block (pre-norm + residual).
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        hidden_states = residual + hidden_states

        if hidden_states.dtype == torch.float16:
            # Keep float16 activations inside the representable range.
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        return (hidden_states,)


class UASAudioEncoder(PreTrainedModel):
    """
    Audio encoder: three stride-2 `Conv2d` layers, a linear projection to `d_model`, sinusoidal position
    embeddings, a stack of `UASAudioEncoderLayer` blocks and a final `LayerNorm`.

    `forward` cuts the mel frames of every sample into chunks of `2 * n_window` frames for the convolutions,
    then runs the transformer layers over the packed (unpadded) frames with attention restricted to the
    windows described by `cu_seqlens`.
    """

    config: UASAudioEncoderConfig
    main_input_name = "input_features"
    input_modalities = "audio"
    _no_split_modules = ["UASAudioEncoderLayer"]
    _supports_sdpa = True

    def __init__(self, config: UASAudioEncoderConfig):
        super().__init__(config)
        self.dropout = config.dropout

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.max_source_positions = config.max_source_positions
        self.n_window = config.n_window
        self.positional_embedding = SinusoidsPositionEmbedding(self.max_source_positions, embed_dim)
        self.layers = nn.ModuleList([UASAudioEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.ln_post = nn.LayerNorm(config.d_model)
        self.gradient_checkpointing = False
        self.conv2d1 = nn.Conv2d(1, config.downsample_hidden_size, 3, 2, padding=1)
        self.conv2d2 = nn.Conv2d(config.downsample_hidden_size, config.downsample_hidden_size, 3, 2, padding=1)
        self.conv2d3 = nn.Conv2d(config.downsample_hidden_size, config.downsample_hidden_size, 3, 2, padding=1)
        # Input width = channels * number of frequency bins left after the three stride-2 convolutions.
        self.conv_out = nn.Linear(
            config.downsample_hidden_size * ((((config.num_mel_bins + 1) // 2 + 1) // 2 + 1) // 2),
            config.d_model,
            bias=False,
        )
        self.n_window_infer = self.config.n_window_infer
        self.conv_chunksize = self.config.conv_chunksize
        self.post_init()

    def _freeze_parameters(self):
        """Disable gradients for every parameter of the encoder."""
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        """Return the first convolution, the module that consumes `input_features`."""
        # The first convolution is registered as `conv2d1`; there is no `conv1` attribute.
        return self.conv2d1

    def set_input_embeddings(self, value: nn.Module):
        """Replace the first convolution."""
        self.conv2d1 = value

    def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
        """
        Build an additive `(1, 1, seq, seq)` mask that is 0 inside each `cu_seqlens` window and the dtype's
        minimum everywhere else. Returns `None` when Flash Attention 2 is selected.
        """
        # Flash Attention 2 doesn't need a 4D mask and relies on `cu_seqlens/max_seqlen`
        # NOTE: the created attention mask only approximates the ragged FA2 attention by
        # allowing bidirectional attention within `cu_seqlens` blocks, and not attending between
        # blocks. Though it will not be a 100% match for FA2's `varlen` path
        if self.config._attn_implementation == "flash_attention_2":
            return None

        seq_length = inputs_tensor.shape[0]
        attention_mask = torch.full(
            [1, 1, seq_length, seq_length],
            torch.finfo(inputs_tensor.dtype).min,
            device=inputs_tensor.device,
            dtype=inputs_tensor.dtype,
        )
        for i in range(1, len(cu_seqlens)):
            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
        return attention_mask

    @auto_docstring
    def forward(
        self,
        input_features,
        feature_lens=None,
        aftercnn_lens=None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
            mel length
        aftercnn_lens (`torch.LongTensor` of shape `(batch_size,)`):
            mel length after cnn. The passed value is ignored; it is recomputed from `feature_lens`.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        return_dict = return_dict if return_dict is not None else getattr(self.config, 'use_return_dict', False)
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else getattr(self.config, 'output_hidden_states', False)
        )

        # The caller-supplied `aftercnn_lens` is overwritten: lengths are always derived from `feature_lens`.
        aftercnn_lens = _get_feat_extract_output_lengths(feature_lens)

        # Cut every sample into chunks of `2 * n_window` frames; the last chunk of a sample keeps the remainder.
        chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()

        chunk_lengths = torch.tensor(
            [self.n_window * 2] * chunk_num.sum(),
            dtype=torch.long,
            device=feature_lens.device,
        )
        # Index of the last chunk of each sample inside the flat chunk list.
        tail_chunk_index = F.pad(chunk_num, (1, 0), value=-1).cumsum(0)[1:]
        chunk_lengths[tail_chunk_index] = feature_lens % (self.n_window * 2)
        # A remainder of 0 means the last chunk is a full one.
        chunk_lengths[chunk_lengths == 0] = self.n_window * 2

        chunk_list = input_features.T.split(chunk_lengths.tolist(), dim=0)
        padded_feature = nn.utils.rnn.pad_sequence(chunk_list, batch_first=True).transpose(1, 2)
        feature_lens_after_cnn = _get_feat_extract_output_lengths(chunk_lengths)
        # Boolean mask of the valid (non-padding) positions of every chunk after the convolutions.
        padded_mask_after_cnn = nn.utils.rnn.pad_sequence(
            [torch.ones(length, dtype=torch.bool, device=padded_feature.device) for length in feature_lens_after_cnn],
            batch_first=True,
        )
        padded_feature = padded_feature.unsqueeze(1)
        # Split to chunk to avoid OOM during convolution
        padded_embeds = []
        for chunk in padded_feature.split(self.conv_chunksize, dim=0):
            padded_embed = F.gelu(self.conv2d1(chunk))
            padded_embed = F.gelu(self.conv2d2(padded_embed))
            padded_embed = F.gelu(self.conv2d3(padded_embed))
            padded_embeds.append(padded_embed)
        padded_embed = torch.cat(padded_embeds, dim=0)
        b, c, f, t = padded_embed.size()
        # (chunks, channels, freq, time) -> (chunks, time, channels * freq) before the linear projection.
        padded_embed = self.conv_out(padded_embed.permute(0, 3, 1, 2).contiguous().view(b, t, c * f))

        positional_embedding = (
            self.positional_embedding.positional_embedding[: padded_embed.shape[1], :]
            .unsqueeze(0)
            .to(padded_embed.dtype)
        )
        padded_embed = padded_embed + positional_embedding
        # Drop the padding: all valid positions of all chunks are packed into one 2D tensor.
        hidden_states = padded_embed[padded_mask_after_cnn]

        # Attention windows: every sample is split into windows of `window_aftercnn` positions plus a remainder.
        cu_chunk_lens = [0]
        window_aftercnn = padded_mask_after_cnn.shape[-1] * (self.n_window_infer // (self.n_window * 2))
        for cnn_len in aftercnn_lens:
            cu_chunk_lens += [window_aftercnn] * (cnn_len // window_aftercnn)
            remainder = cnn_len % window_aftercnn
            if remainder != 0:
                cu_chunk_lens += [remainder]
        cu_seqlens = torch.tensor(cu_chunk_lens, device=aftercnn_lens.device).cumsum(-1, dtype=torch.int32)

        # Additive block-diagonal mask for the eager/SDPA paths (None for Flash Attention 2, which relies on
        # `cu_seqlens`). Without it the packed frames would attend across window and sample boundaries.
        attention_mask = self._prepare_attention_mask(hidden_states, cu_seqlens)

        all_hidden_states = (hidden_states,) if output_hidden_states else None

        for encoder_layer in self.layers:
            layer_outputs = encoder_layer(
                hidden_states,
                cu_seqlens,
                attention_mask=attention_mask,
            )

            hidden_states = layer_outputs[0]
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

        hidden_states = self.ln_post(hidden_states)

        if output_hidden_states:
            # The post-LayerNorm output is appended as the final entry.
            all_hidden_states += (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )

    def padded_and_mask_function(self, tensor_list, tensor_len, padding_value=0, padding_side="right"):
        """
        Pads a sequence of tensors to their maximum length (always on the right; `padding_side` is not used).
        Then prepares a mask so that pad tokens are not attended to.

        Returns:
            `(padded_tensor, batch_mask, batch_mask_after_cnn)`: the padded `(n, dim, max_len)` tensor, a
            `(n, 1, max_len)` integer mask of valid positions, and a boolean mask of valid positions after
            one `(len - 1) // 2 + 1` length reduction.
        """
        max_len = tensor_len.max()
        dim = tensor_list[0].shape[0]
        padded_tensor = torch.full(
            size=(len(tensor_list), dim, max_len),
            fill_value=padding_value,
            dtype=self.dtype,
            device=tensor_list[0].device,
        )

        batch_mask = torch.zeros(
            (len(tensor_len), max_len),
            dtype=torch.long,
            device=padded_tensor.device,
        )
        for i, length in enumerate(tensor_len):
            batch_mask[i, :length] = 1
            padded_tensor[i, :, :length] = tensor_list[i]

        feature_lens_after_cnn = (tensor_len - 1) // 2 + 1
        max_len_after_cnn = feature_lens_after_cnn.max()
        batch_mask_after_cnn = torch.zeros(
            (len(tensor_len), max_len_after_cnn),
            dtype=torch.long,
            device=padded_tensor.device,
        )
        for i, length in enumerate(feature_lens_after_cnn):
            batch_mask_after_cnn[i, :length] = 1
        return (
            padded_tensor,
            batch_mask.unsqueeze(1),
            batch_mask_after_cnn.bool(),
        )


class Adapter(nn.Module):
    """Two-layer MLP (Linear -> GELU -> Linear) mapping `d_model`-sized features to `n_embd`-sized ones."""

    def __init__(
        self,
        d_model: int,
        n_embd: int,
    ):
        super().__init__()
        # Layer order (and therefore the `audio_projector.{0,2}` state-dict keys) must stay as is.
        stages = [
            nn.Linear(d_model, n_embd),
            nn.GELU(),
            nn.Linear(n_embd, n_embd),
        ]
        self.audio_projector = nn.Sequential(*stages)

    def forward(self, x: Tensor) -> Tensor:
        """Project `x` through the MLP; only the last dimension changes."""
        return self.audio_projector(x)


class UASAudioForCausalLM(PreTrainedModel, GenerationMixin):
    config_class = UASAudioConfig
    main_input_name = "input_ids"
    supports_gradient_checkpointing = True
    def __init__(self, config: UASAudioConfig):
        """
        Build the three sub-modules: the Qwen2 language model (`llm`), the audio encoder and the adapter that
        projects encoder features to the LLM hidden size.

        When `config.dtype` resolves to `torch.bfloat16`, the audio encoder and the adapter are cast to
        bfloat16 here; the LLM is not cast in this method.
        """
        super().__init__(config)

        # `config.dtype` may be given as an attribute name of `torch` (a string) or as a value used directly.
        resolved_dtype = getattr(torch, config.dtype) if isinstance(config.dtype, str) else config.dtype
        self.bf16 = resolved_dtype == torch.bfloat16

        text_cfg = config.text_config
        audio_cfg = config.audio_encoder_config

        # Construction order is kept: llm, audio_encoder, adapter.
        self.llm = Qwen2ForCausalLM(text_cfg)
        self.audio_encoder = UASAudioEncoder(audio_cfg)
        self.adapter = Adapter(audio_cfg.d_model, text_cfg.hidden_size)

        # Token id whose positions in `input_ids` are replaced by audio embeddings.
        self.audio_token = config.audio_token

        if self.bf16:
            self.audio_encoder = self.audio_encoder.bfloat16()
            self.adapter = self.adapter.bfloat16()

        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        mels=None,
        mel_masks=None,
        past_key_values=None,
        **kwargs
    ):
        """
        Run the wrapped LLM, injecting audio embeddings on the first pass.

        Without `past_key_values`, the token embeddings are computed from `input_ids`, the audio-token
        positions are filled with adapted audio features (if `mels` is given) and the LLM is called with
        `inputs_embeds`. With `past_key_values`, the audio inputs are not processed again and the LLM is
        called with `input_ids` directly.

        Returns:
            Whatever the wrapped LLM returns.
        """
        if past_key_values is None:
            # First pass: build embeddings and splice the audio features into the audio-token slots.
            inputs_embeds = self.embedding_with_audio_tokens(input_ids, mels, mel_masks)
            return self.llm(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                past_key_values=None,
                **kwargs
            )

        # A cache is present: treated as the generation phase, audio is not re-encoded.
        return self.llm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            **kwargs
        )

    def embedding_with_audio_tokens(
        self,
        input_ids,
        mels,
        mel_masks
    ):
        """
        Get input embeddings for the LLM, replacing audio token positions with audio features from the audio encoder.

        Args:
            input_ids: Token ids; positions equal to `self.audio_token` receive audio embeddings.
            mels: Audio features passed to `audio_encoding`, or `None` for text-only input.
            mel_masks: Mask of valid audio frames passed to `audio_encoding`.

        Returns:
            The embedding tensor for `input_ids`, with the audio-token positions overwritten in place when
            `mels` is provided. The number of audio-token positions must equal the number of audio embeddings.
        """
        hidden_states = self.embeddings(input_ids)
        if mels is None:
            return hidden_states

        audio_embeddings = self.audio_encoding(mels, mel_masks)     # data -> feature
        audio_embeddings = self.adapter(audio_embeddings)
        audio_mask = input_ids == self.audio_token
        # The masked assignment requires matching dtype and device. The adapter can run in a different
        # precision or on a different device than the LLM embeddings, so align before writing.
        hidden_states[audio_mask] = audio_embeddings.to(device=hidden_states.device, dtype=hidden_states.dtype)
        return hidden_states

    def audio_encoding(
        self,
        audio_features: torch.Tensor,
        audio_features_mask: torch.Tensor,
        output_hidden_states: bool = False
    ):
        """
        Run the audio encoder on the valid frames of a padded batch of audio features.

        Args:
            audio_features: Padded audio features.
                NOTE(review): presumably (batch, n_mels, frames) -- confirm against the caller.
            audio_features_mask: Mask of valid frames; its sum over the last axis gives the per-sample length.
            output_hidden_states: Whether the encoder should also return the hidden states of all its layers.

        Returns:
            With `output_hidden_states=False`, the encoded feature tensor (first element of the encoder's
            tuple output). With `output_hidden_states=True`, the encoder output object as returned, carrying
            `last_hidden_state` and `hidden_states`.
        """
        valid_frames = audio_features_mask.bool()
        feature_lens = audio_features_mask.sum(-1).long()  # [batch_size]

        # Keep only the valid frames of every sample, then put the feature axis first for the encoder.
        packed_frames = audio_features.permute(0, 2, 1)[valid_frames]
        input_features = packed_frames.permute(1, 0)

        encoded = self.audio_encoder(
            input_features,
            feature_lens=feature_lens,
            output_hidden_states=output_hidden_states,
            return_dict=output_hidden_states,  # a dict-style output is requested only together with hidden states
        )

        # Without hidden states the encoder returns a plain tuple; unwrap its first element.
        if not output_hidden_states and isinstance(encoded, tuple):
            return encoded[0]
        return encoded

    @property
    def embeddings(self):
        """Token-embedding module of the wrapped LLM (`self.llm.model.embed_tokens`)."""
        backbone = self.llm.model
        return backbone.embed_tokens

    def forward_with_detailed_outputs(
        self,
        input_ids=None,
        attention_mask=None,
        mels=None,
        mel_masks=None,
        past_key_values=None,
        output_hidden_states: bool = True,
        **kwargs
    ):
        """
        Forward pass that returns detailed outputs including:
        - Audio encoder final output
        - Audio features after projector (adapter)
        - Text embedding features
        - Hidden states from each layer (separated for audio and text)

        Args:
            input_ids: Input token ids
            attention_mask: Attention mask
            mels: Audio mel features
            mel_masks: Audio mel masks
            past_key_values: Past key values for generation
            output_hidden_states: Whether to return hidden states from all layers
            **kwargs: Additional arguments

        Returns:
            dict containing:
                - audio_encoder_output: Final output from audio encoder
                - audio_features_after_adapter: Audio features after projector/adapter
                - text_embeddings: Text embedding features (before audio replacement)
                - audio_encoder_hidden_states: Tuple of hidden states from each audio encoder layer
                - llm_hidden_states: Tuple of hidden states from each LLM layer (mixed audio+text)
                - llm_hidden_states_text_only: Tuple of text-only hidden states from each LLM layer
                - llm_hidden_states_audio_only: Tuple of audio-only hidden states from each LLM layer
                - llm_outputs: Full LLM outputs (CausalLMOutputWithPast)
        """
        # Get text embeddings (pure text, before audio replacement)
        # Save original text embeddings for return (will have audio parts removed)
        text_embeddings_pure = self.embeddings(input_ids)

        # Process audio if provided
        audio_encoder_output = None
        audio_features_after_adapter = None
        audio_encoder_hidden_states = None
        audio_mask = None

        # Create embeddings for LLM forward pass (may include audio features)
        input_embeddings_for_llm = text_embeddings_pure.clone()

        # Identify audio token positions (even if no audio is provided, audio tokens may exist in input_ids)
        audio_mask = input_ids == self.audio_token

        if mels is not None:
            # Get audio encoder outputs with hidden states
            audio_encoder_outputs = self.audio_encoding(
                mels,
                mel_masks,
                output_hidden_states=output_hidden_states
            )

            if output_hidden_states:
                audio_encoder_output = audio_encoder_outputs.last_hidden_state
                audio_encoder_hidden_states = audio_encoder_outputs.hidden_states
            else:
                audio_encoder_output = audio_encoder_outputs
                audio_encoder_hidden_states = None

            # Apply adapter
            audio_features_after_adapter = self.adapter(audio_encoder_output)

            # Replace audio token positions with audio embeddings in the LLM input
            input_embeddings_for_llm[audio_mask] = audio_features_after_adapter

        # Remove audio parts from text_embeddings_pure (delete, not set to zero)
        # This ensures returned text_embeddings strictly contains no audio features
        if audio_mask.any():
            # Process each batch separately since audio positions may differ
            batch_size = text_embeddings_pure.shape[0]
            text_embeddings_list = []

            for i in range(batch_size):
                # Get text-only mask for this batch (inverse of audio_mask)
                text_mask = ~audio_mask[i]  # shape: (seq_len,)
                # Extract only text embeddings
                text_emb = text_embeddings_pure[i][text_mask]  # shape: (text_seq_len, hidden_size)
                text_embeddings_list.append(text_emb)

            # Pad sequences to the same length for batching
            # Use the maximum text sequence length across batches
            max_text_len = max(emb.shape[0] for emb in text_embeddings_list) if text_embeddings_list else 0
            if max_text_len > 0:
                hidden_size = text_embeddings_pure.shape[2]
                text_embeddings_pure = torch.zeros(
                    (batch_size, max_text_len, hidden_size),
                    dtype=text_embeddings_pure.dtype,
                    device=text_embeddings_pure.device
                )
                for i, emb in enumerate(text_embeddings_list):
                    text_len = emb.shape[0]
                    text_embeddings_pure[i, :text_len] = emb
            else:
                # No text embeddings (all are audio tokens)
                hidden_size = text_embeddings_pure.shape[2]
                text_embeddings_pure = torch.zeros(
                    (batch_size, 0, hidden_size),
                    dtype=text_embeddings_pure.dtype,
                    device=text_embeddings_pure.device
                )
        # If no audio tokens, text_embeddings_pure remains unchanged

        # Forward through LLM
        if past_key_values is not None:
            # Incremental decoding
            llm_outputs = self.llm(
                input_ids=input_ids,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                output_hidden_states=output_hidden_states,
                return_dict=True,
                **kwargs
            )
        else:
            # First step: use combined embeddings (may include audio features)
            llm_outputs = self.llm(
                inputs_embeds=input_embeddings_for_llm,
                attention_mask=attention_mask,
                past_key_values=None,
                output_hidden_states=output_hidden_states,
                return_dict=True,
                **kwargs
            )

        # Extract LLM hidden states
        llm_hidden_states = llm_outputs.hidden_states if output_hidden_states else None

        # Separate audio and text hidden states if audio is present
        llm_hidden_states_text_only = None
        llm_hidden_states_audio_only = None

        if output_hidden_states and llm_hidden_states is not None and audio_mask is not None:
            # Separate each layer's hidden states into text and audio parts
            llm_hidden_states_text_only = tuple()
            llm_hidden_states_audio_only = tuple()

            batch_size = llm_hidden_states[0].shape[0]
            hidden_size = llm_hidden_states[0].shape[2]

            for layer_hidden_states in llm_hidden_states:
                # layer_hidden_states shape: (batch_size, seq_len, hidden_size)
                # audio_mask shape: (batch_size, seq_len)

                # Process text-only hidden states: delete audio positions, not set to zero
                text_hidden_list = []
                audio_hidden_list = []

                for i in range(batch_size):
                    # Get text-only mask for this batch (inverse of audio_mask)
                    text_mask = ~audio_mask[i]  # shape: (seq_len,)
                    audio_mask_i = audio_mask[i]  # shape: (seq_len,)

                    # Extract only text hidden states (delete audio positions)
                    text_hidden_i = layer_hidden_states[i][text_mask]  # shape: (text_seq_len, hidden_size)
                    text_hidden_list.append(text_hidden_i)

                    # Extract only audio hidden states (delete text positions)
                    audio_hidden_i = layer_hidden_states[i][audio_mask_i]  # shape: (audio_seq_len, hidden_size)
                    audio_hidden_list.append(audio_hidden_i)

                # Pad sequences to the same length for batching
                # Use the maximum text sequence length across batches
                max_text_len = max(emb.shape[0] for emb in text_hidden_list) if text_hidden_list else 0
                max_audio_len = max(emb.shape[0] for emb in audio_hidden_list) if audio_hidden_list else 0

                if max_text_len > 0:
                    text_hidden = torch.zeros(
                        (batch_size, max_text_len, hidden_size),
                        dtype=layer_hidden_states.dtype,
                        device=layer_hidden_states.device
                    )
                    for i, emb in enumerate(text_hidden_list):
                        text_len = emb.shape[0]
                        text_hidden[i, :text_len] = emb
                else:
                    # No text hidden states (all are audio tokens)
                    text_hidden = torch.zeros(
                        (batch_size, 0, hidden_size),
                        dtype=layer_hidden_states.dtype,
                        device=layer_hidden_states.device
                    )

                if max_audio_len > 0:
                    audio_hidden = torch.zeros(
                        (batch_size, max_audio_len, hidden_size),
                        dtype=layer_hidden_states.dtype,
                        device=layer_hidden_states.device
                    )
                    for i, emb in enumerate(audio_hidden_list):
                        audio_len = emb.shape[0]
                        audio_hidden[i, :audio_len] = emb
                else:
                    # No audio hidden states (all are text tokens)
                    audio_hidden = torch.zeros(
                        (batch_size, 0, hidden_size),
                        dtype=layer_hidden_states.dtype,
                        device=layer_hidden_states.device
                    )

                llm_hidden_states_text_only += (text_hidden,)
                llm_hidden_states_audio_only += (audio_hidden,)

        return {
            "audio_encoder_output": audio_encoder_output,
            "audio_features_after_adapter": audio_features_after_adapter,
            "text_embeddings": text_embeddings_pure,  # Return text embeddings with audio parts removed
            "audio_encoder_hidden_states": audio_encoder_hidden_states,
            "llm_hidden_states": llm_hidden_states,
            "llm_hidden_states_text_only": llm_hidden_states_text_only,
            "llm_hidden_states_audio_only": llm_hidden_states_audio_only,
            "llm_outputs": llm_outputs,
        }

    def generate(
        self,
        input_ids,
        attention_mask=None,
        mels=None,
        mel_masks=None,
        generation_config=None,
        **generate_kwargs
    ):
        """
        Generate with the underlying LLM from text tokens plus optional audio.

        The prompt is converted to embeddings once via
        ``self.embedding_with_audio_tokens`` and handed to ``self.llm.generate``
        as ``inputs_embeds``; the generation loop itself (including incremental
        decoding with ``past_key_values``) is delegated to the LLM.

        Args:
            input_ids: Prompt token ids, forwarded to
                ``embedding_with_audio_tokens``.
            attention_mask: Optional attention mask forwarded unchanged to
                ``self.llm.generate``.
            mels: Optional audio features forwarded to
                ``embedding_with_audio_tokens``.
            mel_masks: Optional mask for ``mels``, forwarded to
                ``embedding_with_audio_tokens``.
            generation_config: Optional generation configuration forwarded
                unchanged to ``self.llm.generate``.
            **generate_kwargs: Extra keyword arguments for ``self.llm.generate``.
                ``use_cache`` defaults to ``True`` when not supplied; an explicit
                value from the caller is respected.

        Returns:
            Whatever ``self.llm.generate`` returns.
            NOTE(review): because generation starts from ``inputs_embeds``, the
            returned sequences presumably hold only newly generated tokens --
            confirm against the LLM's ``generate`` implementation.
        """
        # Build the combined prompt embeddings for the initial step.
        input_embeddings = self.embedding_with_audio_tokens(input_ids, mels, mel_masks)

        # Default to cached decoding, but let an explicit caller value win.
        # Passing ``use_cache=True`` next to ``**generate_kwargs`` would raise
        # ``TypeError`` (duplicate keyword) whenever the caller sets ``use_cache``.
        # ``generate_kwargs`` is a fresh dict per call, so mutating it is safe.
        generate_kwargs.setdefault("use_cache", True)

        # Hand over embeddings instead of token ids; the LLM runs the loop.
        return self.llm.generate(
            inputs_embeds=input_embeddings,
            attention_mask=attention_mask,
            generation_config=generation_config,
            **generate_kwargs
        )


class UASAudioEncoderOnly(PreTrainedModel):
    """
    Encoder-only variant of UASAudio: the audio encoder followed by the adapter.

    Input: audio features
    Output: features processed by encoder and adapter
    """
    config_class = UASAudioEncoderOnlyConfig
    main_input_name = "input_features"
    input_modalities = "audio"

    def __init__(self, config: UASAudioEncoderOnlyConfig):
        """
        Build the audio encoder and the adapter from ``config``.

        Args:
            config: Model configuration. Reads ``dtype`` (a ``torch.dtype`` or
                its name as a string; defaults to ``torch.bfloat16`` when the
                attribute is absent), ``audio_encoder_config`` and
                ``hidden_size``.
        """
        super().__init__(config)

        # Resolve the configured dtype with a single lookup so the bfloat16
        # default is actually reachable when the config has no ``dtype``.
        dtype = getattr(config, "dtype", torch.bfloat16)
        if isinstance(dtype, str):
            # Accept both "bfloat16" and the "torch.bfloat16" spelling.
            dtype = getattr(torch, dtype.split(".")[-1])
        self.bf16 = dtype == torch.bfloat16

        self.audio_encoder = UASAudioEncoder(config.audio_encoder_config)

        # The adapter maps the encoder width to the configured hidden size.
        d_model = config.audio_encoder_config.d_model
        self.adapter = Adapter(
            d_model,
            config.hidden_size,
        )

        # Cast both submodules when the model is configured for bfloat16.
        if self.bf16:
            self.audio_encoder = self.audio_encoder.bfloat16()
            self.adapter = self.adapter.bfloat16()

        self.post_init()

    def forward(
        self,
        input_features: torch.Tensor,
        feature_lens: Optional[torch.Tensor] = None,
        **kwargs
    ):
        """
        Forward pass through audio encoder and adapter.

        Args:
            input_features: Audio features tensor of shape (seq_len, num_mel_bins) or (batch, seq_len, num_mel_bins)
            feature_lens: Optional tensor of shape (batch_size,) indicating the length of each sequence
            **kwargs: Additional arguments passed to audio encoder

        Returns:
            torch.Tensor: Features processed by encoder and adapter
        """
        # Encode audio features
        audio_features_encoded = self.audio_encoder(
            input_features,
            feature_lens=feature_lens,
            **kwargs
        )

        # The encoder may return a plain tensor, a tuple (first element is
        # used), or an object exposing ``last_hidden_state``.
        if isinstance(audio_features_encoded, tuple):
            audio_features_encoded = audio_features_encoded[0]
        elif hasattr(audio_features_encoded, "last_hidden_state"):
            audio_features_encoded = audio_features_encoded.last_hidden_state

        # Apply adapter (projector)
        output_features = self.adapter(audio_features_encoded)

        return output_features