|
|
import transformers |
|
|
|
|
|
from dataclasses import dataclass, field |
|
|
from typing import Optional |
|
|
|
|
|
|
|
|
@dataclass |
|
|
class ModelArguments: |
|
|
model_name_or_path: Optional[str] = field(default="facebook/opt-125m") |
|
|
version: Optional[str] = field(default="v0") |
|
|
freeze_backbone: bool = field(default=False) |
|
|
tune_speech_projector: bool = field(default=False) |
|
|
tune_speech_encoder: bool = field(default=False) |
|
|
tune_speech_generator_only: bool = field(default=False) |
|
|
speech_encoder_type: Optional[str] = field(default=None) |
|
|
speech_encoder: Optional[str] = field(default=None) |
|
|
pretrain_speech_projector: Optional[str] = field(default=None) |
|
|
speech_projector_type: Optional[str] = field(default='linear') |
|
|
speech_generator_type: Optional[str] = field(default='ctc') |
|
|
ctc_decoder_config: str = "(2,4096,32,11008)" |
|
|
ctc_upsample_factor: int = 1 |
|
|
ctc_loss_weight: float = 1.0 |
|
|
unit_vocab_size: int = 1000 |
|
|
speech_encoder_ds_rate: int = 5 |
|
|
speech_encoder_hidden_size: int = 1280 |
|
|
|
|
|
|
|
|
@dataclass |
|
|
class DataArguments: |
|
|
data_path: str = field(default=None, |
|
|
metadata={"help": "Path to the training data."}) |
|
|
is_multimodal: bool = False |
|
|
input_type: str = field(default="mel") |
|
|
speech_normalize: bool = False |
|
|
mel_size: int = 128 |
|
|
has_tgt_units: bool = False |
|
|
|
|
|
|
|
|
@dataclass |
|
|
class TrainingArguments(transformers.TrainingArguments): |
|
|
cache_dir: Optional[str] = field(default=None) |
|
|
optim: str = field(default="adamw_torch") |
|
|
freeze_speech_projector: bool = field(default=False) |
|
|
model_max_length: int = field( |
|
|
default=512, |
|
|
metadata={ |
|
|
"help": |
|
|
"Maximum sequence length. Sequences will be right padded (and possibly truncated)." |
|
|
}, |
|
|
) |
|
|
double_quant: bool = field( |
|
|
default=True, |
|
|
metadata={"help": "Compress the quantization statistics through double quantization."} |
|
|
) |
|
|
quant_type: str = field( |
|
|
default="nf4", |
|
|
metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} |
|
|
) |
|
|
bits: int = field( |
|
|
default=16, |
|
|
metadata={"help": "How many bits to use."} |
|
|
) |
|
|
lora_enable: bool = False |
|
|
lora_r: int = 64 |
|
|
lora_alpha: int = 16 |
|
|
lora_dropout: float = 0.05 |
|
|
lora_weight_path: str = "" |
|
|
lora_bias: str = "none" |
|
|
speech_projector_lr: Optional[float] = None |
|
|
group_by_modality_length: bool = field(default=False) |