This model is part of the "Xcodec and Xcodec2" collection: Transformers-supported versions of the X-Codec models listed at https://github.com/zhenye234/xcodec?tab=readme-ov-file#available-models.
This codec is part of the X-Codec family of codecs, shown below:

| Model checkpoint | Semantic Model | Domain | Training Data |
|---|---|---|---|
| xcodec-hubert-librispeech | facebook/hubert-base-ls960 | Speech | LibriSpeech |
| xcodec-wavlm-mls (this model) | microsoft/wavlm-base-plus | Speech | MLS English |
| xcodec-wavlm-more-data | microsoft/wavlm-base-plus | Speech | MLS English + internal data |
| xcodec-hubert-general | ZhenYe234/hubert_base_general_audio | General audio | 200k hours of internal data |
| xcodec-hubert-general-balanced | ZhenYe234/hubert_base_general_audio | General audio | More balanced data |

The original checkpoint is xcodec_wavlm_mls from this table.
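Any sibling checkpoint from the family can be loaded the same way by swapping the checkpoint id. A minimal sketch, assuming the other checkpoints are published under the same `hf-audio` namespace (the repo id below is illustrative; verify the exact ids in the collection linked above):

```python
from transformers import AutoFeatureExtractor, XcodecModel

# Hypothetical repo id -- assumes the general-audio checkpoint lives under
# the same hf-audio namespace as this model; check the collection page.
checkpoint = "hf-audio/xcodec-hubert-general"

model = XcodecModel.from_pretrained(checkpoint)
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
```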
The example below encodes and decodes an audio sample at every supported bandwidth.
```python
import os

import torch
from datasets import Audio, load_dataset
from scipy.io.wavfile import write as write_wav
from transformers import AutoFeatureExtractor, XcodecModel

model_id = "hf-audio/xcodec-wavlm-mls"
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
available_bandwidths = [0.5, 1, 1.5, 2, 4]

# load model
model = XcodecModel.from_pretrained(model_id, device_map=torch_device)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

# load audio example
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
librispeech_dummy = librispeech_dummy.cast_column(
    "audio", Audio(sampling_rate=feature_extractor.sampling_rate)
)
audio_array = librispeech_dummy[0]["audio"]["array"]
inputs = feature_extractor(
    raw_audio=audio_array, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt"
).to(model.device)
audio = inputs["input_values"]

for bandwidth in available_bandwidths:
    print(f"Encoding with bandwidth: {bandwidth} kbps")

    # encode
    audio_codes = model.encode(audio, bandwidth=bandwidth, return_dict=False)
    print("Codebook shape", audio_codes.shape)
    # 0.5 kbps -> torch.Size([1, 1, 293])
    # 1.0 kbps -> torch.Size([1, 2, 293])
    # 1.5 kbps -> torch.Size([1, 3, 293])
    # 2.0 kbps -> torch.Size([1, 4, 293])
    # 4.0 kbps -> torch.Size([1, 8, 293])

    # decode
    input_values_dec = model.decode(audio_codes).audio_values

    # save reconstructed audio to file
    write_wav(
        f"{os.path.basename(model_id)}_{bandwidth}.wav",
        feature_extractor.sampling_rate,
        input_values_dec.squeeze().detach().cpu().numpy(),
    )

# save the original audio for comparison
write_wav("original.wav", feature_extractor.sampling_rate, audio.squeeze().detach().cpu().numpy())
```
Audio samples on the model page compare the original recording with the reconstructions at 0.5, 1, 1.5, 2, and 4 kbps.
The codec also supports batched inputs; the feature extractor pads all samples in the batch to a common length:

```python
import torch
from datasets import Audio, load_dataset
from transformers import AutoFeatureExtractor, XcodecModel

model_id = "hf-audio/xcodec-wavlm-mls"
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
bandwidth = 4
n_audio = 2  # number of audio samples to process in a batch

# load model
model = XcodecModel.from_pretrained(model_id, device_map=torch_device)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

# load audio examples
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column(
    "audio", Audio(sampling_rate=feature_extractor.sampling_rate)
)
audio = [audio_sample["array"] for audio_sample in ds[-n_audio:]["audio"]]
print(f"Input audio shape: {[_sample.shape for _sample in audio]}")
# Input audio shape: [(113840,), (71680,)]

inputs = feature_extractor(
    raw_audio=audio, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt"
).to(model.device)
audio = inputs["input_values"]
print(f"Padded audio shape: {audio.shape}")
# Padded audio shape: torch.Size([2, 1, 113920])

# encode
audio_codes = model.encode(audio, bandwidth=bandwidth, return_dict=False)
print("Codebook shape", audio_codes.shape)
# Codebook shape torch.Size([2, 8, 356])

# decode
decoded_audio = model.decode(audio_codes).audio_values
print("Decoded audio shape", decoded_audio.shape)
# Decoded audio shape torch.Size([2, 1, 113920])
```
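Because the feature extractor zero-pads the batch to a common length, the decoded waveforms come back at the padded length (113920 samples here). A small sketch of trimming each reconstruction back to its original length, continuing from the example above; the lengths are the ones printed before padding, and in practice you would capture them yourself before calling the feature extractor:

```python
# lengths as printed above, e.g. captured beforehand with
#   lengths = [sample.shape[0] for sample in audio]
lengths = [113840, 71680]

trimmed = [
    decoded_audio[i, 0, : lengths[i]].detach().cpu().numpy()
    for i in range(len(lengths))
]
print([t.shape for t in trimmed])
# [(113840,), (71680,)]
```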
Base model: microsoft/wavlm-base-plus