# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processor class for Dia"""

import math
from pathlib import Path

from ...audio_utils import AudioInput, make_list_of_audio
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...utils import auto_docstring, is_soundfile_available, is_torch_available


# Soft dependencies: torch is required for the audio tokenizer round-trips,
# soundfile only for `save_audio`. Both are checked lazily at call time.
if is_torch_available():
    import torch

if is_soundfile_available():
    import soundfile as sf


class DiaAudioKwargs(AudioKwargs, total=False):
    """
    bos_token_id (`int`, *optional*, defaults to `1026`):
        The token ID used as the beginning-of-sequence token for audio codebooks. This token is
        prepended to each audio sequence during encoding.
    eos_token_id (`int`, *optional*, defaults to `1024`):
        The token ID used as the end-of-sequence token for audio codebooks. This token is appended
        to audio sequences during training (when `generation=False`) to mark the end of the audio.
    pad_token_id (`int`, *optional*, defaults to `1025`):
        The token ID used for padding audio codebook sequences. This token is used to fill positions
        in the delay pattern where no valid audio token exists.
    delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`):
        A list of delay values (in frames) for each codebook channel. The delay pattern creates
        temporal offsets between different codebook channels, allowing the model to capture
        dependencies across channels. Each value represents the number of frames to delay that
        specific channel.
    generation (`bool`, *optional*, defaults to `True`):
        Whether the processor is being used for generation (text-to-speech) or training. When `True`,
        the processor prepares inputs for generation mode where audio is generated from text. When
        `False`, it prepares inputs for training where both text and audio are provided.
    """

    bos_token_id: int
    eos_token_id: int
    pad_token_id: int
    delay_pattern: list[int]
    generation: bool


class DiaProcessorKwargs(ProcessingKwargs, total=False):
    audio_kwargs: DiaAudioKwargs
    _defaults = {
        "text_kwargs": {
            "padding": True,
            "padding_side": "right",
            "add_special_tokens": False,
        },
        "audio_kwargs": {
            "eos_token_id": 1024,
            "pad_token_id": 1025,
            "bos_token_id": 1026,
            "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15],
            "generation": True,
            "sampling_rate": 44100,
        },
        "common_kwargs": {
            "return_tensors": "pt",
        },
    }


@auto_docstring
class DiaProcessor(ProcessorMixin):
    audio_tokenizer_class = "DacModel"

    def __init__(self, feature_extractor, tokenizer, audio_tokenizer):
        r"""
        audio_tokenizer (`DacModel`):
            An instance of [`DacModel`] used to encode/decode audio into/from codebooks.
            It is a required input.
        """
        super().__init__(feature_extractor, tokenizer, audio_tokenizer=audio_tokenizer)

    @auto_docstring
    def __call__(
        self,
        text: str | list[str],
        audio: AudioInput | None = None,
        output_labels: bool | None = False,
        **kwargs: Unpack[DiaProcessorKwargs],
    ):
        r"""
        output_labels (`bool`, *optional*, defaults to `False`):
            Whether to return labels for training. When `True`, the processor generates labels from
            the decoder input sequence by shifting it by one position. Labels use special values:
            `-100` for tokens to ignore in loss computation (padding and BOS tokens), and `-101` for
            audio frames used only for the backbone model (when `depth_decoder_labels_ratio < 1.0`).
            Cannot be used together with `generation=True`.
        """
        if not is_torch_available():
            raise ValueError(
                "The `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't "
                "find it in your environment. You can install torch via `pip install torch`."
            )
        if text is None:
            raise ValueError("You need to specify the `text` input to process.")

        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )
        text_kwargs = output_kwargs["text_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]

        # Only torch tensors are supported downstream (delay-pattern indexing is torch-based).
        return_tensors = text_kwargs.get("return_tensors", None)
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        data = {}

        # Text
        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
        encodings = self.tokenizer(text, **text_kwargs)
        data.update(encodings)

        # Audio
        # Pop the Dia-specific audio kwargs so only feature-extractor kwargs remain in `audio_kwargs`.
        delay_pattern = audio_kwargs.pop("delay_pattern", None)
        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
        audio_eos_token_id = audio_kwargs.pop("eos_token_id", None)
        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
        generation = audio_kwargs.pop("generation", True)
        if (
            audio_bos_token_id is None
            or audio_eos_token_id is None
            or audio_pad_token_id is None
            or delay_pattern is None
        ):
            raise ValueError(
                "To enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, "
                "`pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those."
            )
        if generation and output_labels:
            raise ValueError(
                f"Labels with `generation` is incompatible, got generation={generation}, output_labels={output_labels}."
            )

        batch_size = data["input_ids"].shape[0]
        num_channels = len(delay_pattern)
        max_delay = max(delay_pattern)

        # Voice cloning generation / general training
        if audio is not None:
            audio = make_list_of_audio(audio)
            input_audios = self.feature_extractor(audio, **audio_kwargs)

            # Number of raw samples per encoded frame of the audio tokenizer.
            compression_rate = math.prod(self.audio_tokenizer.config.downsampling_ratios)
            max_encoded_sequence_len = input_audios["padding_mask"][0].shape[-1] // compression_rate

            decoder_input_ids = []
            decoder_attention_mask = []
            # TODO: dac with batching is currently broken, but non-batch is working
            # refer to https://gist.github.com/vasqu/643a45b680cf39fd7467271ee2eb6f80 for a validation script
            for padding_mask, audio in zip(input_audios["padding_mask"], input_audios["input_values"]):
                # get current length with hop length in mind (as if it were sampled as a single audio)
                base_pad_len = self.feature_extractor.hop_length
                current_audio_len = math.ceil(padding_mask.sum(dim=-1) / base_pad_len) * base_pad_len
                encoded_sequence_len = current_audio_len // compression_rate
                padding_len = max_encoded_sequence_len - encoded_sequence_len

                # compute non-padded forward pass; one extra bos (and eos if training) is added
                with torch.no_grad():
                    audio = audio[None, ..., :current_audio_len].to(self.audio_tokenizer.device)
                    # (batch, channels, seq) -> (batch, seq, channels)
                    input_ids = self.audio_tokenizer.encode(audio).audio_codes.transpose(1, 2)

                if not generation:
                    # append one eos frame on the sequence dim (training only)
                    input_ids = torch.nn.functional.pad(
                        input_ids, pad=(0, 0, 0, 1, 0, 0), mode="constant", value=audio_eos_token_id
                    )

                # apply padding
                # +1 for the bos within the real sequence
                input_ids = torch.nn.functional.pad(
                    input_ids, pad=(0, 0, padding_len + 1, 0, 0, 0), mode="constant", value=audio_bos_token_id
                )
                num_valid_inputs = encoded_sequence_len + 1 + max_delay  # sequence + bos + delay
                num_valid_inputs += 0 if generation else 1  # eos if training
                attention_mask = torch.tensor([0] * padding_len + [1] * num_valid_inputs, dtype=torch.long)[None, :]

                decoder_input_ids.append(input_ids)
                decoder_attention_mask.append(attention_mask)

            # per-sample lengths all equal max_encoded_sequence_len (+bos/+delay), so cat on batch dim is safe
            decoder_input_ids = torch.cat(decoder_input_ids, dim=0)
            decoder_attention_mask = torch.cat(decoder_attention_mask, dim=0)
        # TTS generation
        elif generation:
            # all bos to start with TTS
            decoder_input_ids = torch.full((batch_size, 1, num_channels), audio_bos_token_id, dtype=torch.long)
            # we preemptively add the delay
            decoder_attention_mask = torch.ones(size=(batch_size, 1 + max_delay), dtype=torch.long)
        else:
            raise ValueError("If you try to train, you should provide audio data as well.")

        if batch_size != decoder_input_ids.shape[0]:
            raise ValueError(
                f"Need the same amount of samples for both text and audio, but got text samples={batch_size} and "
                f"audio samples = {decoder_input_ids.shape[0]} instead."
            )

        # prepare shift indices per delay
        max_seq_len = decoder_attention_mask.shape[-1]
        max_audio_len = max_seq_len - max_delay
        precomputed_idx = self.build_indices(
            bsz=batch_size,
            seq_len=max_seq_len,
            num_channels=num_channels,
            delay_pattern=delay_pattern,
            revert=False,
        )

        # create delay pattern input
        # the pad token will be used for masking which input is valid for prediction during generation
        prefill = torch.full(
            (batch_size, max_seq_len, num_channels),
            fill_value=audio_pad_token_id,
            dtype=torch.int,
        )
        prefill[:, :max_audio_len] = decoder_input_ids
        delayed_decoder_input_ids = self.apply_audio_delay(
            audio=prefill,
            pad_token_id=audio_pad_token_id,
            bos_token_id=audio_bos_token_id,
            precomputed_idx=precomputed_idx,
        )
        data.update({"decoder_input_ids": delayed_decoder_input_ids, "decoder_attention_mask": decoder_attention_mask})

        if output_labels:
            # Base idea is to shift on the sequence dim
            labels = data["decoder_input_ids"].clone()[:, 1:]
            # pad and bos positions are ignored by the loss
            labels[labels == audio_pad_token_id] = -100
            labels[labels == audio_bos_token_id] = -100
            # flatten channels into the batch dim: (bsz * channels, seq)
            data["labels"] = labels.transpose(1, 2).reshape(batch_size * num_channels, -1).contiguous().long()
            # drop the last position so inputs and shifted labels align
            data["decoder_input_ids"] = data["decoder_input_ids"][:, :-1]
            data["decoder_attention_mask"] = data["decoder_attention_mask"][:, :-1]

        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(
        self,
        decoder_input_ids: "torch.Tensor",
        audio_prompt_len: int | None = None,
        **kwargs: Unpack[DiaProcessorKwargs],
    ) -> list["torch.Tensor"]:
        """
        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
        `audio_tokenizer`. See [`~DacModel.decode`] for more information.

        Args:
            decoder_input_ids (`torch.Tensor`):
                The complete output sequence of the decoder.
            audio_prompt_len (`int`):
                The audio prefix length (e.g. when using voice cloning).
        """
        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )
        audio_kwargs = output_kwargs["audio_kwargs"]

        delay_pattern = audio_kwargs.pop("delay_pattern", None)
        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
        if audio_bos_token_id is None or audio_pad_token_id is None or delay_pattern is None:
            raise ValueError(
                "To enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, "
                "and `delay_pattern`. You may have accidentally overwritten one of those."
            )

        # either decode the whole audio sequence or only the generated parts
        if audio_prompt_len is not None:
            audio_prompt_len = torch.tensor(audio_prompt_len, device=decoder_input_ids.device, dtype=torch.long)
            start_of_generation_idx = audio_prompt_len[None].expand(decoder_input_ids.shape[0])
        else:
            # count leading bos frames on channel 0 to find where generation started
            start_of_generation_idx = (decoder_input_ids[:, :, 0] == audio_bos_token_id).sum(dim=-1)
        # -1 for the eos token
        end_of_generation_idx = (
            decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == audio_pad_token_id).sum(dim=-1) - 1
        )

        # revert delay
        bsz, seq_len, num_channels = decoder_input_ids.shape
        precomputed_idx = self.build_indices(
            bsz=bsz,
            seq_len=seq_len,
            num_channels=num_channels,
            delay_pattern=delay_pattern,
            revert=True,
        )
        output_sequences = self.apply_audio_delay(
            audio=decoder_input_ids,
            # We do not care about these values as we cut them out
            # with `start_of_generation_idx` and `end_of_generation_idx`
            pad_token_id=-1,
            bos_token_id=-1,
            precomputed_idx=precomputed_idx,
        ).transpose(1, 2)

        # retrieve the correct sequences each
        audios = []
        # TODO: see above, dac doesn't work in batches yet
        with torch.no_grad():
            for i in range(start_of_generation_idx.shape[0]):
                output_i = output_sequences[i, :, start_of_generation_idx[i] : end_of_generation_idx[i]][None, ...]
                output_i = output_i.to(self.audio_tokenizer.device)
                audio_i = self.audio_tokenizer.decode(audio_codes=output_i).audio_values.cpu().squeeze()
                audios.append(audio_i)

        return audios

    def decode(
        self,
        decoder_input_ids: "torch.Tensor",
        audio_prompt_len: int | None = None,
        **kwargs: Unpack[DiaProcessorKwargs],
    ) -> "torch.Tensor":
        """
        Decodes a single sequence of audio codebooks into the respective audio waveform via the
        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
        """
        if decoder_input_ids.shape[0] != 1:
            raise ValueError(
                f"Expecting a single output to be decoded but received {decoder_input_ids.shape[0]} samples instead."
            )
        return self.batch_decode(decoder_input_ids, audio_prompt_len, **kwargs)[0]

    def get_audio_prompt_len(
        self,
        decoder_attention_mask: "torch.Tensor",
        **kwargs: Unpack[DiaProcessorKwargs],
    ) -> int:
        """Utility function to get the audio prompt length."""
        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )
        audio_kwargs = output_kwargs["audio_kwargs"]
        delay_pattern = audio_kwargs.pop("delay_pattern", None)
        if delay_pattern is None:
            raise ValueError(
                "To enable the utility of retrieving the prompt length for Dia, we need the "
                "`delay_pattern`. You may have accidentally overwritten this."
            )
        # the mask length includes the preemptively added delay frames — subtract them back out
        return decoder_attention_mask.shape[1] - max(delay_pattern)

    # Copied from transformers.models.csm.processing_csm.CsmProcessor.save_audio with Csm->Dia
    def save_audio(
        self,
        audio: AudioInput,
        saving_path: str | Path | list[str | Path],
        **kwargs: Unpack[DiaProcessorKwargs],
    ):
        """Save one or several waveforms to disk at the processor's sampling rate."""
        # TODO: @eustlb, this should be in AudioProcessor
        if not is_soundfile_available():
            raise ImportError("Please install `soundfile` to save audio files.")

        # ensure correct audio input
        audio = make_list_of_audio(audio)

        # ensure correct saving path
        if isinstance(saving_path, (str, Path)):
            saving_path = [saving_path]
        elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)):
            raise ValueError("Invalid input path. Please provide a string, or a list of strings")

        if len(audio) != len(saving_path):
            raise ValueError("The number of audio and saving paths must be the same")

        output_kwargs = self._merge_kwargs(
            DiaProcessorKwargs,
            **kwargs,
        )
        audio_kwargs = output_kwargs["audio_kwargs"]
        sampling_rate = audio_kwargs["sampling_rate"]

        for audio_value, p in zip(audio, saving_path):
            if isinstance(audio_value, torch.Tensor):
                audio_value = audio_value.cpu().float().numpy()
            sf.write(p, audio_value, sampling_rate)

    @staticmethod
    def build_indices(
        bsz: int,
        seq_len: int,
        num_channels: int,
        delay_pattern: list[int],
        revert: bool = False,
    ) -> tuple["torch.Tensor", "torch.Tensor"]:
        """
        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.

        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
        """
        delay_array = torch.tensor(delay_pattern, dtype=torch.int32)

        # (0..seq_len-1)
        sequence_idx = torch.arange(seq_len, dtype=torch.int32)[None, :].expand(bsz, seq_len)[..., None]
        # + or - delay depending if we delay or revert the delay
        if not revert:
            sequence_idx = sequence_idx - delay_array[None, None, :]
        else:
            sequence_idx = sequence_idx + delay_array[None, None, :]

        # if delay goes over the range we clamp back to valid values
        valid_sequence_idx = torch.clamp(sequence_idx, 0, seq_len - 1)

        batch_idx = torch.arange(bsz, dtype=torch.int32)[:, None, None].expand(bsz, seq_len, num_channels)
        channel_idx = torch.arange(num_channels, dtype=torch.int32)[None, None, :].expand(bsz, seq_len, num_channels)

        # flat (batch, seq, channel) index triples for advanced indexing in `apply_audio_delay`
        all_idx = torch.stack(
            [batch_idx.reshape(-1), valid_sequence_idx.reshape(-1), channel_idx.reshape(-1)],
            dim=1,
        ).long()

        return sequence_idx, all_idx

    @staticmethod
    def apply_audio_delay(
        audio: "torch.Tensor",
        pad_token_id: int,
        bos_token_id: int,
        precomputed_idx: tuple["torch.Tensor", "torch.Tensor"],
    ) -> "torch.Tensor":
        """
        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.

        Args:
            audio: audio tokens of shape [bsz, seq_len, num_channels]
            pad_token_id: the PAD token
            bos_token_id: the BOS token
            precomputed_idx: from `build_indices`

        Returns:
            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
        """
        # Move everything to the same device
        device = audio.device
        sequence_idx, all_idx = precomputed_idx
        sequence_idx = sequence_idx.to(device)
        all_idx = all_idx.to(device)

        # Gather per precomputed indices
        batch_idx, valid_sequence_idx, channel_idx = torch.unbind(all_idx, dim=-1)
        gathered_audio = audio[batch_idx, valid_sequence_idx, channel_idx].view(audio.size())

        # Mask according to negative sequence_idx => BOS; sequence_idx >= seq_len => PAD
        mask_bos = sequence_idx < 0
        mask_pad = sequence_idx >= audio.shape[1]
        final_audio = torch.where(mask_bos, bos_token_id, torch.where(mask_pad, pad_token_id, gathered_audio))

        return final_audio


__all__ = ["DiaProcessor"]