# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/t5gemma2/modular_t5gemma2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_t5gemma2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections.abc import Callable
from typing import Optional

import torch
import torch.nn as nn

from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache
from ...generation import GenerationConfig, GenerationMixin, GenerationMode
from ...integrations import use_kernel_func_from_hub, use_kernelized_func
from ...masking_utils import create_bidirectional_mask, create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
from ..auto import AutoModel
from .configuration_t5gemma2 import T5Gemma2Config, T5Gemma2DecoderConfig, T5Gemma2EncoderConfig, T5Gemma2TextConfig


class T5Gemma2RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        # Llama does x.to(float16) * w whilst T5Gemma2 is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class T5Gemma2MLP(nn.Module):
    def __init__(self, config: T5Gemma2TextConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]
        self.dropout = nn.Dropout(config.dropout_rate)
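    # The forward pass below implements a gated (GeGLU-style) feed-forward block:
    #     down_proj(dropout(act_fn(gate_proj(x)) * up_proj(x)))
    # Minimal shape sketch with hypothetical sizes (illustrative only, not taken from any real config):
    #     x                         -> (batch, seq_len, hidden_size),       e.g. (2, 16, 256)
    #     gate_proj(x) / up_proj(x) -> (batch, seq_len, intermediate_size), e.g. (2, 16, 1024)
    #     down_proj(...)            -> (batch, seq_len, hidden_size),       e.g. (2, 16, 256)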
def forward(self, x): hidden_states = self.act_fn(self.gate_proj(x)) * self.up_proj(x) hidden_states = self.dropout(hidden_states) down_proj = self.down_proj(hidden_states) return down_proj class T5Gemma2RotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: T5Gemma2TextConfig, device=None): super().__init__() self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config self.layer_types = list(set(config.layer_types)) self.rope_type = {} for layer_type in self.layer_types: rope_params = self.config.rope_parameters[layer_type] if rope_params is None: continue self.rope_type[layer_type] = rope_params["rope_type"] rope_init_fn: Callable = self.compute_default_rope_parameters if self.rope_type[layer_type] != "default": rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]] curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type) self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False) self.register_buffer(f"{layer_type}_original_inv_freq", curr_inv_freq.clone(), persistent=False) setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling) @staticmethod def compute_default_rope_parameters( config: T5Gemma2TextConfig | None = None, device: Optional["torch.device"] = None, seq_len: int | None = None, layer_type: str | None = None, ) -> tuple["torch.Tensor", float]: """ Computes the inverse frequencies according to the original RoPE implementation Args: config ([`~transformers.PreTrainedConfig`]): The model configuration. device (`torch.device`): The device to use for initialization of the inverse frequencies. seq_len (`int`, *optional*): The current sequence length. Unused for this type of RoPE. layer_type (`str`, *optional*): The current layer type if the model has different RoPE parameters per type. Should not be used unless `config.layer_types is not None` Returns: Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ # For backward compatibility standardize the `rope_parameters_dict` if it uses old format base = config.rope_parameters[layer_type]["rope_theta"] dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads attention_factor = 1.0 # Unused in this type of RoPE # Compute the inverse frequencies inv_freq = 1.0 / ( base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) ) return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) def forward(self, x, position_ids, layer_type=None): inv_freq = getattr(self, f"{layer_type}_inv_freq") attention_scaling = getattr(self, f"{layer_type}_attention_scaling") inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) position_ids_expanded = position_ids[:, None, :].float() device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * attention_scaling sin = emb.sin() * attention_scaling return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) @use_kernel_func_from_hub("rotary_pos_emb") def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. Args: q (`torch.Tensor`): The query tensor. k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. unsqueeze_dim (`int`, *optional*, defaults to 1): The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ cos = cos.unsqueeze(unsqueeze_dim) sin = sin.unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) """ batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) def eager_attention_forward( module: nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, dropout: float = 0.0, scaling: float | None = None, softcap: float | None = None, **kwargs, ) -> tuple[torch.Tensor, torch.Tensor]: if scaling is None: scaling = module.head_dim**-0.5 key_states = repeat_kv(key, module.num_key_value_groups) value_states = repeat_kv(value, module.num_key_value_groups) attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if softcap is not None: attn_weights = attn_weights / softcap attn_weights = torch.tanh(attn_weights) attn_weights = attn_weights * softcap if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() return attn_output, attn_weights @use_kernelized_func(apply_rotary_pos_emb) class T5Gemma2SelfAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config: T5Gemma2TextConfig, layer_idx: int): super().__init__() self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.scaling = config.query_pre_attn_scalar**-0.5 self.attention_dropout = self.config.attention_dropout self.is_causal = False # Only used by the encoder self.q_proj = nn.Linear( config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias ) self.k_proj = nn.Linear( config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias ) self.v_proj = nn.Linear( config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias ) self.o_proj = nn.Linear( config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) self.attn_logit_softcapping = self.config.attn_logit_softcapping self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None self.is_sliding = self.layer_type == "sliding_attention" self.q_norm = T5Gemma2RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) self.k_norm = T5Gemma2RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) def forward( self, hidden_states: torch.Tensor, position_embeddings: torch.Tensor = None, attention_mask: torch.Tensor | None = None, past_key_values: Cache | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: input_shape = hidden_states.shape[:-1] hidden_shape = 
(*input_shape, -1, self.head_dim) query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) query_states = self.q_norm(query_states) key_states = self.k_norm(key_states) cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward ) attn_output, attn_weights = attention_interface( self, query_states, key_states, value_states, attention_mask, dropout=self.attention_dropout if self.training else 0.0, scaling=self.scaling, sliding_window=self.sliding_window, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() attn_output = self.o_proj(attn_output) return attn_output, attn_weights @use_kernelized_func(apply_rotary_pos_emb) class T5Gemma2MergedAttention(nn.Module): """Merged self-attention and cross-attention for decoder.""" def __init__(self, config: T5Gemma2TextConfig, layer_idx: int): super().__init__() self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.scaling = config.query_pre_attn_scalar**-0.5 self.attention_dropout = self.config.attention_dropout self.is_causal = False # Fused causal and encoder mask self.q_proj = nn.Linear( config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias ) self.k_proj = nn.Linear( config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias ) self.v_proj = nn.Linear( config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias ) self.o_proj = nn.Linear( config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) self.attn_logit_softcapping = self.config.attn_logit_softcapping self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None self.is_sliding = self.layer_type == "sliding_attention" self.q_norm = T5Gemma2RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) self.k_norm = T5Gemma2RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) def forward( self, # decoder self-attention inputs hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], merged_attention_mask: torch.Tensor | None, # cross-attention inputs encoder_hidden_states: torch.Tensor, # cache inputs past_key_values: EncoderDecoderCache | None = None, cache_position: torch.LongTensor | None = None, # others **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]: # attention shapes. input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) cross_input_shape = encoder_hidden_states.shape[:-1] cross_hidden_shape = (*cross_input_shape, -1, self.head_dim) # self-attention. 
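        # Descriptive overview of the merged attention below (comments only): (1) project and normalize the
        # decoder queries/keys/values and apply RoPE (self-attention path); (2) project and normalize the
        # encoder keys/values, computed once per generation and stored in
        # `past_key_values.cross_attention_cache` (cross-attention path); (3) concatenate the self- and
        # cross-attention keys/values along the sequence dimension and run a single attention call with the
        # merged [causal | encoder-bidirectional] mask, splitting the returned weights back into self- and
        # cross-attention weights at the end.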
query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) query_states = self.q_norm(query_states) key_states = self.k_norm(key_states) cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: # self-attention. # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} self_attention_cache = past_key_values.self_attention_cache key_states, value_states = self_attention_cache.update( key_states, value_states, self.layer_idx, cache_kwargs ) # cross-attention. is_updated = past_key_values.is_updated.get(self.layer_idx) cross_attention_cache = past_key_values.cross_attention_cache if past_key_values is None or not is_updated: cross_key_states = self.k_proj(encoder_hidden_states).view(cross_hidden_shape).transpose(1, 2) cross_value_states = self.v_proj(encoder_hidden_states).view(cross_hidden_shape).transpose(1, 2) cross_key_states = self.k_norm(cross_key_states) if past_key_values is not None: cross_key_states, cross_value_states = cross_attention_cache.update( cross_key_states, cross_value_states, self.layer_idx ) past_key_values.is_updated[self.layer_idx] = True else: cross_key_states = cross_attention_cache.layers[self.layer_idx].keys cross_value_states = cross_attention_cache.layers[self.layer_idx].values # merged attention. query_states = query_states cross_key_size = cross_input_shape[1] key_states = torch.cat([key_states, cross_key_states], dim=2) value_states = torch.cat([value_states, cross_value_states], dim=2) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward ) attn_output, attn_weights = attention_interface( self, query_states, key_states, value_states, merged_attention_mask, dropout=self.attention_dropout if self.training else 0.0, scaling=self.scaling, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() attn_output = self.o_proj(attn_output) # decompose merged attention weights into self & cross attention weights if attn_weights is not None: self_attn_weights = attn_weights[..., :-cross_key_size] cross_attn_weights = attn_weights[..., -cross_key_size:] else: self_attn_weights, cross_attn_weights = None, None return attn_output, self_attn_weights, cross_attn_weights class T5Gemma2EncoderLayer(GradientCheckpointingLayer): """Encoder sub-layer.""" def __init__(self, config, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size self.config = config self.layer_idx = layer_idx self.attention_type = config.layer_types[layer_idx] self.self_attn = T5Gemma2SelfAttention( config=config, layer_idx=layer_idx, ) self.pre_self_attn_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_self_attn_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.mlp = T5Gemma2MLP(config) self.pre_feedforward_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.dropout = nn.Dropout(config.dropout_rate) def forward( self, hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, 
position_ids: torch.LongTensor | None = None, **kwargs, ) -> tuple[torch.FloatTensor,]: residual = hidden_states hidden_states = self.pre_self_attn_layernorm(hidden_states) hidden_states, _ = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, attention_mask=attention_mask, position_ids=position_ids, past_key_values=None, **kwargs, ) hidden_states = self.post_self_attn_layernorm(hidden_states) hidden_states = residual + self.dropout(hidden_states) residual = hidden_states hidden_states = self.pre_feedforward_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = self.post_feedforward_layernorm(hidden_states) hidden_states = residual + self.dropout(hidden_states) return hidden_states class T5Gemma2DecoderLayer(GradientCheckpointingLayer): """Decoder sub-layer: merged attention instead of vanilla self-attention.""" def __init__(self, config, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size self.config = config self.layer_idx = layer_idx self.attention_type = config.layer_types[layer_idx] # replace vanilla self-attention with merged attention to support joint cross-attention. self.self_attn = T5Gemma2MergedAttention( config=config, layer_idx=layer_idx, ) self.pre_self_attn_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_self_attn_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.mlp = T5Gemma2MLP(config) self.pre_feedforward_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.dropout = nn.Dropout(config.dropout_rate) def forward( self, hidden_states: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], merged_attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, past_key_values: EncoderDecoderCache | None = None, use_cache: bool | None = False, cache_position: torch.LongTensor | None = None, encoder_hidden_states: torch.Tensor | None = None, **kwargs, ) -> torch.FloatTensor: residual = hidden_states hidden_states = self.pre_self_attn_layernorm(hidden_states) hidden_states, _, _ = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, merged_attention_mask=merged_attention_mask, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, encoder_hidden_states=encoder_hidden_states, **kwargs, ) hidden_states = self.post_self_attn_layernorm(hidden_states) hidden_states = residual + self.dropout(hidden_states) residual = hidden_states hidden_states = self.pre_feedforward_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = self.post_feedforward_layernorm(hidden_states) hidden_states = residual + self.dropout(hidden_states) return hidden_states class T5Gemma2LMHead(nn.Module): """Head for language modeling (generation) tasks.""" def __init__(self, hidden_size: int, vocab_size: int, bias: bool = False): super().__init__() self.out_proj = nn.Linear(hidden_size, vocab_size, bias=bias) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: logits = self.out_proj(hidden_states) return logits class T5Gemma2ClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, hidden_size: int, num_labels: int, classifier_dropout_rate: float = 0.0): super().__init__() self.dropout = nn.Dropout(p=classifier_dropout_rate) self.out_proj = 
nn.Linear(hidden_size, num_labels) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dropout(hidden_states) hidden_states = self.out_proj(hidden_states) return hidden_states class T5Gemma2MultiModalProjector(nn.Module): def __init__(self, config: T5Gemma2EncoderConfig): super().__init__() self.mm_input_projection_weight = nn.Parameter( torch.zeros(config.vision_config.hidden_size, config.text_config.hidden_size) ) self.mm_soft_emb_norm = T5Gemma2RMSNorm( config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps ) self.patches_per_image = int(config.vision_config.image_size // config.vision_config.patch_size) self.tokens_per_side = int(config.mm_tokens_per_image**0.5) self.kernel_size = self.patches_per_image // self.tokens_per_side self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size) def forward(self, vision_outputs: torch.Tensor): batch_size, _, hidden_size = vision_outputs.shape reshaped_vision_outputs = vision_outputs.transpose(1, 2) reshaped_vision_outputs = reshaped_vision_outputs.reshape( batch_size, hidden_size, self.patches_per_image, self.patches_per_image ) reshaped_vision_outputs = reshaped_vision_outputs.contiguous() pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs) pooled_vision_outputs = pooled_vision_outputs.flatten(2) pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2) normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs) projected_vision_outputs = torch.matmul(normed_vision_outputs, self.mm_input_projection_weight) return projected_vision_outputs.type_as(vision_outputs) class T5Gemma2TextScaledWordEmbedding(nn.Embedding): """T5Gemma2 Embedding: override to add eoi token embedding separately.""" def __init__( self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0, eoi_token_index: int = 256_000, ): super().__init__(num_embeddings, embedding_dim, padding_idx) self.scalar_embed_scale = embed_scale self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False) self.eoi_token_index = eoi_token_index self.eoi_embedding = nn.Parameter(torch.zeros(self.embedding_dim)) def forward(self, input_ids: torch.Tensor): input_embeddings = super().forward(input_ids) * self.embed_scale.to(self.weight.dtype) input_embeddings[input_ids == self.eoi_token_index] = self.eoi_embedding.to(input_embeddings.dtype) return input_embeddings @auto_docstring class T5Gemma2PreTrainedModel(PreTrainedModel): config: T5Gemma2Config base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = [ "T5Gemma2EncoderLayer", "T5Gemma2DecoderLayer", "SiglipVisionEmbeddings", "SiglipEncoderLayer", "SiglipMultiheadAttentionPoolingHead", ] _skip_keys_device_placement = ["past_key_values"] # Mask creation is incompatible # FA due to non-default creation / SWA _supports_flash_attn = False _supports_sdpa = True # Flex due to custom masks not compatible to be merged after creation _supports_flex_attn = False _can_compile_fullgraph = True _supports_attention_backend = True _can_record_outputs = { "hidden_states": [T5Gemma2EncoderLayer, T5Gemma2DecoderLayer], "attentions": [ OutputRecorder(T5Gemma2SelfAttention, index=1, layer_name="self_attn"), OutputRecorder(T5Gemma2MergedAttention, index=1, layer_name="self_attn"), OutputRecorder(T5Gemma2MergedAttention, index=2, layer_name="cross_attn"), ], } input_modalities = ("image", "text") @torch.no_grad() def _init_weights(self, module): super()._init_weights(module) if 
isinstance(module, T5Gemma2MultiModalProjector): init.zeros_(module.mm_input_projection_weight) elif isinstance(module, T5Gemma2TextScaledWordEmbedding): init.zeros_(module.eoi_embedding) init.constant_(module.embed_scale, module.scalar_embed_scale) elif isinstance(module, T5Gemma2ClassificationHead): scale = module.out_proj.weight.shape[0] ** -0.5 init.normal_(module.out_proj.weight, mean=0.0, std=self.config.initializer_range * scale) if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None: init.zeros_(module.out_proj.bias) # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight) elif "RMSNorm" in module.__class__.__name__: init.zeros_(module.weight) elif isinstance(module, T5Gemma2RotaryEmbedding): for layer_type in module.layer_types: rope_init_fn = module.compute_default_rope_parameters if module.rope_type[layer_type] != "default": rope_init_fn = ROPE_INIT_FUNCTIONS[module.rope_type[layer_type]] curr_inv_freq, _ = rope_init_fn(module.config, layer_type=layer_type) init.copy_(getattr(module, f"{layer_type}_inv_freq"), curr_inv_freq) init.copy_(getattr(module, f"{layer_type}_original_inv_freq"), curr_inv_freq) def prepare_decoder_input_ids_from_labels(self, input_ids): """ Shifts input_ids to the right, prepends the decoder_start_token_id, and handles pad_token_id replacement for labels that were -100. This is a common preparation step for decoder inputs in sequence-to-sequence models. """ decoder_config = self.config.decoder decoder_start_token_id = decoder_config.bos_token_id pad_token_id = decoder_config.pad_token_id if decoder_start_token_id is None: raise ValueError("self.model.config.decoder.bos_token_id has to be defined. ") # shift inputs to the right shifted_input_ids = input_ids.new_zeros(input_ids.shape) shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() shifted_input_ids[..., 0] = decoder_start_token_id if pad_token_id is None: raise ValueError("self.model.config.decoder.pad_token_id has to be defined.") # Is this T5 specific? # replace possible -100 values in labels by `pad_token_id` shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) return shifted_input_ids def sliding_window_mask_function(sliding_window: int, is_causal=True) -> Callable: """ This creates uni/bidirectional attention mask with sliding window. 
""" def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: if is_causal: left_window_size, right_window_size = sliding_window, 0 else: left_window_size, right_window_size = ((sliding_window + 1) // 2, (sliding_window) // 2 + 1) dist = q_idx - kv_idx left_mask = (dist >= 0) & (dist < left_window_size) right_mask = (dist < 0) & (-dist < right_window_size) return left_mask | right_mask return inner_mask class T5Gemma2TextEncoder(T5Gemma2PreTrainedModel): config: T5Gemma2TextConfig _can_record_outputs = { "attentions": T5Gemma2SelfAttention, "hidden_states": T5Gemma2EncoderLayer, } def __init__( self, config: T5Gemma2TextConfig, eoi_token_index: int = 256_000, ): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = T5Gemma2TextScaledWordEmbedding( config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=config.hidden_size**0.5, eoi_token_index=eoi_token_index, ) self.norm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False self.layers = nn.ModuleList( [T5Gemma2EncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.dropout = nn.Dropout(config.dropout_rate) self.rotary_emb = T5Gemma2RotaryEmbedding(config) # Initialize weights and apply final processing self.post_init() @check_model_inputs @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, # Unused for processor compatibility kept in signature. token_type_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") # As we want to pass `past_key_values=None` explicitly everywhere, we need to pop them from kwargs if present kwargs.pop("past_key_values", None) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) if position_ids is None: position_ids = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0) if not isinstance(self_attn_mask_mapping := attention_mask, dict): mask_kwargs = { "config": self.config, "input_embeds": inputs_embeds, "attention_mask": attention_mask, } self_attn_mask_mapping = { "full_attention": create_bidirectional_mask(**mask_kwargs), "sliding_attention": create_bidirectional_mask( **mask_kwargs, and_mask_function=sliding_window_mask_function(self.config.sliding_window, is_causal=False), ), } # input layer hidden_states = inputs_embeds # global and local position embeddings position_embeddings = {} for layer_type in self.config.layer_types: position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) # dropout hidden_states = self.dropout(hidden_states) for layer_module in self.layers[: self.config.num_hidden_layers]: hidden_states = layer_module( hidden_states, position_embeddings[layer_module.attention_type], self_attn_mask_mapping[layer_module.attention_type], position_ids, **kwargs, ) hidden_states = self.norm(hidden_states) hidden_states = self.dropout(hidden_states) return BaseModelOutput( last_hidden_state=hidden_states, ) class T5Gemma2Encoder(T5Gemma2PreTrainedModel): config: T5Gemma2EncoderConfig def __init__( self, config: T5Gemma2EncoderConfig, eoi_token_index: int = 256_000, ): super().__init__(config) self.text_model = 
T5Gemma2TextEncoder._from_config(config.text_config, eoi_token_index=eoi_token_index) self.vision_tower = AutoModel.from_config(config=config.vision_config) self.multi_modal_projector = T5Gemma2MultiModalProjector(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.text_model.get_input_embeddings() def set_input_embeddings(self, new_embeddings): return self.text_model.set_input_embeddings(new_embeddings) @can_return_tuple @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] ) -> tuple | BaseModelOutputWithPooling: # pixel_values: (batch_size, channels, height, width) # image_features: Image feature tensor of shape (num_images, image_length, embed_dim). vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state image_features = self.multi_modal_projector(last_hidden_state) vision_outputs.pooler_output = image_features return vision_outputs def get_image_placeholder_mask( self, input_ids: torch.LongTensor | None, inputs_embeds: torch.FloatTensor | None, image_features: torch.FloatTensor, ): """ Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is equal to the length of multimodal features. If the lengths are different, an error is raised. """ image_token_id = self.config.image_token_id if input_ids is None: if inputs_embeds is None: raise ValueError("Either `input_ids` or `inputs_embeds` has to be provided.") special_image_mask = inputs_embeds == self.get_input_embeddings()( torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device) ) special_image_mask = special_image_mask.all(-1) else: special_image_mask = input_ids == image_token_id n_image_tokens = special_image_mask.sum() special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) n_image_features = image_features.shape[0] * image_features.shape[1] torch_compilable_check( inputs_embeds[special_image_mask].numel() == image_features.numel(), f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}", ) return special_image_mask @check_model_inputs @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, pixel_values: torch.FloatTensor | None = None, # Unused for processor compatibility kept in signature. 
token_type_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.text_model.embed_tokens(input_ids) if pixel_values is not None: image_features = self.get_image_features(pixel_values, return_dict=True).pooler_output image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) image_mask = self.get_image_placeholder_mask( input_ids, inputs_embeds=inputs_embeds, image_features=image_features ) inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features) hidden_states = self.text_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, **kwargs, ) return BaseModelOutput( last_hidden_state=hidden_states, ) def bidirectional_mask_function(attention_mask: torch.Tensor | None) -> Callable: """ This creates bidirectional attention mask. """ def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: if attention_mask is None: return torch.ones((), dtype=torch.bool) return attention_mask[batch_idx, kv_idx].to(torch.bool) return inner_mask class T5Gemma2Decoder(T5Gemma2PreTrainedModel): config: T5Gemma2DecoderConfig _can_record_outputs = { "attentions": OutputRecorder(T5Gemma2MergedAttention, index=1), "cross_attentions": OutputRecorder(T5Gemma2MergedAttention, index=2), "hidden_states": T5Gemma2DecoderLayer, } def __init__(self, config: T5Gemma2DecoderConfig, eoi_token_index: int = 256_000): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = T5Gemma2TextScaledWordEmbedding( config.vocab_size, config.hidden_size, config.pad_token_id, embed_scale=config.hidden_size**0.5, eoi_token_index=eoi_token_index, ) self.norm = T5Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False self.layers = nn.ModuleList( [T5Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.dropout = nn.Dropout(config.dropout_rate) self.rotary_emb = T5Gemma2RotaryEmbedding(config) self.post_init() @check_model_inputs @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, past_key_values: EncoderDecoderCache | None = None, inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if encoder_hidden_states is None: raise ValueError("`encoder_hidden_states` must be given in decoder") if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) if not self.training and use_cache and past_key_values is None: past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache()) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) if position_ids is None: position_ids = 
cache_position.unsqueeze(0) if not isinstance(self_attn_mask_mapping := attention_mask, dict): mask_kwargs = { "config": self.config, "input_embeds": inputs_embeds, "attention_mask": attention_mask, "cache_position": cache_position, "past_key_values": past_key_values.self_attention_cache if past_key_values is not None else None, "position_ids": position_ids, } # this masking function did nothing to masking but forces `allow_is_causal_skip` to be False # as we always need a mask during decoding for merged attention. mask_kwargs["and_mask_function"] = lambda *args: torch.tensor(True, dtype=torch.bool) self_attn_mask_mapping = { "full_attention": create_causal_mask(**mask_kwargs), "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), } if not isinstance(cross_attn_mask_mapping := encoder_attention_mask, dict): mask_kwargs = { "config": self.config, "input_embeds": encoder_hidden_states, "attention_mask": encoder_attention_mask, "cache_position": cache_position, "past_key_values": None, "position_ids": None, } cross_attn_mask_mapping = { "full_attention": create_causal_mask( **mask_kwargs, or_mask_function=bidirectional_mask_function(encoder_attention_mask), ), } merged_attn_mask_mapping = { "full_attention": torch.cat( [self_attn_mask_mapping["full_attention"], cross_attn_mask_mapping["full_attention"]], dim=-1 ), "sliding_attention": torch.cat( [self_attn_mask_mapping["sliding_attention"], cross_attn_mask_mapping["full_attention"]], dim=-1 ), } # input layer hidden_states = inputs_embeds # global and local position embeddings position_embeddings = {} for layer_type in self.config.layer_types: position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) # dropout hidden_states = self.dropout(hidden_states) for layer_module in self.layers[: self.config.num_hidden_layers]: hidden_states = layer_module( hidden_states, position_embeddings[layer_module.attention_type], merged_attn_mask_mapping[layer_module.attention_type], position_ids, past_key_values, use_cache, cache_position, encoder_hidden_states, **kwargs, ) hidden_states = self.norm(hidden_states) hidden_states = self.dropout(hidden_states) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=past_key_values, ) @auto_docstring class T5Gemma2Model(T5Gemma2PreTrainedModel): _tied_weights_keys = { "decoder.embed_tokens.weight": "encoder.text_model.embed_tokens.weight", "decoder.embed_tokens.eoi_embedding": "encoder.text_model.embed_tokens.eoi_embedding", } def __init__(self, config: T5Gemma2Config): super().__init__(config) # setup encoder and decoder self.encoder = T5Gemma2Encoder(config.encoder, config.eoi_token_index) self.decoder = T5Gemma2Decoder(config.decoder, config.eoi_token_index) self.post_init() def get_encoder(self): return self.encoder def get_decoder(self): return self.decoder def get_input_embeddings(self): return self.encoder.get_input_embeddings() def set_input_embeddings(self, new_embeddings): return self.encoder.set_input_embeddings(new_embeddings) @can_return_tuple @auto_docstring def forward( self, # encoder inputs input_ids: torch.LongTensor | None = None, pixel_values: torch.FloatTensor | None = None, attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, # decoder inputs decoder_input_ids: torch.LongTensor | None = None, decoder_attention_mask: torch.BoolTensor | None = None, decoder_position_ids: torch.LongTensor | None = None, # others (mainly inference or cache related) 
encoder_outputs: BaseModelOutput | None = None, past_key_values: EncoderDecoderCache | None = None, inputs_embeds: torch.Tensor | None = None, decoder_inputs_embeds: torch.Tensor | None = None, use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> Seq2SeqModelOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) """ # encoder if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, inputs_embeds=inputs_embeds, pixel_values=pixel_values, return_dict=True, **kwargs, ) encoder_hidden_states = encoder_outputs.last_hidden_state # decoder decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, position_ids=decoder_position_ids, inputs_embeds=decoder_inputs_embeds, past_key_values=past_key_values, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=attention_mask, use_cache=use_cache, cache_position=cache_position, return_dict=True, **kwargs, ) return Seq2SeqModelOutput( last_hidden_state=decoder_outputs.last_hidden_state, past_key_values=decoder_outputs.past_key_values, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, cross_attentions=decoder_outputs.cross_attentions, encoder_last_hidden_state=encoder_outputs.last_hidden_state, encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) class T5Gemma2ForConditionalGeneration(T5Gemma2PreTrainedModel, GenerationMixin): _tied_weights_keys = { "lm_head.out_proj.weight": "model.encoder.text_model.embed_tokens.weight", } _tp_plan = {"lm_head.out_proj": "colwise_gather_output"} _pp_plan = {"lm_head.out_proj": (["hidden_states"], ["logits"])} def __init__(self, config: T5Gemma2Config): super().__init__(config) self.model = T5Gemma2Model(config) self.vocab_size = config.decoder.vocab_size self.lm_head = T5Gemma2LMHead(config.decoder.hidden_size, self.vocab_size) self.loss_type = "ForMaskedLM" self.post_init() def set_output_embeddings(self, new_embeddings): self.lm_head.out_proj = new_embeddings def get_output_embeddings(self): return self.lm_head.out_proj def get_input_embeddings(self): return self.model.get_input_embeddings() def set_input_embeddings(self, value): self.model.set_input_embeddings(value) def get_encoder(self): return self.model.get_encoder() def get_decoder(self): return self.model.get_decoder() @can_return_tuple @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] ) -> tuple | BaseModelOutputWithPooling: return self.get_encoder().get_image_features(pixel_values, **kwargs) @property def vision_tower(self): return self.get_encoder().vision_tower @can_return_tuple @auto_docstring def forward( self, # encoder inputs input_ids: torch.LongTensor | None = None, pixel_values: torch.FloatTensor | None = None, attention_mask: torch.FloatTensor | None = None, position_ids: torch.LongTensor | None = None, # decoder inputs decoder_input_ids: torch.LongTensor | None = None, decoder_attention_mask: torch.BoolTensor | None = None, decoder_position_ids: torch.LongTensor | None = None, # others (mainly inference or cache related) 
encoder_outputs: BaseModelOutput | None = None, past_key_values: EncoderDecoderCache | None = None, inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. """ if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right decoder_input_ids = self.prepare_decoder_input_ids_from_labels(labels) decoder_outputs: Seq2SeqModelOutput = self.model( input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, decoder_position_ids=decoder_position_ids, encoder_outputs=encoder_outputs, past_key_values=past_key_values, inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, use_cache=use_cache, cache_position=cache_position, **kwargs, ) hidden_states = decoder_outputs.last_hidden_state # Only compute necessary logits, and do not upcast them to float if we are not computing the loss slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) decoder_config = self.config.decoder if decoder_config.final_logit_softcapping is not None: logits = logits / decoder_config.final_logit_softcapping logits = torch.tanh(logits) logits = logits * decoder_config.final_logit_softcapping loss = None if labels is not None: # Input has right-shifted so we directly perform masked lm loss loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) return Seq2SeqLMOutput( loss=loss, logits=logits, past_key_values=decoder_outputs.past_key_values, decoder_hidden_states=decoder_outputs.decoder_hidden_states, decoder_attentions=decoder_outputs.decoder_attentions, cross_attentions=decoder_outputs.cross_attentions, encoder_last_hidden_state=decoder_outputs.encoder_last_hidden_state, encoder_hidden_states=decoder_outputs.encoder_hidden_states, encoder_attentions=decoder_outputs.encoder_attentions, ) def _prepare_cache_for_generation( self, generation_config: GenerationConfig, model_kwargs: dict, generation_mode: GenerationMode, batch_size: int, max_cache_length: int, ) -> bool: """Override cache preparation to support T5Gemma2-specific EncoderDecoder Cache.""" # Build cache and past_key_values structure first and then override as needed. super()._prepare_cache_for_generation( generation_config, model_kwargs, generation_mode, batch_size, max_cache_length, ) # If use_cache is False, do not prepare the cache. 
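        # (That case is handled by the early return just below.) Otherwise, T5Gemma2 relies on an
        # `EncoderDecoderCache`: a self-attention cache that grows with the generated sequence, plus a
        # cross-attention cache that is written once from the encoder output and afterwards only read.
        # Because the encoder keys/values have a fixed length, the cross-attention part is rebuilt below as
        # a full (non-sliding-window) cache.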
if generation_config.use_cache is False: return cache_implementation = generation_config.cache_implementation if cache_implementation is None: offload_cache = False else: offload_cache = "offloaded" in generation_config.cache_implementation # Main change: use full cache for cross-attention. cross_attn_config = copy.deepcopy(self.config.get_text_config(decoder=True)) # cross-attention does not use sliding window del cross_attn_config.sliding_window del cross_attn_config.layer_types cross_attn_cache_kwargs = { "config": cross_attn_config, "offloading": offload_cache, } past_key_values = model_kwargs.get("past_key_values") if past_key_values is not None: if not isinstance(past_key_values, EncoderDecoderCache): raise ValueError( "The `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma2 model." ) # Cache already established, no need to re-initialize. if len(past_key_values.is_updated) > 0 and past_key_values.is_updated.get(0): return cross_attn_cls = type(past_key_values.cross_attention_cache) if cross_attn_cls == StaticCache: cross_attn_cache_kwargs["max_cache_len"] = model_kwargs["encoder_outputs"][0].shape[1] # Update cross-attention cache only (switch from sliding_window to full). past_key_values.cross_attention_cache = cross_attn_cls(**cross_attn_cache_kwargs) else: # Initialize new cache. model_kwargs["past_key_values"] = EncoderDecoderCache( DynamicCache( **{ "config": self.config.get_text_config(decoder=True), "offloading": offload_cache, } ), # self-attention cache DynamicCache(), # cross-attention cache ) if hasattr(self, "_cache") and self._cache is not None: if not isinstance(self._cache, EncoderDecoderCache): raise ValueError("The internal cache must be of type `EncoderDecoderCache` for T5Gemma2 model.") self._cache = model_kwargs["past_key_values"] @auto_docstring class T5Gemma2ForSequenceClassification(T5Gemma2PreTrainedModel): def __init__(self, config: T5Gemma2Config): super().__init__(config) self.num_labels = config.num_labels self.hidden_size = config.decoder.hidden_size self.model = T5Gemma2Model(config) classifier_dropout = getattr(config, "classifier_dropout_rate", 0.1) self.score = T5Gemma2ClassificationHead(self.hidden_size, self.num_labels, classifier_dropout) self.post_init() def get_input_embeddings(self): return self.model.get_input_embeddings() def set_input_embeddings(self, value): self.model.set_input_embeddings(value) @can_return_tuple @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, pixel_values: torch.FloatTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, decoder_input_ids: torch.LongTensor | None = None, decoder_attention_mask: torch.Tensor | None = None, decoder_position_ids: torch.LongTensor | None = None, encoder_outputs: BaseModelOutput | None = None, inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> SequenceClassifierOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ if inputs_embeds is not None or decoder_inputs_embeds is not None: raise NotImplementedError( f"Passing input embeddings is currently not supported for {self.__class__.__name__}." ) if input_ids is None: raise ValueError("You have to specify input_ids") if decoder_input_ids is None: decoder_input_ids = self.prepare_decoder_input_ids_from_labels(input_ids) outputs: Seq2SeqModelOutput = self.model( input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, decoder_position_ids=decoder_position_ids, encoder_outputs=encoder_outputs, inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, use_cache=False, **kwargs, ) last_hidden_state = outputs.last_hidden_state hidden_states = outputs.decoder_hidden_states attentions = outputs.decoder_attentions logits = self.score(last_hidden_state) batch_size = input_ids.shape[0] # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id non_pad_mask = (decoder_input_ids != self.config.pad_token_id).to(logits.device, torch.int32) token_indices = torch.arange(decoder_input_ids.shape[-1], device=logits.device, dtype=torch.int32) last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) last_non_pad_token = torch.clamp(last_non_pad_token, max=decoder_input_ids.shape[-1] - 1) pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] loss = None if labels is not None: loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) return SequenceClassifierOutput( loss=loss, logits=pooled_logits, hidden_states=hidden_states, attentions=attentions, ) @auto_docstring class T5Gemma2ForTokenClassification(T5Gemma2PreTrainedModel): def __init__(self, config: T5Gemma2Config): super().__init__(config) self.num_labels = config.num_labels self.hidden_size = config.decoder.hidden_size self.model = T5Gemma2Model(config) classifier_dropout = getattr(config, "classifier_dropout_rate", 0.1) self.score = T5Gemma2ClassificationHead(self.hidden_size, self.num_labels, classifier_dropout) self.post_init() def get_input_embeddings(self): return self.model.get_input_embeddings() def set_input_embeddings(self, value): self.model.set_input_embeddings(value) @can_return_tuple @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, pixel_values: torch.FloatTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, decoder_input_ids: torch.LongTensor | None = None, decoder_attention_mask: torch.Tensor | None = None, decoder_position_ids: torch.LongTensor | None = None, encoder_outputs: BaseModelOutput | None = None, inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> TokenClassifierOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.n_positions - 1]`. 
[What are position IDs?](../glossary#position-ids) labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ if inputs_embeds is not None or decoder_inputs_embeds is not None: raise NotImplementedError( f"Passing input embeddings is currently not supported for {self.__class__.__name__}." ) if input_ids is None: raise ValueError("You have to specify input_ids") if decoder_input_ids is None: decoder_input_ids = self.prepare_decoder_input_ids_from_labels(input_ids) outputs: Seq2SeqModelOutput = self.model( input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, decoder_position_ids=decoder_position_ids, encoder_outputs=encoder_outputs, inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, use_cache=False, **kwargs, ) last_hidden_state = outputs.last_hidden_state hidden_states = outputs.decoder_hidden_states attentions = outputs.decoder_attentions logits = self.score(last_hidden_state) loss = None if labels is not None: loss = self.loss_function(logits, labels, self.config) return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=hidden_states, attentions=attentions, ) __all__ = [ "T5Gemma2ForConditionalGeneration", "T5Gemma2Model", "T5Gemma2Encoder", "T5Gemma2PreTrainedModel", "T5Gemma2ForSequenceClassification", "T5Gemma2ForTokenClassification", ]
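# ---------------------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the generated model code). The checkpoint name below is a
# placeholder, and the example assumes the model ships with a processor that exposes a tokenizer;
# substitute a real T5Gemma2 checkpoint and processor before running.
#
#   from transformers import AutoProcessor, T5Gemma2ForConditionalGeneration
#
#   model_id = "google/t5gemma2-placeholder"  # hypothetical checkpoint name
#   processor = AutoProcessor.from_pretrained(model_id)
#   model = T5Gemma2ForConditionalGeneration.from_pretrained(model_id)
#
#   # Generation: the encoder consumes the (optionally multimodal) input, the decoder generates the target.
#   inputs = processor(text="Translate to German: The house is wonderful.", return_tensors="pt")
#   generated = model.generate(**inputs, max_new_tokens=32)
#   print(processor.batch_decode(generated, skip_special_tokens=True))
#
#   # Training-style call: `labels` are shifted right internally into `decoder_input_ids`.
#   labels = processor.tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids
#   loss = model(**inputs, labels=labels).loss
# ---------------------------------------------------------------------------------------------------------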