# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # This file was automatically generated from src/transformers/models/internvl/modular_internvl.py. # Do NOT edit this file manually as any edits will be overwritten by the generation of # the file from the modular. If any change should be done, please apply the change to the # modular_internvl.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # Copyright 2025 HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import collections.abc from collections.abc import Callable from dataclasses import dataclass import torch import torch.nn as nn from ... import initialization as init from ...activations import ACT2FN from ...cache_utils import Cache from ...generation import GenerationMixin from ...integrations import use_kernel_forward_from_hub from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ModelOutput, TransformersKwargs, auto_docstring, torch_compilable_check, torch_int from ...utils.generic import check_model_inputs from ..auto import AutoModel from .configuration_internvl import InternVLConfig, InternVLVisionConfig @use_kernel_forward_from_hub("RMSNorm") class InternVLVisionRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ InternVLVisionRMSNorm is equivalent to T5LayerNorm """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps def forward(self, hidden_states): input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) variance = hidden_states.pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" def eager_attention_forward( module: nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, dropout: float = 0.0, **kwargs, ): key_states = key value_states = value attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling if attention_mask is not None: causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # No upcasting of the attention weights to float32 in this implementation attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() return attn_output, attn_weights class InternVLVisionAttention(nn.Module): """Attention Class for InternVL Vision Encoder""" def __init__(self, config: InternVLVisionConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads})." ) self.scale = self.head_dim**-0.5 self.attention_dropout = config.attention_dropout proj_dropout = config.projection_dropout qk_norm = config.use_qk_norm # Needed for flash attention self.is_causal = False self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias) self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias) self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim) self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity() self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity() self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity() def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ): batch_size, seq_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = self.q_norm(query_states) key_states = self.k_norm(key_states) query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward ) attn_output, attn_weights = attention_interface( self, query_states, key_states, value_states, attention_mask, dropout=0.0 if not self.training else self.attention_dropout, scaling=self.scale, is_causal=False, **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim) output = self.projection_layer(attn_output) output = self.projection_dropout(output) return output, attn_weights @dataclass @auto_docstring( custom_intro=""" Class for outputs of [`InternVLVisionModel`]. """ ) class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling): r""" pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token will be returned. """ class InternVLVisionPatchEmbeddings(nn.Module): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a Transformer. """ def __init__(self, config): super().__init__() image_size, patch_size = config.image_size, config.patch_size num_channels, hidden_size = config.num_channels, config.hidden_size num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels self.num_patches = num_patches self.patch_shape = patch_shape self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) embeddings = self.projection(pixel_values.to(self.projection.weight.dtype)) embeddings = embeddings.flatten(2).transpose(1, 2) return embeddings # Based on timm implementation, which can be found here: # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py class InternVLVisionEmbeddings(nn.Module): """ Construct the CLS token, position and patch embeddings. Optionally, also the mask token. """ def __init__(self, config: InternVLVisionConfig) -> None: super().__init__() self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if config.use_mask_token: self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) else: self.mask_token = None self.patch_embeddings = InternVLVisionPatchEmbeddings(config) self.patch_size = config.patch_size self.image_size = ( config.image_size if isinstance(config.image_size, collections.abc.Iterable) else (config.image_size, config.image_size) ) num_patches = self.patch_embeddings.num_patches if config.use_absolute_position_embeddings: self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) else: self.position_embeddings = None self.dropout = nn.Dropout(config.hidden_dropout_prob) def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution images. This method is also adapted to support torch.jit tracing. Adapted from: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ num_patches = embeddings.shape[1] - 1 num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: return self.position_embeddings class_pos_embed = self.position_embeddings[:, :1] patch_pos_embed = self.position_embeddings[:, 1:] dim = embeddings.shape[-1] new_height = height // self.patch_size[0] new_width = width // self.patch_size[1] sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) patch_pos_embed = nn.functional.interpolate( patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False, ) patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, ) -> torch.Tensor: _, _, height, width = pixel_values.shape embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() if bool_masked_pos is not None: mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) # replace the masked visual tokens by mask_tokens w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) embeddings = embeddings * (1 - w) + mask_tokens * w cls_tokens = self.cls_token.expand(batch_size, -1, -1) embeddings = torch.cat((cls_tokens, embeddings), dim=1) if self.position_embeddings is not None: embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) embeddings = self.dropout(embeddings) return embeddings class InternVLVisionMLP(nn.Module): def __init__(self, config): super().__init__() self.config = config self.activation_fn = ACT2FN[config.hidden_act] self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.fc1(hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(hidden_states) return hidden_states NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm} class InternVLVisionLayer(GradientCheckpointingLayer): """This corresponds to the Block class in the timm implementation.""" def __init__(self, config: InternVLVisionConfig) -> None: super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 self.attention = InternVLVisionAttention(config) self.mlp = InternVLVisionMLP(config) # InternVL uses different layernorm implementations for different models self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps) self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps) init_values = config.layer_scale_init_value self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True) self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward( self, hidden_states: torch.Tensor, ) -> tuple[torch.Tensor] | tuple[torch.Tensor, torch.Tensor]: attention_output, _ = self.attention( self.layernorm_before(hidden_states), # in InternVLVision, layernorm is applied before self-attention ) attention_output = self.lambda_1 * attention_output # first residual connection hidden_states = attention_output + hidden_states # in InternVLVision, layernorm is also applied after self-attention layer_output = self.layernorm_after(hidden_states) layer_output = self.mlp(layer_output) layer_output = self.dropout(layer_output) if self.lambda_2 is not None: layer_output = self.lambda_2 * layer_output # second residual connection layer_output = layer_output + hidden_states return layer_output class InternVLVisionEncoder(nn.Module): def __init__(self, config: InternVLVisionConfig) -> None: super().__init__() self.config = config self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( self, hidden_states: torch.Tensor, ) -> tuple | BaseModelOutput: for layer_module in self.layer: hidden_states = layer_module(hidden_states) return BaseModelOutput( last_hidden_state=hidden_states, ) @auto_docstring class InternVLVisionPreTrainedModel(PreTrainedModel): config: InternVLVisionConfig base_model_prefix = "internvl_vision" main_input_name = "pixel_values" input_modalities = ("image", "video") supports_gradient_checkpointing = True _no_split_modules = ["InternVLVisionLayer"] _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True _supports_attention_backend = True _can_record_outputs = { "hidden_states": InternVLVisionLayer, "attentions": InternVLVisionAttention, } @torch.no_grad() def _init_weights(self, module): """Initialize the weights""" super()._init_weights(module) if isinstance(module, InternVLVisionEmbeddings): init.zeros_(module.cls_token) if module.mask_token is not None: init.zeros_(module.mask_token) if module.position_embeddings is not None: init.zeros_(module.position_embeddings) elif isinstance(module, InternVLVisionLayer): init.constant_(module.lambda_1, self.config.layer_scale_init_value) init.constant_(module.lambda_2, self.config.layer_scale_init_value) @auto_docstring class InternVLVisionModel(InternVLVisionPreTrainedModel): def __init__(self, config: InternVLVisionConfig) -> None: super().__init__(config) self.config = config self.embeddings = InternVLVisionEmbeddings(config) self.encoder = InternVLVisionEncoder(config) self.layernorm = ( nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) ) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.embeddings.patch_embeddings @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs ) -> tuple | InternVLVisionModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). """ embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) encoder_outputs = self.encoder(embedding_output) sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) return InternVLVisionModelOutputWithPooling( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @auto_docstring class InternVLPreTrainedModel(PreTrainedModel): config: InternVLConfig base_model_prefix = "model" input_modalities = ("image", "text", "video") supports_gradient_checkpointing = True _skip_keys_device_placement = "past_key_values" _supports_flash_attn = True _supports_sdpa = True _can_compile_fullgraph = True _supports_flex_attn = True _supports_attention_backend = True class InternVLMultiModalProjector(nn.Module): def __init__(self, config: InternVLConfig): super().__init__() self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2) self.linear_1 = nn.Linear( config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size ) self.act = ACT2FN[config.projector_hidden_act] self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size) def forward(self, image_features): hidden_states = self.layer_norm(image_features) hidden_states = self.linear_1(hidden_states) hidden_states = self.act(hidden_states) hidden_states = self.linear_2(hidden_states) return hidden_states @dataclass @auto_docstring( custom_intro=""" Base class for InternVL outputs, with hidden states and attentions. """ ) class InternVLModelOutputWithPast(BaseModelOutputWithPast): r""" past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. image_hidden_states (`torch.FloatTensor`, *optional*): A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ image_hidden_states: torch.FloatTensor | None = None @auto_docstring( custom_intro=""" The InternVL model which consists of a vision backbone and a language model, without a language modeling head. """ ) class InternVLModel(InternVLPreTrainedModel): _checkpoint_conversion_mapping = { r"^language_model.model": "language_model", } def __init__(self, config: InternVLConfig): super().__init__(config) self.vision_tower = AutoModel.from_config(config.vision_config) self.multi_modal_projector = InternVLMultiModalProjector(config) self.language_model = AutoModel.from_config(config.text_config) self.post_init() def get_input_embeddings(self): return self.language_model.get_input_embeddings() def set_input_embeddings(self, value): self.language_model.set_input_embeddings(value) @check_model_inputs(tie_last_hidden_states=False) @auto_docstring( custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection." ) def get_image_features( self, pixel_values: torch.FloatTensor, vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. vision_feature_layer (`int` or `list[int]`): Layer index or list of layer indices to extract features from. """ pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility downsample_ratio = self.config.downsample_ratio if vision_feature_layer != -1: kwargs["output_hidden_states"] = True vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) if vision_feature_layer == -1: vision_features = vision_outputs.last_hidden_state else: vision_features = vision_outputs.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": vision_features = vision_features[:, 1:, :] # Calculate dimensions based on vision features channels = vision_features.shape[1] feature_size = int(channels**0.5) batch_size = vision_features.shape[0] # Reshape tensor to spatial dimensions vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1) # Apply downsampling using pixel shuffle vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio) # Reshape tensor to prepare for projection vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1]) # Project features through multi-modal projector vision_features = self.multi_modal_projector(vision_features) vision_outputs.pooler_output = vision_features return vision_outputs def get_placeholder_mask( self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor ): """ Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is equal to the length of multimodal features. If the lengths are different, an error is raised. """ if input_ids is None: special_image_mask = inputs_embeds == self.get_input_embeddings()( torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) ) special_image_mask = special_image_mask.all(-1) else: special_image_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) torch_compilable_check( inputs_embeds[special_image_mask].numel() == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, pixel_values: torch.FloatTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | InternVLModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) if pixel_values is not None: image_features = self.get_image_features( pixel_values=pixel_values, vision_feature_layer=vision_feature_layer, vision_feature_select_strategy=vision_feature_select_strategy, return_dict=True, ).pooler_output image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) special_image_mask = self.get_placeholder_mask( input_ids, inputs_embeds=inputs_embeds, image_features=image_features ) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, cache_position=cache_position, **kwargs, ) return InternVLModelOutputWithPast( last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, image_hidden_states=image_features if pixel_values is not None else None, ) def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5): """Perform pixel shuffle downsampling on vision features. Args: vision_features (`torch.Tensor`): Input tensor of shape (batch_size, width, height, channels). scale_factor (`float`, *optional*, defaults to `0.5`): Factor by which to downsample. Default is 0.5, which halves the dimensions. Returns: vision_features (`torch.Tensor`): Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)). """ batch_size, width, height, channels = vision_features.size() if height % scale_factor != 0 or width % scale_factor != 0: raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.") # Reshape to allow downsampling vision_features = vision_features.view( batch_size, width, int(height * scale_factor), int(channels / scale_factor) ) # Permute dimensions to align downsampled axis correctly vision_features = vision_features.permute(0, 2, 1, 3).contiguous() # Reshape to achieve final downsampled dimensions vision_features = vision_features.view( batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2)) ) # Swap height and width back for proper orientation vision_features = vision_features.permute(0, 2, 1, 3).contiguous() return vision_features @dataclass @auto_docstring( custom_intro=""" Base class for InternVL causal language model (or autoregressive) outputs. """ ) class InternVLCausalLMOutputWithPast(ModelOutput): r""" loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Language modeling loss (for next-token prediction). logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. image_hidden_states (`torch.FloatTensor`, *optional*): A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. """ loss: torch.FloatTensor | None = None logits: torch.FloatTensor | None = None past_key_values: Cache | None = None hidden_states: tuple[torch.FloatTensor] | None = None attentions: tuple[torch.FloatTensor] | None = None image_hidden_states: torch.FloatTensor | None = None @auto_docstring( custom_intro=""" The INTERNVL model which consists of a vision backbone and a language model. """ ) class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin): _checkpoint_conversion_mapping = { r"^language_model.model": "model.language_model", r"^vision_tower": "model.vision_tower", r"^multi_modal_projector": "model.multi_modal_projector", r"^language_model.lm_head": "lm_head", } _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"} def __init__(self, config: InternVLConfig): super().__init__(config) self.model = InternVLModel(config) self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) self.post_init() def get_input_embeddings(self): return self.model.get_input_embeddings() def set_input_embeddings(self, value): self.model.set_input_embeddings(value) def get_output_embeddings(self) -> nn.Module: return self.lm_head @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: return self.model.get_image_features( pixel_values=pixel_values, vision_feature_layer=vision_feature_layer, vision_feature_select_strategy=vision_feature_select_strategy, **kwargs, ) @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, pixel_values: torch.FloatTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, labels: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | InternVLCausalLMOutputWithPast: r""" Example: ```python >>> import torch >>> from transformers import AutoProcessor, AutoModelForImageTextToText >>> torch_device = "cuda" >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf") >>> model = AutoModelForImageTextToText.from_pretrained( ... "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device ... ) >>> messages = [ ... { ... "role": "user", ... "content": [ ... { ... "type": "image", ... "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", ... }, ... { ... "type": "image", ... "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg", ... }, ... {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"}, ... ], ... }, ... ] >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device) >>> generate_ids = model.generate(**inputs, max_new_tokens=200) >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)) The images depict the Statue of Liberty and the Golden Gate Bridge. ```""" outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, vision_feature_layer=vision_feature_layer, vision_feature_select_strategy=vision_feature_select_strategy, cache_position=cache_position, image_sizes=image_sizes, **kwargs, ) hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep logits = self.lm_head(hidden_states[:, slice_indices, :]) loss = None if labels is not None: loss = self.loss_function( logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs ) return InternVLCausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, image_hidden_states=outputs.image_hidden_states, ) def prepare_inputs_for_generation( self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, cache_position=None, logits_to_keep=None, is_first_iteration=False, **kwargs, ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model model_inputs = super().prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, attention_mask=attention_mask, cache_position=cache_position, logits_to_keep=logits_to_keep, is_first_iteration=is_first_iteration, **kwargs, ) if is_first_iteration or not kwargs.get("use_cache", True): # Pixel values are used only in the first iteration if available # In subsquent iterations, they are already merged with text and cached # NOTE: first iteration doesn't have to be prefill, it can be the first # iteration with a question and cached system prompt (continue generate from cache) model_inputs["pixel_values"] = pixel_values return model_inputs __all__ = [ "InternVLVisionPreTrainedModel", "InternVLVisionModel", "InternVLPreTrainedModel", "InternVLModel", "InternVLForConditionalGeneration", ]