You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

640 lines
25 KiB

# Copyright 2025 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections.abc
from collections.abc import Callable
from dataclasses import dataclass
import torch
import torch.nn as nn
from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, torch_int
from ...utils.generic import check_model_inputs
from ..clip.modeling_clip import CLIPMLP
from ..janus.modeling_janus import JanusVisionAttention
from ..llama.modeling_llama import LlamaRMSNorm
from ..llava.modeling_llava import (
LlavaCausalLMOutputWithPast,
LlavaForConditionalGeneration,
LlavaModel,
LlavaModelOutputWithPast,
LlavaPreTrainedModel,
)
from .configuration_internvl import InternVLConfig, InternVLVisionConfig
def eager_attention_forward(
module: nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: torch.Tensor | None,
scaling: float,
dropout: float = 0.0,
**kwargs,
):
key_states = key
value_states = value
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
if attention_mask is not None:
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# No upcasting of the attention weights to float32 in this implementation
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
attn_output = torch.matmul(attn_weights, value_states)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output, attn_weights
class InternVLVisionRMSNorm(LlamaRMSNorm):
pass
class InternVLVisionAttention(JanusVisionAttention):
def __init__(self, config: InternVLVisionConfig):
super().__init__(config)
del self.num_key_value_groups
# Needed for flash attention
self.is_causal = False
qk_norm = config.use_qk_norm
self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor | None = None,
**kwargs: Unpack[TransformersKwargs],
):
batch_size, seq_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = self.q_norm(query_states)
key_states = self.k_norm(key_states)
query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
self.config._attn_implementation, eager_attention_forward
)
attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
attention_mask,
dropout=0.0 if not self.training else self.attention_dropout,
scaling=self.scale,
is_causal=False,
**kwargs,
)
attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)
output = self.projection_layer(attn_output)
output = self.projection_dropout(output)
return output, attn_weights
@dataclass
@auto_docstring(
custom_intro="""
Class for outputs of [`InternVLVisionModel`].
"""
)
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
*config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
will be returned.
"""
class InternVLVisionPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.hidden_size
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.patch_shape = patch_shape
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
batch_size, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
embeddings = embeddings.flatten(2).transpose(1, 2)
return embeddings
# Based on timm implementation, which can be found here:
# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
class InternVLVisionEmbeddings(nn.Module):
"""
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
"""
def __init__(self, config: InternVLVisionConfig) -> None:
super().__init__()
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
if config.use_mask_token:
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
else:
self.mask_token = None
self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
self.patch_size = config.patch_size
self.image_size = (
config.image_size
if isinstance(config.image_size, collections.abc.Iterable)
else (config.image_size, config.image_size)
)
num_patches = self.patch_embeddings.num_patches
if config.use_absolute_position_embeddings:
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
else:
self.position_embeddings = None
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
"""
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.
Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
"""
num_patches = embeddings.shape[1] - 1
num_positions = self.position_embeddings.shape[1] - 1
# always interpolate when tracing to ensure the exported model works for dynamic input shapes
if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
return self.position_embeddings
class_pos_embed = self.position_embeddings[:, :1]
patch_pos_embed = self.position_embeddings[:, 1:]
dim = embeddings.shape[-1]
new_height = height // self.patch_size[0]
new_width = width // self.patch_size[1]
sqrt_num_positions = torch_int(num_positions**0.5)
patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed,
size=(new_height, new_width),
mode="bicubic",
align_corners=False,
)
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
def forward(
self,
pixel_values: torch.Tensor,
bool_masked_pos: torch.BoolTensor | None = None,
) -> torch.Tensor:
_, _, height, width = pixel_values.shape
embeddings = self.patch_embeddings(pixel_values)
batch_size, seq_len, _ = embeddings.size()
if bool_masked_pos is not None:
mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
# replace the masked visual tokens by mask_tokens
w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
embeddings = embeddings * (1 - w) + mask_tokens * w
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
if self.position_embeddings is not None:
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
embeddings = self.dropout(embeddings)
return embeddings
class InternVLVisionMLP(CLIPMLP):
pass
NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}
class InternVLVisionLayer(GradientCheckpointingLayer):
"""This corresponds to the Block class in the timm implementation."""
def __init__(self, config: InternVLVisionConfig) -> None:
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = InternVLVisionAttention(config)
self.mlp = InternVLVisionMLP(config)
# InternVL uses different layernorm implementations for different models
self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
init_values = config.layer_scale_init_value
self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self,
hidden_states: torch.Tensor,
) -> tuple[torch.Tensor] | tuple[torch.Tensor, torch.Tensor]:
attention_output, _ = self.attention(
self.layernorm_before(hidden_states), # in InternVLVision, layernorm is applied before self-attention
)
attention_output = self.lambda_1 * attention_output
# first residual connection
hidden_states = attention_output + hidden_states
# in InternVLVision, layernorm is also applied after self-attention
layer_output = self.layernorm_after(hidden_states)
layer_output = self.mlp(layer_output)
layer_output = self.dropout(layer_output)
if self.lambda_2 is not None:
layer_output = self.lambda_2 * layer_output
# second residual connection
layer_output = layer_output + hidden_states
return layer_output
class InternVLVisionEncoder(nn.Module):
def __init__(self, config: InternVLVisionConfig) -> None:
super().__init__()
self.config = config
self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
) -> tuple | BaseModelOutput:
for layer_module in self.layer:
hidden_states = layer_module(hidden_states)
return BaseModelOutput(
last_hidden_state=hidden_states,
)
@auto_docstring
class InternVLVisionPreTrainedModel(PreTrainedModel):
config: InternVLVisionConfig
base_model_prefix = "internvl_vision"
main_input_name = "pixel_values"
input_modalities = ("image", "video")
supports_gradient_checkpointing = True
_no_split_modules = ["InternVLVisionLayer"]
_supports_sdpa = True
_supports_flash_attn = True
_supports_flex_attn = True
_supports_attention_backend = True
_can_record_outputs = {
"hidden_states": InternVLVisionLayer,
"attentions": InternVLVisionAttention,
}
@torch.no_grad()
def _init_weights(self, module):
"""Initialize the weights"""
super()._init_weights(module)
if isinstance(module, InternVLVisionEmbeddings):
init.zeros_(module.cls_token)
if module.mask_token is not None:
init.zeros_(module.mask_token)
if module.position_embeddings is not None:
init.zeros_(module.position_embeddings)
elif isinstance(module, InternVLVisionLayer):
init.constant_(module.lambda_1, self.config.layer_scale_init_value)
init.constant_(module.lambda_2, self.config.layer_scale_init_value)
@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
def __init__(self, config: InternVLVisionConfig) -> None:
super().__init__(config)
self.config = config
self.embeddings = InternVLVisionEmbeddings(config)
self.encoder = InternVLVisionEncoder(config)
self.layernorm = (
nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs
) -> tuple | InternVLVisionModelOutputWithPooling:
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
encoder_outputs = self.encoder(embedding_output)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
return InternVLVisionModelOutputWithPooling(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
class InternVLPreTrainedModel(LlavaPreTrainedModel):
input_modalities = ("image", "text", "video")
INTERNVL_INPUTS_DOCSTRING = None
class InternVLMultiModalProjector(nn.Module):
def __init__(self, config: InternVLConfig):
super().__init__()
self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
self.linear_1 = nn.Linear(
config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
)
self.act = ACT2FN[config.projector_hidden_act]
self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)
def forward(self, image_features):
hidden_states = self.layer_norm(image_features)
hidden_states = self.linear_1(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.linear_2(hidden_states)
return hidden_states
class InternVLModelOutputWithPast(LlavaModelOutputWithPast):
pass
class InternVLModel(LlavaModel):
def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
"""Perform pixel shuffle downsampling on vision features.
Args:
vision_features (`torch.Tensor`):
Input tensor of shape (batch_size, width, height, channels).
scale_factor (`float`, *optional*, defaults to `0.5`):
Factor by which to downsample. Default is 0.5, which halves the dimensions.
Returns:
vision_features (`torch.Tensor`):
Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
"""
batch_size, width, height, channels = vision_features.size()
if height % scale_factor != 0 or width % scale_factor != 0:
raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")
# Reshape to allow downsampling
vision_features = vision_features.view(
batch_size, width, int(height * scale_factor), int(channels / scale_factor)
)
# Permute dimensions to align downsampled axis correctly
vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
# Reshape to achieve final downsampled dimensions
vision_features = vision_features.view(
batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
)
# Swap height and width back for proper orientation
vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
return vision_features
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring(
custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection."
)
def get_image_features(
self,
pixel_values: torch.FloatTensor,
vision_feature_layer: int | list[int] | None = None,
vision_feature_select_strategy: str | None = None,
**kwargs: Unpack[TransformersKwargs],
) -> tuple | BaseModelOutputWithPooling:
r"""
pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
The tensors corresponding to the input images.
vision_feature_layer (`int` or `list[int]`):
Layer index or list of layer indices to extract features from.
"""
pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility
downsample_ratio = self.config.downsample_ratio
if vision_feature_layer != -1:
kwargs["output_hidden_states"] = True
vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs)
if vision_feature_layer == -1:
vision_features = vision_outputs.last_hidden_state
else:
vision_features = vision_outputs.hidden_states[vision_feature_layer]
if vision_feature_select_strategy == "default":
vision_features = vision_features[:, 1:, :]
# Calculate dimensions based on vision features
channels = vision_features.shape[1]
feature_size = int(channels**0.5)
batch_size = vision_features.shape[0]
# Reshape tensor to spatial dimensions
vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)
# Apply downsampling using pixel shuffle
vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)
# Reshape tensor to prepare for projection
vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])
# Project features through multi-modal projector
vision_features = self.multi_modal_projector(vision_features)
vision_outputs.pooler_output = vision_features
return vision_outputs
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
input_ids: torch.LongTensor | None = None,
pixel_values: torch.FloatTensor | None = None,
attention_mask: torch.Tensor | None = None,
position_ids: torch.LongTensor | None = None,
past_key_values: Cache | None = None,
inputs_embeds: torch.FloatTensor | None = None,
vision_feature_layer: int | list[int] | None = None,
vision_feature_select_strategy: str | None = None,
cache_position: torch.LongTensor | None = None,
**kwargs: Unpack[TransformersKwargs],
) -> tuple | InternVLModelOutputWithPast:
if (input_ids is None) ^ (inputs_embeds is not None):
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
if inputs_embeds is None:
inputs_embeds = self.get_input_embeddings()(input_ids)
if pixel_values is not None:
image_features = self.get_image_features(
pixel_values=pixel_values,
vision_feature_layer=vision_feature_layer,
vision_feature_select_strategy=vision_feature_select_strategy,
return_dict=True,
).pooler_output
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
special_image_mask = self.get_placeholder_mask(
input_ids, inputs_embeds=inputs_embeds, image_features=image_features
)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
outputs = self.language_model(
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
cache_position=cache_position,
**kwargs,
)
return InternVLModelOutputWithPast(
last_hidden_state=outputs.last_hidden_state,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
image_hidden_states=image_features if pixel_values is not None else None,
)
class InternVLCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
pass
class InternVLForConditionalGeneration(LlavaForConditionalGeneration):
def forward(**super_kwargs):
r"""
Example:
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> torch_device = "cuda"
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
>>> model = AutoModelForImageTextToText.from_pretrained(
... "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
... )
>>> messages = [
... {
... "role": "user",
... "content": [
... {
... "type": "image",
... "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
... },
... {
... "type": "image",
... "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
... },
... {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
... ],
... },
... ]
>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
The images depict the Statue of Liberty and the Golden Gate Bridge.
```"""
super().forward(**super_kwargs)
__all__ = [
"InternVLVisionPreTrainedModel",
"InternVLVisionModel",
"InternVLPreTrainedModel",
"InternVLModel",
"InternVLForConditionalGeneration",
]