# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """PyTorch CLIP model.""" from collections.abc import Callable from dataclasses import dataclass from typing import Any import torch from torch import nn from ... import initialization as init from ...activations import ACT2FN from ...masking_utils import create_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ( ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int, ) from ...utils.generic import check_model_inputs from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig logger = logging.get_logger(__name__) # contrastive loss function, adapted from # https://sachinruk.github.io/blog/2021-03-07-clip.html def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) def clip_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: """ This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566 """ square_tensor = torch.pow(tensor, 2) sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True) normed_tensor = torch.pow(sum_tensor, 0.5) return normed_tensor @dataclass @auto_docstring( custom_intro=""" Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. """ ) class CLIPVisionModelOutput(ModelOutput): r""" image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The image embeddings obtained by applying the projection layer to the pooler_output. """ image_embeds: torch.FloatTensor | None = None last_hidden_state: torch.FloatTensor | None = None hidden_states: tuple[torch.FloatTensor, ...] | None = None attentions: tuple[torch.FloatTensor, ...] | None = None @dataclass @auto_docstring( custom_intro=""" Base class for text model's outputs that also contains a pooling of the last hidden states. """ ) class CLIPTextModelOutput(ModelOutput): r""" text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): The text embeddings obtained by applying the projection layer to the pooler_output. """ text_embeds: torch.FloatTensor | None = None last_hidden_state: torch.FloatTensor | None = None hidden_states: tuple[torch.FloatTensor, ...] | None = None attentions: tuple[torch.FloatTensor, ...] | None = None @dataclass @auto_docstring class CLIPOutput(ModelOutput): r""" loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): Contrastive loss for image-text similarity. logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. text_model_output (`BaseModelOutputWithPooling`): The output of the [`CLIPTextModel`]. vision_model_output (`BaseModelOutputWithPooling`): The output of the [`CLIPVisionModel`]. """ loss: torch.FloatTensor | None = None logits_per_image: torch.FloatTensor | None = None logits_per_text: torch.FloatTensor | None = None text_embeds: torch.FloatTensor | None = None image_embeds: torch.FloatTensor | None = None text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> tuple[Any]: return tuple( self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() for k in self.keys() ) class CLIPVisionEmbeddings(nn.Module): def __init__(self, config: CLIPVisionConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size self.image_size = config.image_size self.patch_size = config.patch_size self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) self.patch_embedding = nn.Conv2d( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False, ) self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution images. This method is also adapted to support torch.jit tracing. Adapted from: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ num_patches = embeddings.shape[1] - 1 position_embedding = self.position_embedding.weight.unsqueeze(0) num_positions = position_embedding.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and num_patches == num_positions and height == width: return self.position_embedding(self.position_ids) class_pos_embed = position_embedding[:, :1] patch_pos_embed = position_embedding[:, 1:] dim = embeddings.shape[-1] new_height = height // self.patch_size new_width = width // self.patch_size sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) patch_pos_embed = nn.functional.interpolate( patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False, ) patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): raise ValueError( f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})." ) target_dtype = self.patch_embedding.weight.dtype patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) if interpolate_pos_encoding: embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) else: embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings class CLIPTextEmbeddings(nn.Module): def __init__(self, config: CLIPTextConfig): super().__init__() embed_dim = config.hidden_size self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) def forward( self, input_ids: torch.LongTensor | None = None, position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, ) -> torch.Tensor: seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] max_position_embedding = self.position_embedding.weight.shape[0] if seq_length > max_position_embedding: raise ValueError( f"Sequence length must be less than max_position_embeddings (got `sequence length`: " f"{seq_length} and max_position_embeddings: {max_position_embedding}" ) if position_ids is None: position_ids = self.position_ids[:, :seq_length] if inputs_embeds is None: inputs_embeds = self.token_embedding(input_ids) position_embeddings = self.position_embedding(position_ids) embeddings = inputs_embeds + position_embeddings return embeddings def eager_attention_forward( module: nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: torch.Tensor | None, scaling: float, dropout: float = 0.0, **kwargs: Unpack[TransformersKwargs], ): attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling if attention_mask is not None: attn_weights = attn_weights + attention_mask attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) attn_output = torch.matmul(attn_weights, value) attn_output = attn_output.transpose(1, 2).contiguous() return attn_output, attn_weights class CLIPAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config: CLIPVisionConfig | CLIPTextConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads})." ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.is_causal = False self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor | None]: """Input shape: Batch x Time x Channel""" batch_size, seq_length, embed_dim = hidden_states.shape queries = self.q_proj(hidden_states) keys = self.k_proj(hidden_states) values = self.v_proj(hidden_states) queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( self.config._attn_implementation, eager_attention_forward ) attn_output, attn_weights = attention_interface( self, queries, keys, values, attention_mask, scaling=self.scale, dropout=0.0 if not self.training else self.dropout, **kwargs, ) attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() attn_output = self.out_proj(attn_output) return attn_output, attn_weights class CLIPMLP(nn.Module): def __init__(self, config): super().__init__() self.config = config self.activation_fn = ACT2FN[config.hidden_act] self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.fc1(hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(hidden_states) return hidden_states class CLIPEncoderLayer(GradientCheckpointingLayer): def __init__(self, config: CLIPVisionConfig | CLIPTextConfig): super().__init__() self.embed_dim = config.hidden_size self.self_attn = CLIPAttention(config) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = CLIPMLP(config) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], ) -> torch.FloatTensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) hidden_states, _ = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, **kwargs, ) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states return hidden_states @auto_docstring class CLIPPreTrainedModel(PreTrainedModel): config: CLIPConfig base_model_prefix = "clip" input_modalities = ("image", "text") supports_gradient_checkpointing = True _supports_sdpa = True _supports_flash_attn = True _supports_flex_attn = True _supports_attention_backend = True _can_record_outputs = { "hidden_states": CLIPEncoderLayer, "attentions": CLIPAttention, } @torch.no_grad() def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_factor if isinstance(module, CLIPTextEmbeddings): init.normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02) init.normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02) init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1))) elif isinstance(module, CLIPVisionEmbeddings): factor = self.config.initializer_factor init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) init.copy_(module.position_ids, torch.arange(module.num_positions).expand((1, -1))) elif isinstance(module, CLIPAttention): factor = self.config.initializer_factor in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor out_proj_std = (module.embed_dim**-0.5) * factor init.normal_(module.q_proj.weight, std=in_proj_std) init.normal_(module.k_proj.weight, std=in_proj_std) init.normal_(module.v_proj.weight, std=in_proj_std) init.normal_(module.out_proj.weight, std=out_proj_std) elif isinstance(module, CLIPMLP): factor = self.config.initializer_factor in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * module.config.hidden_size) ** -0.5 * factor init.normal_(module.fc1.weight, std=fc_std) init.normal_(module.fc2.weight, std=in_proj_std) elif isinstance(module, CLIPModel): init.normal_( module.text_projection.weight, std=module.text_embed_dim**-0.5 * self.config.initializer_factor, ) init.normal_( module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, ) elif isinstance(module, CLIPVisionModelWithProjection): init.normal_( module.visual_projection.weight, std=self.config.hidden_size**-0.5 * self.config.initializer_factor, ) elif isinstance(module, CLIPTextModelWithProjection): init.normal_( module.text_projection.weight, std=self.config.hidden_size**-0.5 * self.config.initializer_factor, ) elif isinstance(module, CLIPForImageClassification): init.normal_( module.classifier.weight, std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor, ) if isinstance(module, nn.LayerNorm): init.zeros_(module.bias) init.ones_(module.weight) if isinstance(module, nn.Linear) and module.bias is not None: init.zeros_(module.bias) class CLIPEncoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a [`CLIPEncoderLayer`]. Args: config: CLIPConfig """ def __init__(self, config: CLIPConfig): super().__init__() self.config = config self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( self, inputs_embeds, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) """ hidden_states = inputs_embeds for encoder_layer in self.layers: hidden_states = encoder_layer( hidden_states, attention_mask, **kwargs, ) return BaseModelOutput( last_hidden_state=hidden_states, ) class CLIPTextTransformer(nn.Module): def __init__(self, config: CLIPTextConfig): super().__init__() self.config = config embed_dim = config.hidden_size self.embeddings = CLIPTextEmbeddings(config) self.encoder = CLIPEncoder(config) self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) # For `pooled_output` computation self.eos_token_id = config.eos_token_id @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: if input_ids is None: raise ValueError("You have to specify input_ids") input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) attention_mask = create_causal_mask( config=self.config, input_embeds=hidden_states, attention_mask=attention_mask, cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device), past_key_values=None, ) kwargs.pop("is_causal", None) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, is_causal=True, **kwargs, ) last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.final_layer_norm(last_hidden_state) if self.eos_token_id == 2: # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added # ------------------------------------------------------------ # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 pooled_output = last_hidden_state[ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1), ] else: # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible) pooled_output = last_hidden_state[ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer) (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id) .int() .argmax(dim=-1), ] return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @auto_docstring( custom_intro=""" The text model from CLIP without any head or projection on top. """ ) class CLIPTextModel(CLIPPreTrainedModel): config: CLIPTextConfig input_modalities = ("text",) _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] def __init__(self, config: CLIPTextConfig): super().__init__(config) self.text_model = CLIPTextTransformer(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.text_model.embeddings.token_embedding def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" Examples: ```python >>> from transformers import AutoTokenizer, CLIPTextModel >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled (EOS token) states ```""" return self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, **kwargs, ) class CLIPVisionTransformer(nn.Module): def __init__(self, config: CLIPVisionConfig): super().__init__() self.config = config embed_dim = config.hidden_size self.embeddings = CLIPVisionEmbeddings(config) self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = CLIPEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, **kwargs, ) last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @auto_docstring( custom_intro=""" The vision model from CLIP without any head or projection on top. """ ) class CLIPVisionModel(CLIPPreTrainedModel): config: CLIPVisionConfig main_input_name = "pixel_values" input_modalities = ("image",) _no_split_modules = ["CLIPEncoderLayer"] def __init__(self, config: CLIPVisionConfig): super().__init__(config) self.vision_model = CLIPVisionTransformer(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutputWithPooling: r""" Example: ```python >>> from PIL import Image >>> import httpx >>> from io import BytesIO >>> from transformers import AutoProcessor, CLIPVisionModel >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> with httpx.stream("GET", url) as response: ... image = Image.open(BytesIO(response.read())) >>> inputs = processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" return self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs, ) @auto_docstring class CLIPModel(CLIPPreTrainedModel): config: CLIPConfig _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"] def __init__(self, config: CLIPConfig): super().__init__(config) if not isinstance(config.text_config, CLIPTextConfig): raise TypeError( "config.text_config is expected to be of type CLIPTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, CLIPVisionConfig): raise TypeError( "config.vision_config is expected to be of type CLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) text_config = config.text_config vision_config = config.vision_config self.projection_dim = config.projection_dim self.text_embed_dim = text_config.hidden_size self.vision_embed_dim = vision_config.hidden_size text_model = CLIPTextModel._from_config(text_config) self.text_model = text_model.text_model vision_model = CLIPVisionModel._from_config(vision_config) self.vision_model = vision_model.vision_model self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) # Initialize weights and apply final processing self.post_init() @can_return_tuple @auto_docstring def get_text_features( self, input_ids: torch.Tensor, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" Examples: ```python >>> import torch >>> from transformers import AutoTokenizer, CLIPModel >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> with torch.inference_mode(): ... text_features = model.get_text_features(**inputs) ```""" text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, return_dict=True, **kwargs, ) pooled_output = text_outputs.pooler_output text_outputs.pooler_output = self.text_projection(pooled_output) return text_outputs @can_return_tuple @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], ) -> tuple | BaseModelOutputWithPooling: r""" Examples: ```python >>> import torch >>> from transformers import AutoProcessor, CLIPModel >>> from transformers.image_utils import load_image >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = load_image(url) >>> inputs = processor(images=image, return_tensors="pt") >>> with torch.inference_mode(): ... image_features = model.get_image_features(**inputs) ```""" vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True, **kwargs, ) pooled_output = vision_outputs.pooler_output vision_outputs.pooler_output = self.visual_projection(pooled_output) return vision_outputs @can_return_tuple @auto_docstring def forward( self, input_ids: torch.LongTensor | None = None, pixel_values: torch.FloatTensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, return_loss: bool | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], ) -> CLIPOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. Examples: ```python >>> import torch >>> from transformers import AutoProcessor, CLIPModel >>> from transformers.image_utils import load_image >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = load_image(url) >>> inputs = processor( ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True ... ) >>> with torch.inference_mode(): ... outputs = model(**inputs) >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs, ) text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, **kwargs, ) image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features image_embeds = image_embeds / _get_vector_norm(image_embeds) text_embeds = text_embeds / _get_vector_norm(text_embeds) # cosine similarity as logits logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) logits_per_text = logits_per_text * self.logit_scale.exp().to(text_embeds.device) logits_per_image = logits_per_text.t() loss = None if return_loss: loss = clip_loss(logits_per_text) return CLIPOutput( loss=loss, logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, image_embeds=image_embeds, text_model_output=text_outputs, vision_model_output=vision_outputs, ) @auto_docstring class CLIPTextModelWithProjection(CLIPPreTrainedModel): config: CLIPTextConfig input_modalities = ("text",) _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] def __init__(self, config: CLIPTextConfig): super().__init__(config) text_model = CLIPTextModel._from_config(config) self.text_model = text_model.text_model self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.text_model.embeddings.token_embedding def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> CLIPTextModelOutput: r""" Examples: ```python >>> import torch >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> with torch.inference_mode(): ... outputs = model(**inputs) >>> text_embeds = outputs.text_embeds ```""" text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, **kwargs, ) pooled_output = text_outputs.pooler_output text_embeds = self.text_projection(pooled_output) return CLIPTextModelOutput( text_embeds=text_embeds, last_hidden_state=text_outputs.last_hidden_state, ) @auto_docstring class CLIPVisionModelWithProjection(CLIPPreTrainedModel): config: CLIPVisionConfig main_input_name = "pixel_values" input_modalities = ("image",) def __init__(self, config: CLIPVisionConfig): super().__init__(config) vision_model = CLIPVisionModel._from_config(config) self.vision_model = vision_model.vision_model self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], ) -> CLIPVisionModelOutput: r""" Examples: ```python >>> import torch >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection >>> from transformers.image_utils import load_image >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = load_image(url) >>> inputs = processor(images=image, return_tensors="pt") >>> with torch.inference_mode(): ... outputs = model(**inputs) >>> image_embeds = outputs.image_embeds ```""" vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs, ) pooled_output = vision_outputs.pooler_output image_embeds = self.visual_projection(pooled_output) return CLIPVisionModelOutput( image_embeds=image_embeds, last_hidden_state=vision_outputs.last_hidden_state, ) @auto_docstring( custom_intro=""" CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of the patch tokens) e.g. for ImageNet. """ ) class CLIPForImageClassification(CLIPPreTrainedModel): main_input_name = "pixel_values" input_modalities = ("image",) def __init__(self, config: CLIPConfig) -> None: super().__init__(config) self.num_labels = config.num_labels vision_model = CLIPVisionModel._from_config(config.vision_config) self.vision_model = vision_model.vision_model # Classifier head self.classifier = ( nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() ) # Initialize weights and apply final processing self.post_init() @check_model_inputs(tie_last_hidden_states=False) @auto_docstring def forward( self, pixel_values: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], ) -> ImageClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values, **kwargs, ) sequence_output = outputs.last_hidden_state sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1) logits = self.classifier(sequence_output) loss = None if labels is not None: loss = self.loss_function(labels, logits, self.config) return ImageClassifierOutput( loss=loss, logits=logits, ) __all__ = [ "CLIPModel", "CLIPPreTrainedModel", "CLIPTextModel", "CLIPTextModelWithProjection", "CLIPVisionModel", "CLIPVisionModelWithProjection", "CLIPForImageClassification", ]