You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

434 lines
17 KiB

# Copyright 2023 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ConvNextV2 model."""
import torch
from torch import nn
from ... import initialization as init
from ...activations import ACT2FN
from ...backbone_utils import BackboneMixin
from ...modeling_outputs import (
BackboneOutput,
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.generic import can_return_tuple
from .configuration_convnextv2 import ConvNextV2Config
logger = logging.get_logger(__name__)
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->ConvNextV2
class ConvNextV2DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: float | None = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return f"p={self.drop_prob}"
class ConvNextV2GRN(nn.Module):
"""GRN (Global Response Normalization) layer"""
def __init__(self, dim: int):
super().__init__()
self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
# Compute and normalize global spatial feature maps
global_features = torch.linalg.vector_norm(hidden_states, ord=2, dim=(1, 2), keepdim=True)
norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6)
hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states
return hidden_states
# Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->ConvNextV2
class ConvNextV2LayerNorm(nn.LayerNorm):
r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
super().__init__(normalized_shape, eps=eps, **kwargs)
if data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError(f"Unsupported data format: {data_format}")
self.data_format = data_format
def forward(self, features: torch.Tensor) -> torch.Tensor:
"""
Args:
features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
"""
if self.data_format == "channels_first":
features = features.permute(0, 2, 3, 1)
features = super().forward(features)
features = features.permute(0, 3, 1, 2)
else:
features = super().forward(features)
return features
# Copied from transformers.models.convnext.modeling_convnext.ConvNextEmbeddings with ConvNext->ConvNextV2
class ConvNextV2Embeddings(nn.Module):
"""This class is comparable to (and inspired by) the SwinEmbeddings class
found in src/transformers/models/swin/modeling_swin.py.
"""
def __init__(self, config):
super().__init__()
self.patch_embeddings = nn.Conv2d(
config.num_channels, config.hidden_sizes[0], kernel_size=config.patch_size, stride=config.patch_size
)
self.layernorm = ConvNextV2LayerNorm(config.hidden_sizes[0], eps=1e-6, data_format="channels_first")
self.num_channels = config.num_channels
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.patch_embeddings(pixel_values)
embeddings = self.layernorm(embeddings)
return embeddings
class ConvNextV2Layer(nn.Module):
"""This corresponds to the `Block` class in the original implementation.
There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back
The authors used (2) as they find it slightly faster in PyTorch.
Args:
config ([`ConvNextV2Config`]): Model configuration class.
dim (`int`): Number of input channels.
drop_path (`float`): Stochastic depth rate. Default: 0.0.
"""
def __init__(self, config, dim, drop_path=0):
super().__init__()
# depthwise conv
self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
self.layernorm = ConvNextV2LayerNorm(dim, eps=1e-6)
# pointwise/1x1 convs, implemented with linear layers
self.pwconv1 = nn.Linear(dim, 4 * dim)
self.act = ACT2FN[config.hidden_act]
self.grn = ConvNextV2GRN(4 * dim)
self.pwconv2 = nn.Linear(4 * dim, dim)
self.drop_path = ConvNextV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
def forward(self, features: torch.Tensor) -> torch.Tensor:
residual = features
features = self.dwconv(features)
# (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
features = features.permute(0, 2, 3, 1)
features = self.layernorm(features)
features = self.pwconv1(features)
features = self.act(features)
features = self.grn(features)
features = self.pwconv2(features)
# (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
features = features.permute(0, 3, 1, 2)
features = residual + self.drop_path(features)
return features
# Copied from transformers.models.convnext.modeling_convnext.ConvNextStage with ConvNeXT->ConvNeXTV2, ConvNext->ConvNextV2
class ConvNextV2Stage(nn.Module):
"""ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.
Args:
config ([`ConvNextV2Config`]): Model configuration class.
in_channels (`int`): Number of input channels.
out_channels (`int`): Number of output channels.
depth (`int`): Number of residual blocks.
drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
"""
def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
super().__init__()
if in_channels != out_channels or stride > 1:
self.downsampling_layer = nn.ModuleList(
[
ConvNextV2LayerNorm(in_channels, eps=1e-6, data_format="channels_first"),
nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride),
]
)
else:
self.downsampling_layer = nn.ModuleList()
drop_path_rates = drop_path_rates or [0.0] * depth
self.layers = nn.ModuleList(
[ConvNextV2Layer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]
)
def forward(self, features: torch.Tensor) -> torch.Tensor:
for layer in self.downsampling_layer:
features = layer(features)
for layer in self.layers:
features = layer(features)
return features
# Copied from transformers.models.convnext.modeling_convnext.ConvNextEncoder with ConvNext->ConvNextV2
class ConvNextV2Encoder(nn.Module):
def __init__(self, config):
super().__init__()
self.stages = nn.ModuleList()
drop_path_rates = [
x.tolist()
for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").split(config.depths)
]
prev_chs = config.hidden_sizes[0]
for i in range(config.num_stages):
out_chs = config.hidden_sizes[i]
stage = ConvNextV2Stage(
config,
in_channels=prev_chs,
out_channels=out_chs,
stride=2 if i > 0 else 1,
depth=config.depths[i],
drop_path_rates=drop_path_rates[i],
)
self.stages.append(stage)
prev_chs = out_chs
def forward(
self, hidden_states: torch.Tensor, output_hidden_states: bool | None = False
) -> BaseModelOutputWithNoAttention:
all_hidden_states = [hidden_states] if output_hidden_states else None
for layer_module in self.stages:
hidden_states = layer_module(hidden_states)
if all_hidden_states is not None:
all_hidden_states.append(hidden_states)
return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
@auto_docstring
class ConvNextV2PreTrainedModel(PreTrainedModel):
config: ConvNextV2Config
base_model_prefix = "convnextv2"
main_input_name = "pixel_values"
input_modalities = ("image",)
_no_split_modules = ["ConvNextV2Layer"]
@torch.no_grad()
def _init_weights(self, module):
"""Initialize the weights"""
super()._init_weights(module)
if isinstance(module, ConvNextV2GRN):
init.zeros_(module.weight)
init.zeros_(module.bias)
@auto_docstring
# Copied from transformers.models.convnext.modeling_convnext.ConvNextModel with CONVNEXT->CONVNEXTV2, ConvNext->ConvNextV2
class ConvNextV2Model(ConvNextV2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.config = config
self.embeddings = ConvNextV2Embeddings(config)
self.encoder = ConvNextV2Encoder(config)
# final layernorm layer
self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
# Initialize weights and apply final processing
self.post_init()
@can_return_tuple
@auto_docstring
def forward(
self, pixel_values: torch.FloatTensor | None = None, output_hidden_states: bool | None = None, **kwargs
) -> BaseModelOutputWithPoolingAndNoAttention:
if output_hidden_states is None:
output_hidden_states = self.config.output_hidden_states
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.embeddings(pixel_values)
encoder_outputs: BaseModelOutputWithNoAttention = self.encoder(
embedding_output, output_hidden_states=output_hidden_states
)
last_hidden_state = encoder_outputs.last_hidden_state
# global average pooling, (N, C, H, W) -> (N, C)
pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
@auto_docstring(
custom_intro="""
ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
"""
)
# Copied from transformers.models.convnext.modeling_convnext.ConvNextForImageClassification with CONVNEXT->CONVNEXTV2,ConvNext->ConvNextV2,convnext->convnextv2
class ConvNextV2ForImageClassification(ConvNextV2PreTrainedModel):
accepts_loss_kwargs = False
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.convnextv2 = ConvNextV2Model(config)
# Classifier head
if config.num_labels > 0:
self.classifier = nn.Linear(config.hidden_sizes[-1], config.num_labels)
else:
self.classifier = nn.Identity()
# Initialize weights and apply final processing
self.post_init()
@can_return_tuple
@auto_docstring
def forward(
self, pixel_values: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs
) -> ImageClassifierOutputWithNoAttention:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
outputs: BaseModelOutputWithPoolingAndNoAttention = self.convnextv2(pixel_values, **kwargs)
pooled_output = outputs.pooler_output
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
loss = self.loss_function(labels=labels, pooled_logits=logits, config=self.config)
return ImageClassifierOutputWithNoAttention(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
)
@auto_docstring(
custom_intro="""
ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
"""
)
# Copied from transformers.models.convnext.modeling_convnext.ConvNextBackbone with CONVNEXT->CONVNEXTV2,ConvNext->ConvNextV2,facebook/convnext-tiny-224->facebook/convnextv2-tiny-1k-224
class ConvNextV2Backbone(BackboneMixin, ConvNextV2PreTrainedModel):
has_attentions = False
def __init__(self, config):
super().__init__(config)
self.embeddings = ConvNextV2Embeddings(config)
self.encoder = ConvNextV2Encoder(config)
self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes
# Add layer norms to hidden states of out_features
hidden_states_norms = {}
for stage, num_channels in zip(self.out_features, self.channels):
hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, data_format="channels_first")
self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
# initialize weights and apply final processing
self.post_init()
@can_return_tuple
@auto_docstring
def forward(
self, pixel_values: torch.Tensor, output_hidden_states: bool | None = None, **kwargs
) -> BackboneOutput:
r"""
Examples:
```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
... image = Image.open(BytesIO(response.read()))
>>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
>>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
```"""
if output_hidden_states is None:
output_hidden_states = self.config.output_hidden_states
embedding_output = self.embeddings(pixel_values)
outputs: BaseModelOutputWithPoolingAndNoAttention = self.encoder(embedding_output, output_hidden_states=True)
hidden_states = outputs.hidden_states
feature_maps = []
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
hidden_state = self.hidden_states_norms[stage](hidden_state)
feature_maps.append(hidden_state)
return BackboneOutput(
feature_maps=tuple(feature_maps),
hidden_states=hidden_states if output_hidden_states else None,
)
__all__ = ["ConvNextV2ForImageClassification", "ConvNextV2Model", "ConvNextV2PreTrainedModel", "ConvNextV2Backbone"]