You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
498 lines
24 KiB
498 lines
24 KiB
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import enum
|
|
from typing import Any, Union, overload
|
|
|
|
import numpy as np
|
|
|
|
from ..audio_utils import AudioInput
|
|
from ..generation import GenerationConfig
|
|
from ..image_utils import ImageInput
|
|
from ..processing_utils import ProcessingKwargs, Unpack
|
|
from ..utils import (
|
|
add_end_docstrings,
|
|
is_torch_available,
|
|
is_vision_available,
|
|
logging,
|
|
requires_backends,
|
|
)
|
|
from ..video_utils import VideoInput
|
|
from .base import Pipeline, build_pipeline_init_args
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
from ..models.auto.modeling_auto import MODEL_FOR_MULTIMODAL_LM_MAPPING_NAMES
|
|
from .pt_utils import KeyDataset
|
|
|
|
if is_vision_available():
|
|
from PIL import Image
|
|
|
|
logger = logging.get_logger(__name__)
|
|
|
|
|
|
class ReturnType(enum.Enum):
    """Output format selector for text-mode postprocessing.

    TENSORS returns raw generated token ids, NEW_TEXT returns only the newly
    generated continuation, and FULL_TEXT returns prompt plus continuation.
    """

    TENSORS = 0
    NEW_TEXT = 1
    FULL_TEXT = 2
|
|
|
|
|
|
class Chat:
    """Internal wrapper that marks a list of messages as ONE conversation.

    The rest of the pipeline code tends to treat a list as a batch of separate
    samples; wrapping the messages in this class signals that they are turns of
    the same chat instead. Not intended to be exposed to users.
    """

    def __init__(self, messages: list[dict]):
        # Fail fast on malformed chats: every message needs both keys.
        for entry in messages:
            if "role" not in entry or "content" not in entry:
                raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
        self.messages = messages
|
|
|
|
|
|
@add_end_docstrings(build_pipeline_init_args(has_processor=True))
|
|
class AnyToAnyPipeline(Pipeline):
|
|
"""
|
|
Multimodal Generation pipeline using an `AutoModelForMultimodalLM`. This pipeline generates text given any
|
|
combination of multimodal data and text. When the underlying model is a conversational model, it can also
|
|
accept one or more chats, in which case the pipeline will operate in chat mode and will continue the
|
|
chat(s) by adding its response(s). Each chat takes the form of a list of dicts, where each dict contains
|
|
"role" and "content" keys.
|
|
|
|
Unless the model you're using explicitly sets these generation parameters in its configuration files
|
|
(`generation_config.json`), the following default values will be used:
|
|
- max_new_tokens: 256
|
|
|
|
Example:
|
|
|
|
```python
|
|
>>> from transformers import pipeline
|
|
|
|
>>> pipe = pipeline(task="any-to-any", model="google/gemma-3n-E4B-it")
|
|
>>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
|
|
[{'generated_text': 'a photo of two birds'}]
|
|
```
|
|
|
|
```python
|
|
>>> from transformers import pipeline
|
|
|
|
>>> pipe = pipeline("any-to-any", model="google/gemma-3n-E4B-it")
|
|
>>> messages = [
|
|
>>> {
|
|
>>> "role": "user",
|
|
>>> "content": [
|
|
>>> {
|
|
>>> "type": "image",
|
|
>>> "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
|
|
>>> },
|
|
>>> {"type": "text", "text": "Describe this image."},
|
|
>>> ],
|
|
>>> },
|
|
>>> {
|
|
>>> "role": "assistant",
|
|
>>> "content": [
|
|
>>> {"type": "text", "text": "There is a dog and"},
|
|
>>> ],
|
|
>>> },
|
|
>>> ]
|
|
>>> pipe(text=messages, max_new_tokens=20, return_full_text=False)
|
|
[{'input_text': [{'role': 'user',
|
|
'content': [{'type': 'image',
|
|
'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
|
|
{'type': 'text', 'text': 'Describe this image.'}]},
|
|
{'role': 'assistant',
|
|
'content': [{'type': 'text', 'text': 'There is a dog and'}]}],
|
|
'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}]
|
|
```
|
|
|
|
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
|
|
|
|
This multimodal pipeline can currently be loaded from pipeline() using the following task identifier:
|
|
"any-to-any".
|
|
|
|
See the list of available models on
|
|
[huggingface.co/models](https://huggingface.co/models?pipeline_tag=any-to-any).
|
|
"""
|
|
|
|
# Component-loading flags for the base `Pipeline`: this task relies on a single
# processor object (loaded via `has_processor=True`), so the standalone image
# processor / feature extractor / tokenizer loaders are disabled.
_load_processor = True
_load_image_processor = False
_load_feature_extractor = False
_load_tokenizer = False

# This pipeline calls `model.generate`, so the base class applies the shared
# generation-config handling.
_pipeline_calls_generate = True
# Make sure the docstring is updated when the default generation config is changed
_default_generation_config = GenerationConfig(
    max_new_tokens=256,
)
|
|
|
|
def __init__(self, *args, **kwargs):
    """Initialize the pipeline and check that the optional backends needed by
    the model's input modalities are available (vision/torchvision for image or
    video inputs, librosa for audio inputs)."""
    super().__init__(*args, **kwargs)
    modalities = self.model.input_modalities
    if "image" in modalities or "video" in modalities:
        requires_backends(self, "vision")
        requires_backends(self, "torchvision")
    if "audio" in modalities:
        requires_backends(self, "librosa")
    self.check_model_type(MODEL_FOR_MULTIMODAL_LM_MAPPING_NAMES)
|
|
|
|
def _sanitize_parameters(
    self,
    max_new_tokens=None,
    generate_kwargs=None,
    timeout=None,
    return_full_text=None,
    return_tensors=None,
    return_type=None,
    clean_up_tokenization_spaces=None,
    stop_sequence=None,
    continue_final_message=None,
    skip_special_tokens=None,
    generation_mode=None,
    **kwargs: Unpack[ProcessingKwargs],
):
    """Split call-time kwargs into (preprocess, forward, postprocess) dicts.

    Unrecognized keyword arguments are forwarded to the processor at preprocess
    time. Generation-related convenience arguments (`max_new_tokens`,
    `stop_sequence`, `generation_mode`, ...) are folded into
    `forward_kwargs["generate_kwargs"]`, and the return-format flags are
    resolved into a `ReturnType` for `postprocess`.

    Raises:
        ValueError: if `max_new_tokens` is given both directly and in
            `generate_kwargs`, if `return_full_text` and `return_tensors` are
            both set, or if `generation_mode` conflicts with the return type or
            is not supported by the model.
    """
    forward_kwargs = {}
    preprocess_params = {}
    postprocess_params = {}

    # Preprocess params
    preprocess_params.update(kwargs)
    if timeout is not None:
        preprocess_params["timeout"] = timeout
    if continue_final_message is not None:
        preprocess_params["continue_final_message"] = continue_final_message

    # Forward kwargs
    forward_kwargs["generate_kwargs"] = generate_kwargs or {}
    # "text" is the implicit default mode, so it is not forwarded explicitly.
    if generation_mode is not None and generation_mode != "text":
        forward_kwargs["generate_kwargs"]["generation_mode"] = generation_mode
    # `load_audio_from_video` is a processing kwarg, but generate also needs to
    # be told to use the audio track extracted from the video.
    if kwargs.get("load_audio_from_video"):
        forward_kwargs["generate_kwargs"]["use_audio_in_video"] = True
    if stop_sequence is not None:
        if isinstance(stop_sequence, str):
            stop_sequence = [stop_sequence]
        # `stop_strings` requires a tokenizer to be passed alongside it.
        forward_kwargs["generate_kwargs"]["stop_strings"] = stop_sequence
        forward_kwargs["generate_kwargs"]["tokenizer"] = self.processor.tokenizer

    if max_new_tokens is not None:
        if generate_kwargs is not None and "max_new_tokens" in generate_kwargs:
            raise ValueError(
                "'max_new_tokens' is defined twice, once in 'generate_kwargs' and "
                "once as a direct argument. Please use only one."
            )
        forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens

    # Resolve the return format: explicit `return_type` wins, then
    # `return_full_text`, then `return_tensors`, then the FULL_TEXT default.
    if return_full_text is not None and return_type is None:
        if return_tensors is not None:
            raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
        return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
    elif return_tensors is not None and return_type is None:
        return_type = ReturnType.TENSORS
    # We don't want to set the global default to FULLTEXT at init time. That is why
    # `_postprocess_params` is checked before setting the default value
    elif return_type is None and generation_mode in [None, "text"] and hasattr(self, "_postprocess_params"):
        return_type = ReturnType.FULL_TEXT

    # Postprocess params
    # Non-text modes return decoded images/audio, so a text ReturnType makes no sense.
    if generation_mode not in [None, "text"] and return_type is not None:
        raise ValueError(
            f"`return_type` cannot be set to {return_type} when generation_mode={generation_mode}. "
            "Set `return_type=None` or generation_mode='text'"
        )
    if generation_mode not in [None, "text", "image", "audio"]:
        raise ValueError(
            f"`generation_mode` can be only one of the `text`, `audio`, `image` but got generation_mode[={generation_mode}]"
        )
    elif generation_mode is not None and generation_mode not in self.model.output_modalities:
        raise ValueError(
            f"`generation_mode={generation_mode}` is not supported for {self.model.__class__.__name__}. "
            f"The model can only output the following modalities: {self.model.output_modalities}"
        )

    if return_type is not None:
        postprocess_params["return_type"] = return_type
    if continue_final_message is not None:
        postprocess_params["continue_final_message"] = continue_final_message
    if clean_up_tokenization_spaces is not None:
        postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
    if skip_special_tokens is not None:
        postprocess_params["skip_special_tokens"] = skip_special_tokens
    # Always forwarded (possibly None); `postprocess` defaults it to "text".
    postprocess_params["generation_mode"] = generation_mode
    return preprocess_params, forward_kwargs, postprocess_params
|
|
|
|
# Typing-only overload: a single sample (one prompt, optionally one image /
# video / audio) yields a single list of result dicts.
@overload
def __call__(
    self,
    text: str | None = None,
    images: Union[str, "Image.Image"] | None = None,
    videos: Union[str, "np.ndarray", "torch.Tensor"] | None = None,
    audio: Union[str, "np.ndarray"] | None = None,
    **kwargs: Any,
) -> list[dict[str, Any]]: ...

# Typing-only overload: batched inputs yield one list of result dicts per sample.
@overload
def __call__(
    self,
    text: list[str] | None = None,
    images: list[str] | list["Image.Image"] | None = None,
    videos: list[str] | list["np.ndarray"] | list["torch.Tensor"] | None = None,
    audio: list[str] | list["np.ndarray"] | None = None,
    **kwargs: Any,
) -> list[list[dict[str, Any]]]: ...
|
|
|
|
def __call__(
    self,
    text: str | list[str] | list[dict],
    images: str | list[str] | list[list[str]] | ImageInput | None = None,
    videos: str | list[str] | VideoInput | None = None,
    audio: str | list[str] | AudioInput | None = None,
    **kwargs,
) -> list[dict[str, Any]] | list[list[dict[str, Any]]]:
    """
    Generate a text given text and optionally multimodal data passed as inputs.

    Args:
        text (`str`, `list[str]`, `list[dict]`):
            The text to be used for generation. If a list of strings is passed, the length of the list should be
            the same as the number of images. Text can also follow the chat format: a list of dictionaries where
            each dictionary represents a message in a conversation. Each dictionary should have two keys: 'role'
            and 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of
            dictionary containing the text of the message and the type of the message.
        images (`str`, `list[str]`, `ImageInput`):
            The pipeline handles three types of images:

            - A string containing a HTTP(s) link pointing to an image
            - A string containing a local path to an image
            - An image loaded in PIL directly

            The pipeline accepts either a single image or a batch of images. Finally, this pipeline also supports
            the chat format (see `text`) containing images and text in this argument.
        videos (`str`, `list[str]`, `VideoInput`):
            The pipeline handles three types of videos:

            - A string containing a HTTP(s) link pointing to a video
            - A string containing a local path to a video
            - A video loaded and decoded to array format

            The pipeline accepts either a single video or a batch of videos. Finally, this pipeline also supports
            the chat format (see `text`) containing videos and text in this argument.
        audio (`str`, `list[str]`, `AudioInput`):
            The pipeline handles three types of audios:

            - A string containing a HTTP(s) link pointing to an audio
            - A string containing a local path to an audio
            - An audio loaded and decoded to array format

            The pipeline accepts either a single audio or a batch of audios. Finally, this pipeline also supports
            the chat format (see `text`) containing audios and text in this argument.
        return_tensors (`bool`, *optional*, defaults to `False`):
            Returns the tensors of predictions (as token indices) in the outputs. If set to
            `True`, the decoded text is not returned.
        return_text (`bool`, *optional*):
            Returns the decoded texts in the outputs.
            NOTE(review): `return_text` is not consumed by `_sanitize_parameters` — verify against the base
            pipeline before relying on it.
        return_full_text (`bool`, *optional*, defaults to `True`):
            If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
            specified at the same time as `return_text`.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
            Whether or not to clean up the potential extra spaces in the text output.
        continue_final_message( `bool`, *optional*): This indicates that you want the model to continue the
            last message in the input chat rather than starting a new one, allowing you to "prefill" its response.
            By default this is `True` when the final message in the input chat has the `assistant` role and
            `False` otherwise, but you can manually override that behaviour by setting this flag.

    Return:
        A list or a list of list of `dict`: Each result comes as a dictionary with the following key (cannot
        return a combination of both `generated_text` and `generated_token_ids`):

        - **generated_text** (`str`, present when `return_text=True` and `generation_mode="text"`) -- The generated text.
        - **generated_audio** (`np.ndarray`, present when `generation_mode="audio"`) -- The generated audio.
        - **generated_image** (`PIL.Image.Image`, present when `generation_mode="image"`) -- The generated image.
        - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True` and `generation_mode="text"`) -- The token
          ids of the generated text.
        - **input_text** (`str`) -- The input text.
    """
    if images is None and text is None:
        raise ValueError("You must at least provide either text or images.")

    if isinstance(text, (list, tuple, KeyDataset)) and isinstance(text[0], (list, tuple, dict)):
        # We have one or more prompts in list-of-dicts format, so this is chat mode
        if isinstance(text[0], dict) and "role" in text[0]:
            # A single conversation: wrap it so the base pipeline treats it as one sample.
            return super().__call__(Chat(text), **kwargs)
        elif isinstance(text[0], (list, tuple)) and isinstance(text[0][0], dict) and "role" in text[0][0]:
            # A batch of conversations: wrap each one individually.
            chats = [Chat(chat) for chat in text]  # 🐈 🐈 🐈
            return super().__call__(chats, **kwargs)

    if text is not None and not (isinstance(text, str) or (isinstance(text, list) and isinstance(text[0], str))):
        """
        Supports the following format
        - {"text": text, "image": image, "video": video, "audio": audio}
        - [{"text": text, "image": image, "video": video, "audio": audio}]
        - Generator and datasets
        This is a common pattern in other multimodal pipelines, so we support it here as well.
        """
        return super().__call__(text, **kwargs)

    # encourage the user to use the chat format if supported
    if getattr(self.processor, "chat_template", None) is not None:
        logger.warning_once(
            "The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even "
            "though this model supports chat. Consider using the chat format for better results. For more "
            "information, see https://huggingface.co/docs/transformers/en/chat_templating"
        )

    # Plain (non-chat) inputs: bundle every modality into one sample dict.
    return super().__call__({"text": text, "images": images, "video": videos, "audio": audio}, **kwargs)
|
|
|
|
def preprocess(self, inputs=None, timeout=None, continue_final_message=None, **processing_kwargs):
    """Turn raw user inputs (a `Chat`, plain text, or a modality dict) into
    tokenized model inputs, keeping the original text under the "text" key so
    `postprocess` can strip the prompt later.

    NOTE(review): `timeout` is accepted (it is set by `_sanitize_parameters`)
    but is not used in this body — verify whether it should be forwarded to the
    processor.
    """
    if isinstance(inputs, Chat):
        # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
        # because very few models support multiple separate, consecutive assistant messages
        if continue_final_message is None:
            continue_final_message = inputs.messages[-1]["role"] == "assistant"

        # Handle Mistral tokenizer which does not accept processing kwargs
        chat_template_kwargs = {"add_generation_prompt": not continue_final_message, **processing_kwargs}
        if self.processor.tokenizer.__class__.__name__ == "MistralCommonBackend":
            # Keep only the kwargs that backend understands.
            chat_template_kwargs = {
                k: v for k, v in chat_template_kwargs.items() if k in ["padding", "truncation", "max_length"]
            }

        model_inputs = self.processor.apply_chat_template(
            inputs.messages,
            continue_final_message=continue_final_message,
            return_tensors="pt",
            tokenize=True,
            return_dict=True,
            **chat_template_kwargs,
        ).to(dtype=self.dtype)
        # Carry the original Chat through to postprocess.
        model_inputs["text"] = inputs
        return model_inputs

    # In case we only have text inputs
    if isinstance(inputs, (list, tuple, str)):
        text = inputs
        inputs = {}
    else:
        inputs = inputs.copy()  # avoid in-place changes if users passed dict
        text = inputs.pop("text")

    # Feature extractor do not load audio files and expect a decode array
    if inputs.get("audio", None) is not None and hasattr(self.processor, "feature_extractor"):
        inputs["audio"] = self.processor.feature_extractor.fetch_audio(inputs["audio"])

    # If batched text inputs, we set padding to True unless specified otherwise
    if isinstance(text, (list, tuple)) and len(text) > 1:
        processing_kwargs.setdefault("padding", True)

    # Multimodal data is loaded in preprocessors so we pass all inputs directly to `self.processor`
    model_inputs = self.processor(text=text, **inputs, return_tensors="pt", **processing_kwargs).to(
        dtype=self.dtype
    )
    model_inputs["text"] = text
    return model_inputs
|
|
|
|
def _forward(self, model_inputs, generate_kwargs=None):
    """Run `self.model.generate` on the preprocessed inputs.

    The prompt text and input ids are popped/read out of `model_inputs` and
    carried through to `postprocess`, which needs them to strip the prompt
    from the decoded output.
    """
    if generate_kwargs is None:
        generate_kwargs = {}
    prompt_text = model_inputs.pop("text")
    input_ids = model_inputs.get("input_ids", model_inputs.get("decoder_input_ids"))

    # A `generation_config` supplied by the user in `generate_kwargs` takes
    # precedence over the pipeline-level default.
    generate_kwargs.setdefault("generation_config", self.generation_config)

    sequences = self.model.generate(**model_inputs, **generate_kwargs)
    return {"generated_sequence": sequences, "prompt_text": prompt_text, "input_ids": input_ids}
|
|
|
|
def postprocess(
    self,
    model_outputs,
    return_type=None,
    continue_final_message=None,
    skip_special_tokens=None,
    **postprocess_kwargs,
):
    """Decode generated sequences into the final records.

    Depending on `return_type` and `generation_mode`, returns raw token ids,
    only the newly generated text, the full text (prompt + continuation, with
    chats extended by the new assistant message), or decoded image/audio
    outputs. Each record also carries the original input under "input_text".
    """
    input_texts = model_outputs["prompt_text"]
    # Normalize a single sample to a one-element batch.
    input_texts = [input_texts] if isinstance(input_texts, (str, Chat)) else input_texts
    generated_sequence = model_outputs["generated_sequence"]
    input_ids = model_outputs["input_ids"]
    if return_type == ReturnType.TENSORS:
        return [
            {"input_text": input_texts[i], "generated_token_ids": generated_sequence[i]}
            for i in range(len(input_texts))
        ]

    # Decode inputs and outputs the same way to remove input text from generated text if present
    skip_special_tokens = skip_special_tokens if skip_special_tokens is not None else True
    generation_mode = postprocess_kwargs["generation_mode"] or "text"
    if generation_mode == "image" and hasattr(self.model, "decode_image_tokens"):
        generated_sequence = self.model.decode_image_tokens(generated_sequence.to(self.model.device))
    generated_outputs = self.processor.post_process_multimodal_output(
        generated_sequence, skip_special_tokens=skip_special_tokens, **postprocess_kwargs
    )

    # Force consistent behavior for including the input text in the output
    if return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
        # Remove the input text from the generated text if the generated text starts with the input text
        # (accounting for the possibility of a space between the input and generated text)
        new_generated_texts = []
        # The prompt must always be decoded as text, whatever the output mode.
        postprocess_kwargs["generation_mode"] = "text"
        decoded_inputs = self.processor.post_process_multimodal_output(
            input_ids, skip_special_tokens=skip_special_tokens, **postprocess_kwargs
        )
        for text_generated, decoded_input in zip(generated_outputs, decoded_inputs):
            # There can be added characters before the input text, so we need to find the beginning of the input text in the generated text
            index_input_text = text_generated.find(decoded_input)
            # Limit the search to 2 residual characters, like spaces or new lines, to avoid removing a large part of the answer
            if 0 <= index_input_text <= 2:
                # If the input text is found, we remove it
                new_generated_texts.append(text_generated[index_input_text + len(decoded_input) :])
            else:
                new_generated_texts.append(text_generated)
        generated_outputs = new_generated_texts
    if return_type == ReturnType.FULL_TEXT:
        full_texts = []
        for prompt_text, generated_text in zip(input_texts, generated_outputs):
            if isinstance(prompt_text, str):
                generated_text = prompt_text + generated_text
            elif isinstance(prompt_text, Chat):
                if continue_final_message is None:
                    # If the user passes a chat ending in an assistant message, we treat it as a prefill by
                    # default because very few models support multiple separate, consecutive assistant messages
                    # NOTE(review): once set here, the value sticks for the remaining chats in the
                    # batch instead of being re-derived per chat — verify this is intended.
                    continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
                if continue_final_message:
                    # With assistant prefill, concat onto the end of the last message
                    new_text = dict(prompt_text.messages[-1]["content"][-1].items())
                    new_text["text"] += generated_text
                    generated_text = list(prompt_text.messages)[:-1] + [
                        {
                            "role": prompt_text.messages[-1]["role"],
                            "content": prompt_text.messages[-1]["content"][:-1] + [new_text],
                        }
                    ]
                else:
                    # When we're not starting from a prefill, the output is a new assistant message
                    generated_text = list(prompt_text.messages) + [
                        {"role": "assistant", "content": generated_text}
                    ]
            full_texts.append(generated_text)
        generated_outputs = full_texts

    # Key the generated payload by modality, e.g. "generated_text" / "generated_image".
    records = [
        {
            "input_text": input_text.messages if isinstance(input_text, Chat) else input_text,
            f"generated_{generation_mode}": generated_output,
        }
        for input_text, generated_output in zip(input_texts, generated_outputs)
    ]

    return records
|