# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for MarkupLM.
"""
from ...file_utils import TensorType
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy
from ...utils import auto_docstring


@auto_docstring
class MarkupLMProcessor(ProcessorMixin):
    r"""
    Combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single processor.

    Args:
        feature_extractor:
            Feature extractor used to parse raw HTML strings into `nodes` and `xpaths`
            (only used when `parse_html=True`).
        tokenizer:
            Tokenizer used to turn nodes/xpaths (and optional questions) into model inputs.
    """

    # When True (default), `__call__` expects raw `html_strings` and derives `nodes`/`xpaths`
    # via the feature extractor; when False, callers must supply `nodes` and `xpaths` directly.
    parse_html = True

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    @auto_docstring
    def __call__(
        self,
        html_strings=None,
        nodes=None,
        xpaths=None,
        node_labels=None,
        questions=None,
        add_special_tokens: bool = True,
        padding: bool | str | PaddingStrategy = False,
        # fixed: default is None, so the annotation must include `None`
        truncation: bool | str | TruncationStrategy | None = None,
        max_length: int | None = None,
        stride: int = 0,
        pad_to_multiple_of: int | None = None,
        return_token_type_ids: bool | None = None,
        return_attention_mask: bool | None = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchEncoding:
        r"""
        html_strings (`str` or `list[str]`, *optional*):
            Raw HTML strings to parse and process. When `parse_html=True` (default), these strings are parsed
            to extract nodes and xpaths automatically. If provided, `nodes`, `xpaths`, and `node_labels` should
            not be provided. Required when `parse_html=True`.
        nodes (`list[list[str]]`, *optional*):
            Pre-extracted HTML nodes as a list of lists, where each inner list contains the text content of nodes
            for a single document. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
        xpaths (`list[list[str]]`, *optional*):
            Pre-extracted XPath expressions corresponding to the nodes. Should be a list of lists with the same
            structure as `nodes`, where each XPath identifies the location of the corresponding node in the HTML
            tree. Required when `parse_html=False`. Should not be provided when `parse_html=True`.
        node_labels (`list[list[int]]`, *optional*):
            Labels for the nodes, typically used for training or fine-tuning tasks. Should be a list of lists
            with the same structure as `nodes`, where each label corresponds to a node. Optional and only used
            when `parse_html=False`.
        questions (`str` or `list[str]`, *optional*):
            Question strings for question-answering tasks. When provided, the tokenizer processes questions
            as the first sequence and nodes as the second sequence (text_pair). If a single string is provided,
            it is converted to a list to match the batch dimension of the parsed HTML.
        """
        # first, create nodes and xpaths — either by parsing the raw HTML with the
        # feature extractor, or by validating the pre-extracted inputs
        if self.parse_html:
            if html_strings is None:
                raise ValueError("Make sure to pass HTML strings in case `parse_html` is set to `True`")

            if nodes is not None or xpaths is not None or node_labels is not None:
                raise ValueError(
                    "Please don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`"
                )

            features = self.feature_extractor(html_strings)
            nodes = features["nodes"]
            xpaths = features["xpaths"]
        else:
            if html_strings is not None:
                raise ValueError("You have passed HTML strings but `parse_html` is set to `False`.")
            if nodes is None or xpaths is None:
                raise ValueError("Make sure to pass nodes and xpaths in case `parse_html` is set to `False`")

        # second, apply the tokenizer
        if questions is not None and self.parse_html:
            if isinstance(questions, str):
                questions = [questions]  # add batch dimension (as the feature extractor always adds a batch dimension)

        encoded_inputs = self.tokenizer(
            text=questions if questions is not None else nodes,
            text_pair=nodes if questions is not None else None,
            xpaths=xpaths,
            node_labels=node_labels,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )

        return encoded_inputs
# Public API of this module.
__all__ = ["MarkupLMProcessor"]