You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

44 lines
1.8 KiB

from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself for close apposition and
# measurement construction, the span is sometimes extended to the right of
# the NOUN. Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
# and not just "eine Tasse", same for "das Thema Familie".
# fmt: off
labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_label = doc.vocab.strings.add("NP")
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add("nk")
rbracket = 0
prev_end = -1
for i, word in enumerate(doclike):
if i < rbracket:
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
rbracket = word.i + 1
# try to extend the span to the right
# to capture close apposition/measurement constructions
for rdep in doc[word.i].rights:
if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
rbracket = rdep.i + 1
prev_end = rbracket - 1
yield word.left_edge.i, rbracket, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}