You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

50 lines
1.9 KiB

from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
np_label = doc.vocab.strings.add("NP")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
left = word.left_edge.i
right = word.right_edge.i + 1
# leave prepositions and punctuation out of the left side of the chunk
if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
left = word.left_edge.i + 1
prev_end = word.right_edge.i
# leave subordinated clauses and appositions out of the chunk
a = word.i + 1
while a < word.right_edge.i:
paraula = doc[a]
if paraula.pos_ == "VERB":
right = paraula.left_edge.i
prev_end = paraula.left_edge.i - 1
elif paraula.dep_ == "appos":
right = paraula.left_edge.i + 1
prev_end = paraula.left_edge.i - 1
a += 1
# leave punctuation out of the right side of the chunk
if word.right_edge.pos_ == "PUNCT":
right = right - 1
yield left, right, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}