You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

59 lines
1.8 KiB

from typing import Iterator, Tuple, Union
from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
# Please see documentation for Turkish NP structure
labels = [
"nsubj",
"iobj",
"obj",
"obl",
"appos",
"orphan",
"dislocated",
"ROOT",
]
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
flat = doc.vocab.strings.add("flat")
np_label = doc.vocab.strings.add("NP")
def extend_right(w): # Playing a trick for flat
rindex = w.i + 1
for rdep in doc[w.i].rights: # Extend the span to right if there is a flat
if rdep.dep == flat and rdep.pos in (NOUN, PROPN):
rindex = rdep.i + 1
else:
break
return rindex
prev_end = len(doc) + 1
for i, word in reversed(list(enumerate(doclike))):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i >= prev_end:
continue
if word.dep in np_deps:
prev_end = word.left_edge.i
yield word.left_edge.i, extend_right(word), np_label
elif word.dep == conj:
cc_token = word.left_edge
prev_end = cc_token.i
# Shave off cc tokens from the NP
yield cc_token.right_edge.i + 1, extend_right(word), np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}