You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
2.3 KiB
75 lines
2.3 KiB
from typing import Iterator, Tuple, Union
|
|
|
|
from ...errors import Errors
|
|
from ...symbols import NOUN, PRON, PROPN
|
|
from ...tokens import Doc, Span
|
|
|
|
|
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
    """
    Yield base noun phrases for Haitian Creole from a dependency parse.

    Accepts either a Doc or a Span. Every yielded item is a
    (start, end, label) triple where `start` and `end` are built from the
    tokens' `.i` indices (`end` is exclusive) and `label` is the string-store
    ID of "NP".

    Because this is a generator, the ValueError(Errors.E029) for a document
    without "DEP" annotation is raised when iteration starts, not at call time.
    """
    # Dependency relations whose nominal token may head a chunk.
    head_dep_names = ("nsubj", "obj", "obl", "nmod", "appos", "ROOT")
    # Right-hand dependents that extend a chunk past its head.
    right_mod_names = ("compound", "flat", "flat:name", "fixed")

    doc = doclike.doc
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    intern = doc.vocab.strings.add
    np_deps = {intern(name) for name in head_dep_names}
    np_mods = {intern(name) for name in right_mod_names}
    conj_label = intern("conj")
    np_label = intern("NP")
    adp_pos = intern("ADP")
    cc_pos = intern("CCONJ")
    nominal_pos = (NOUN, PROPN, PRON)

    # Index of the last token already covered by a yielded chunk.
    last_covered = -1
    for token in doclike:
        if token.pos not in nominal_pos:
            continue
        leftmost = token.left_edge
        # Never emit a chunk that would overlap the previous one.
        if leftmost.i <= last_covered:
            continue

        dep = token.dep
        if dep in np_deps:
            last = token
            # Stretch rightwards over listed modifiers and over noun children;
            # the last matching right child decides where the chunk ends.
            for child in token.rights:
                if child.dep in np_mods or child.pos == NOUN:
                    last = child.right_edge

            # A leading adposition is left out of the chunk.
            start = leftmost.i + 1 if leftmost.pos == adp_pos else leftmost.i
            last_covered = last.i
            yield start, last.i + 1, np_label
            continue

        if dep != conj_label:
            continue

        # Climb the chain of leftward "conj" heads to the first conjunct.
        anchor = token.head
        while anchor.dep == conj_label and anchor.head.i < anchor.i:
            anchor = anchor.head
        if anchor.dep not in np_deps:
            continue

        # A leading coordinating conjunction is left out of the chunk.
        start = leftmost.i + 1 if leftmost.pos == cc_pos else leftmost.i
        last_covered = token.i
        yield start, token.i + 1, np_label
|
|
|
|
|
|
# Maps the iterator name "noun_chunks" to the implementation defined in this module.
SYNTAX_ITERATORS = {
    "noun_chunks": noun_chunks,
}
|