You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

50 lines
870 B

4 days ago
from ...symbols import ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
# Collected exception entries, keyed by the exact surface string.
_exc = {}

# Hyphenated forms registered as a one-piece entry: the only piece's ORTH is
# the whole string itself (presumably so the tokenizer does not split at the
# hyphen -- confirm against the tokenizer's handling of exceptions).
# Each form is added in lowercase and with its first letter capitalized.
for contraction in (
    "a-e",
    "a-o",
    "a-i",
    "a-a",
    "co-a",
    "co-e",
    "co-i",
    "co-o",
    "da-a",
    "da-e",
    "da-i",
    "da-o",
    "pe-a",
    "pe-e",
    "pe-i",
    "pe-o",
):
    capitalized = contraction.capitalize()
    _exc[contraction] = [{ORTH: contraction}]
    _exc[capitalized] = [{ORTH: capitalized}]
# Prefix + prepositions with à (e.g. "sott'a-o").
# Every prefix/preposition combination is registered as a two-piece entry:
# the elided prefix (apostrophe included) followed by the preposition.
# Each prefix is covered with both the ASCII apostrophe (') and the
# typographic apostrophe (U+2019). The latter is spelled as an escape so it
# cannot be silently dropped by tools that mishandle non-ASCII characters,
# which would otherwise leave a bare prefix such as "s" and register bogus
# entries like "sa-a".
for prep in (
    "a-a",
    "a-e",
    "a-o",
    "a-i",
):
    for stem in ("sott", "contr", "ch", "s"):
        for apostrophe in ("'", "\u2019"):
            prefix = stem + apostrophe
            # Lowercase form first, then the form with a capitalized first letter.
            for prefix_orth in (prefix, prefix.capitalize()):
                _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
# Final exceptions table for this module: the result of update_exc applied to
# the shared BASE_EXCEPTIONS and the entries collected in _exc above.
# NOTE(review): update_exc lives in ...util and is not visible here; it
# presumably merges (and may validate) the two mappings -- confirm there.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)