You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

58 lines
1.5 KiB

from typing import Callable, Optional
from thinc.api import Model
from ...attrs import LANG
from ...language import BaseDefaults, Language
from ...lookups import Lookups
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lemmatizer import MacedonianLemmatizer
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class MacedonianDefaults(BaseDefaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: "mk"
# Optional: replace flags with custom functions, e.g. like_num()
lex_attr_getters.update(LEX_ATTRS)
# Merge base exceptions and custom tokenizer exceptions
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
class Macedonian(Language):
lang = "mk"
Defaults = MacedonianDefaults
@Macedonian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return MacedonianLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["Macedonian"]