You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

55 lines
1.3 KiB

from typing import Callable, Optional
from thinc.api import Model
from ...language import BaseDefaults, Language
from .lemmatizer import FrenchLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKEN_MATCH, TOKENIZER_EXCEPTIONS
class FrenchDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
stop_words = STOP_WORDS
class French(Language):
lang = "fr"
Defaults = FrenchDefaults
@French.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "rule",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool,
scorer: Optional[Callable],
):
return FrenchLemmatizer(
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
__all__ = ["French"]