You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

214 lines
5.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import unicodedata
from typing import AbstractSet, Set

from .. import attrs
from .tokenizer_exceptions import URL_MATCH
# Prefix matcher for e-mail-like strings; like_email() below wraps it.
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match

# Pipe-separated list of known top-level domains; like_url() uses the
# resulting set to accept bare hosts such as "example.com".
_tld_data = (
    "com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
    "name|pro|tel|travel|xyz|icu|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|"
    "ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|"
    "cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|"
    "ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|"
    "gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|"
    "je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|"
    "lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|"
    "na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|"
    "pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|"
    "ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|"
    "ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw"
)
_tlds = set(_tld_data.split("|"))
def is_punct(text: str) -> bool:
    """True if every character is Unicode punctuation (category ``P*``).

    Empty strings vacuously return True.
    """
    return all(unicodedata.category(ch).startswith("P") for ch in text)
def is_ascii(text: str) -> bool:
    """True if no character has a code point above 127.

    Empty strings vacuously return True.
    """
    return all(ord(ch) < 128 for ch in text)
def like_num(text: str) -> bool:
    """True if the token resembles a number: digits with optional leading
    sign and comma/period separators ("10,000.5"), or a simple fraction
    ("3/4")."""
    # Drop one leading sign character if present.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # can be overwritten by lang with list of number words
    stripped = text.replace(",", "").replace(".", "")
    if stripped.isdigit():
        return True
    if stripped.count("/") == 1:
        numerator, _, denominator = stripped.partition("/")
        return numerator.isdigit() and denominator.isdigit()
    return False
def is_bracket(text: str) -> bool:
    """True if the token is exactly one round/square/curly/angle bracket."""
    return text in {"(", ")", "[", "]", "{", "}", "<", ">"}
def is_quote(text: str) -> bool:
    """True if the token is one of the recognised quotation marks."""
    # fmt: off
    # NOTE(review): this tuple also contains typographic Unicode quote
    # characters (flagged by the file's ambiguous-Unicode warning) that may
    # not render in every editor — verify the entries before editing them.
    quotes = ('"', "'", "`", "«", "»", "", "", "", "", "", "", "", "", "", "", "", "", "''", "``")
    # fmt: on
    return text in quotes
def is_left_punct(text: str) -> bool:
    """True if the token is an opening bracket or opening quote mark."""
    # fmt: off
    # NOTE(review): this tuple also contains typographic Unicode quote
    # characters that may not render in every editor — verify before editing.
    left_punct = ("(", "[", "{", "<", '"', "'", "«", "", "", "", "", "", "", "", "", "``")
    # fmt: on
    return text in left_punct
def is_right_punct(text: str) -> bool:
    """True if the token is a closing bracket or closing quote mark."""
    # NOTE(review): this tuple also contains typographic Unicode quote
    # characters that may not render in every editor — verify before editing.
    right_punct = (")", "]", "}", ">", '"', "'", "»", "", "", "", "", "''")
    return text in right_punct
def is_currency(text: str) -> bool:
    """True if every character is a currency symbol (Unicode category Sc)."""
    # can be overwritten by lang with list of currency words, e.g. dollar, euro
    return all(unicodedata.category(ch) == "Sc" for ch in text)
def like_email(text: str) -> bool:
    """True if the module-level e-mail pattern matches at the start of *text*."""
    return _like_email(text) is not None
def like_url(text: str) -> bool:
    """Check whether a token functions like a URL in text.

    Accepts explicit schemes ("http://", "https://"), "www." prefixes,
    bare hosts whose TLD is in ``_tlds`` (e.g. "example.com"), and anything
    the tokenizer's URL_MATCH regex accepts. Rejects e-mail-like strings
    and tokens that start or end with a period.
    """
    # We're looking for things that function in text like URLs. So, valid URL
    # or not, anything they say http:// is going to be good.
    if not text:
        # Guard: the text[0]/text[-1] checks below would raise IndexError.
        return False
    if text.startswith(("http://", "https://")):
        return True
    elif text.startswith("www.") and len(text) >= 5:
        return True
    if text[0] == "." or text[-1] == ".":
        return False
    if "@" in text:
        # prevent matches on e-mail addresses check after splitting the text
        # to still allow URLs containing an '@' character (see #1715)
        return False
    # A bare host needs at least one dot to have a TLD at all.
    if "." not in text:
        return False
    # Candidate TLD: text after the last ".", before any ":port" suffix.
    tld = text.rsplit(".", 1)[1].split(":", 1)[0]
    if tld.endswith("/"):
        return True
    if tld.isalpha() and tld in _tlds:
        return True
    if URL_MATCH(text):
        return True
    return False
def word_shape(text: str) -> str:
    """Map a token to its orthographic shape: uppercase letters become "X",
    lowercase letters "x", digits "d"; other characters pass through.
    Runs of the same shape character are capped at four, and tokens of
    100+ characters collapse to "LONG"."""
    if len(text) >= 100:
        return "LONG"
    pieces = []
    prev_shape = ""
    run_length = 0
    for ch in text:
        if ch.isalpha():
            mapped = "X" if ch.isupper() else "x"
        elif ch.isdigit():
            mapped = "d"
        else:
            mapped = ch
        if mapped == prev_shape:
            run_length += 1
        else:
            run_length = 0
            prev_shape = mapped
        # Only keep the first four characters of any run.
        if run_length < 4:
            pieces.append(mapped)
    return "".join(pieces)
def lower(string: str) -> str:
    """Return the lowercased form of the token (also used for NORM)."""
    return string.lower()
def prefix(string: str) -> str:
    """Return the first character of the token ("" for an empty string)."""
    # [:1] instead of [0]: avoids IndexError on empty input and mirrors
    # the slice style used by suffix().
    return string[:1]
def suffix(string: str) -> str:
    """Return the last three characters (the whole string if shorter)."""
    return string[-3:]
def is_alpha(string: str) -> bool:
    """True if the token is non-empty and entirely alphabetic."""
    return string.isalpha()
def is_digit(string: str) -> bool:
    """True if the token is non-empty and entirely digits."""
    return string.isdigit()
def is_lower(string: str) -> bool:
    """True if all cased characters are lowercase and at least one exists."""
    return string.islower()
def is_space(string: str) -> bool:
    """True if the token is non-empty and entirely whitespace."""
    return string.isspace()
def is_title(string: str) -> bool:
    """True if the token is titlecased (per ``str.istitle``)."""
    return string.istitle()
def is_upper(string: str) -> bool:
    """True if all cased characters are uppercase and at least one exists."""
    return string.isupper()
def is_stop(string: str, stops: AbstractSet[str] = frozenset()) -> bool:
    """True if the lowercased token is in *stops*.

    The default is an immutable empty ``frozenset`` instead of the
    mutable-default ``set()`` pitfall; the parameter is only read, so
    membership semantics are unchanged and any set-like argument works.
    """
    return string.lower() in stops
def get_lang(text: str, lang: str = "") -> str:
    """Return the language code baked in via partial application.

    *text* is ignored; the function exists so the lang code can be bound
    automatically while the resulting callable remains picklable.
    """
    return lang
# Default lexical attribute getters keyed by attribute ID. Individual
# languages can override entries (e.g. like_num, is_stop — see the
# "can be overwritten by lang" notes above) with their own callables.
LEX_ATTRS = {
    attrs.LOWER: lower,
    attrs.NORM: lower,  # NORM defaults to the lowercase form
    attrs.PREFIX: prefix,
    attrs.SUFFIX: suffix,
    attrs.IS_ALPHA: is_alpha,
    attrs.IS_DIGIT: is_digit,
    attrs.IS_LOWER: is_lower,
    attrs.IS_SPACE: is_space,
    attrs.IS_TITLE: is_title,
    attrs.IS_UPPER: is_upper,
    attrs.IS_STOP: is_stop,
    attrs.LIKE_EMAIL: like_email,
    attrs.LIKE_NUM: like_num,
    attrs.IS_PUNCT: is_punct,
    attrs.IS_ASCII: is_ascii,
    attrs.SHAPE: word_shape,
    attrs.IS_BRACKET: is_bracket,
    attrs.IS_QUOTE: is_quote,
    attrs.IS_LEFT_PUNCT: is_left_punct,
    attrs.IS_RIGHT_PUNCT: is_right_punct,
    attrs.IS_CURRENCY: is_currency,
    attrs.LIKE_URL: like_url,
}