You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
38 lines
1.1 KiB
38 lines
1.1 KiB
|
4 days ago
|
# app/normalizer.py
|
||
|
|
import re
|
||
|
|
import spacy
|
||
|
|
|
||
|
|
# Shared spaCy pipeline, loaded once at module import time.
# en_core_web_sm is the small English model: fast, production-grade,
# and sufficient for POS tagging + lemmatization in normalize_query().
# NOTE: import fails fast if the model package is not installed
# (python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")
|
def normalize_query(query: str) -> str:
    """
    Production-grade query normalization using spaCy.

    Steps:
        1. Lowercase, strip non-alphanumeric characters, collapse whitespace.
        2. Run the spaCy pipeline for POS-aware lemmatization.
        3. Drop stopwords, except a small allowlist of meaning-critical
           words (negations and qualifiers such as "not" or "cheap").

    Args:
        query: Raw user query string.

    Returns:
        A space-separated string of lemmas. Returns "" for an empty or
        all-punctuation query.
    """
    # Step 1: lowercase + remove special chars + collapse whitespace runs.
    q = query.lower().strip()
    q = re.sub(r"[^a-z0-9\s]", "", q)
    q = re.sub(r"\s+", " ", q).strip()

    # Fast path: nothing left after cleaning — avoid invoking the
    # (comparatively expensive) spaCy pipeline on an empty string.
    if not q:
        return ""

    # Step 2: spaCy pipeline (tokenization, POS tagging, lemmatization).
    doc = nlp(q)

    # Step 3: keep POS-aware lemmas; drop stopwords UNLESS they carry
    # meaning for search intent (negations / qualifiers).
    keep_stopwords = frozenset({"not", "no", "best", "worst", "cheap", "free"})
    words = [
        token.lemma_
        for token in doc
        # Punctuation/space tokens should not survive Step 1's regex
        # cleanup, but the checks are cheap and keep the filter
        # self-contained if Step 1 ever changes.
        if not token.is_punct
        and not token.is_space
        and (not token.is_stop or token.text in keep_stopwords)
    ]
    return " ".join(words)
|