You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

38 lines
1.1 KiB

4 days ago
# app/normalizer.py
import re
import unicodedata

import spacy
# Load the spaCy "en_core_web_sm" pipeline once, at import time, so that every
# call to normalize_query() reuses the same module-level `nlp` object.
# NOTE(review): presumably this fails at import if the model package is not
# installed — confirm the deployment installs it.
nlp = spacy.load("en_core_web_sm")
# Stopwords that change the meaning of a query and must survive stopword
# removal.
_KEEP_STOPWORDS = frozenset({"not", "no", "best", "worst", "cheap", "free"})

# Anything that is not a lowercase ASCII letter, a digit, or whitespace.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_query(query: str) -> str:
    """Return a cleaned, lemmatized, stopword-filtered form of *query*.

    Steps:
    1. Unicode-fold (NFKD), lowercase, drop every character that is not an
       ASCII letter, digit, or whitespace, and collapse whitespace runs.
    2. Run the module-level spaCy pipeline ``nlp`` over the cleaned text.
    3. Keep the lemma of every token that is not punctuation, not
       whitespace, and not a stopword -- except stopwords listed in
       ``_KEEP_STOPWORDS``, which are retained.

    Args:
        query: Raw query text.

    Returns:
        The surviving lemmas joined by single spaces; ``""`` when nothing
        survives cleaning or filtering.
    """
    # NFKD splits accented letters into base letter + combining mark (and
    # maps compatibility forms such as full-width letters to ASCII), so the
    # ASCII filter below strips only the mark: "café" -> "cafe", not "caf".
    folded = unicodedata.normalize("NFKD", query).lower()

    # NOTE(review): apostrophes are removed here, before tokenization, so a
    # contraction such as "don't" reaches spaCy as "dont" -- confirm that
    # negation survives as intended.
    cleaned = _NON_ALNUM_RE.sub("", folded)
    cleaned = _WHITESPACE_RE.sub(" ", cleaned).strip()
    if not cleaned:
        # Nothing left to analyse; skip the pipeline call entirely.
        return ""

    doc = nlp(cleaned)
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_stop and token.text not in _KEEP_STOPWORDS)
        and not token.is_punct
        and not token.is_space
    )