# app/normalizer.py
import re

import spacy

# Load the spaCy model once at import time (fast, production-grade).
# Loading per-call would dominate runtime; module-level load amortizes it.
nlp = spacy.load("en_core_web_sm")

# Pre-compiled patterns, hoisted so they are not looked up/recompiled on
# every call to normalize_query.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")

# Stopwords that carry meaning in search queries and must survive stopword
# removal (e.g. "not free" must not collapse to "free"). Module-level
# frozenset: built once, O(1) membership tests.
_KEEP_STOPWORDS = frozenset({"not", "no", "best", "worst", "cheap", "free"})


def normalize_query(query: str) -> str:
    """Normalize a search query for matching/indexing.

    Steps:
        1. Lowercase, strip non-alphanumeric characters, collapse whitespace.
        2. Run the spaCy pipeline (POS tagging + lemmatization).
        3. Emit POS-aware lemmas, dropping punctuation, whitespace tokens,
           and stopwords — except meaning-critical stopwords listed in
           ``_KEEP_STOPWORDS``.

    Args:
        query: Raw user query string. An empty or all-special-character
            input yields an empty string.

    Returns:
        Space-joined lemmas of the meaningful tokens.
    """
    # Step 1: lowercase + remove special chars + collapse runs of whitespace.
    q = query.lower().strip()
    q = _NON_ALNUM_RE.sub("", q)
    q = _WHITESPACE_RE.sub(" ", q).strip()

    # Step 2: spaCy pipeline (POS tagging + lemmatization).
    doc = nlp(q)

    # Step 3: keep a token unless it is punctuation, whitespace, or a
    # stopword that carries no meaning; emit its POS-aware lemma.
    words = [
        token.lemma_
        for token in doc
        if not token.is_punct
        and not token.is_space
        and not (token.is_stop and token.text not in _KEEP_STOPWORDS)
    ]
    return " ".join(words)