|
|
|
|
# app/bootstrap.py
|
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
import time
|
|
|
|
|
import os, json, re, requests
|
|
|
|
|
from sqlalchemy import create_engine, text
|
|
|
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
|
import requests as http_requests
|
|
|
|
|
import google.auth
|
|
|
|
|
import google.auth.transport.requests
|
|
|
|
|
from app.vertex_client import get_access_token
|
|
|
|
|
|
|
|
|
|
# Load environment variables from a local .env file, if present.
load_dotenv()

# Postgres connection string. NOTE(review): assumed to always be set —
# create_engine(None) would raise at import time; confirm deployment env.
DATABASE_URL = os.getenv("DATABASE_URL")

engine = create_engine(DATABASE_URL)

# Shared sentence-embedding model, reused for domain/attribute embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Google Cloud project and region used to build the Vertex AI endpoint URL.
PROJECT_ID = os.getenv("GOOGLE_PROJECT_ID")
LOCATION = "us-central1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Closed set of category names Gemini is allowed to return; validate_schema
# drops any category outside this set.
VALID_CATEGORIES = {
    "Performance", "Financial", "Risk", "Maintenance", "Benefits",
    "Time", "Requirements", "Scalability", "Alternatives", "Usability",
    "Security", "Reliability", "Support", "Sustainability"
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_json_response(raw: str) -> str:
    """Strip markdown fences/whitespace from *raw* and return the first JSON object.

    Raises:
        ValueError: If no ``{...}`` object can be found in the text.
    """
    # Drop any markdown code fences (```json ... ```), then surrounding whitespace.
    without_fences = re.sub(r"```\s*", "", re.sub(r"```json\s*", "", raw)).strip()

    # Greedily grab the first brace-delimited object, spanning newlines.
    found = re.search(r"\{.*\}", without_fences, re.DOTALL)
    if found is None:
        raise ValueError("No JSON object found in response")
    return found.group(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def validate_schema(data: dict, query: str) -> dict:
    """Validate and clean a Gemini ``{category: [attributes]}`` mapping.

    Keeps only categories present in VALID_CATEGORIES, drops empty or
    non-string attributes and attributes containing terms irrelevant to the
    query's topic, and caps each category at 12 attributes.

    Args:
        data: Parsed JSON from Gemini, expected ``{category: [attr, ...]}``.
        query: The user's original decision query (used for topic filtering).

    Returns:
        Cleaned ``{category: [attributes]}`` dict.

    Raises:
        ValueError: If no valid categories remain after validation.
    """
    cleaned = {}
    query_lower = query.lower()

    # Topic-aware relevance: attribute terms that are clearly off-topic
    # when the query mentions the given topic.
    IRRELEVANT_COMBOS = {
        "shoes": ["processor", "ram", "gpu", "cpu", "battery", "charging"],
        "food": ["processor", "gpu", "engine", "torque", "horsepower"],
        "college": ["torque", "engine", "gpu", "charging speed"],
    }

    # Collect blocked terms for every topic mentioned in the query.
    blocked = []
    for topic, terms in IRRELEVANT_COMBOS.items():
        if topic in query_lower:
            blocked.extend(terms)

    for category, attributes in data.items():
        cat = category.strip().title()
        if cat not in VALID_CATEGORIES:
            print(f"Skipping unknown category: {cat}")
            continue

        # Guard: Gemini occasionally returns a bare string instead of a list.
        # Iterating a string would yield single characters and silently store
        # one-letter "attributes", so reject the category's payload instead.
        if not isinstance(attributes, (list, tuple)):
            print(f"Skipping malformed attributes for category: {cat}")
            continue

        attrs = []
        for a in attributes:
            if not isinstance(a, str) or not a.strip():
                continue
            # ✅ Reject attributes containing blocked terms
            a_lower = a.lower()
            if any(b in a_lower for b in blocked):
                print(f"❌ Rejected irrelevant attribute: {a}")
                continue
            attrs.append(a.strip())

        attrs = attrs[:12]  # hard cap per category
        if attrs:
            cleaned[cat] = attrs

    if not cleaned:
        raise ValueError("No valid categories found after validation")

    return cleaned
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_gemini(query: str) -> dict:
    """Ask Gemini (Vertex AI) for decision-evaluation criteria for *query*.

    Posts the prompt to the gemini-2.5-flash-lite model, backing off on
    HTTP 429 and retrying (up to 3 attempts) when the response body is
    malformed or missing the expected candidates structure.

    Returns:
        Validated ``{category: [attributes]}`` dict (see validate_schema).

    Raises:
        RuntimeError: On a non-retryable API error, or after 3 failed attempts.
    """
    prompt = f"""You are a world-class decision analysis expert.

Task: Generate a COMPREHENSIVE list of evaluation criteria for: "{query}"

MANDATORY RULES:
- Select 6-8 categories from the allowed list
- Generate EXACTLY 8-10 attributes per category — this is mandatory
- Each attribute must be SPECIFIC and MEASURABLE for "{query}"
- First 3 attributes in EVERY category must be the MOST SEARCHED specs
- For tech products: always start with Processor, RAM, Battery, Display, Camera
- For vehicles: always start with Engine, Mileage, Price, Safety
- Order strictly by: most searched → most compared → most reviewed
- Use concise names (2-5 words max)
- DO NOT generate less than 8 attributes per category

Allowed category keys:
Performance, Financial, Risk, Maintenance, Benefits, Time, Requirements,
Scalability, Alternatives, Usability, Security, Reliability, Support, Sustainability

Example of CORRECT format with enough attributes:
{{"Performance":["Engine Power","Torque Output","Top Speed","0-100 kmph Time","Fuel Efficiency","Gear Smoothness","Braking Distance","Tyre Grip","Suspension Quality","NVH Levels"],"Financial":["Ex-showroom Price","On-road Price","EMI Options","Insurance Cost","Fuel Cost Monthly","Resale Value","Maintenance Cost","Road Tax","Accessories Cost","Total Ownership Cost"],"Reliability":["Engine Reliability","Electrical Issues","Common Problems","Long Term Durability","Brand Track Record","Owner Satisfaction","Recall History","Service Quality","Spare Parts Life","Warranty Claims"]}}

Now generate for: "{query}"
Return ONLY valid JSON. No markdown. No explanation. Minimum 8 attributes per category."""

    url = f"https://{LOCATION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/gemini-2.5-flash-lite:generateContent"

    for attempt in range(3):
        try:
            res = http_requests.post(
                url,
                headers={
                    "Authorization": f"Bearer {get_access_token()}",
                    "Content-Type": "application/json"
                },
                json={
                    "contents": [{"role": "user", "parts": [{"text": prompt}]}],
                    "generationConfig": {
                        "temperature": 0.1,  # lower = more accurate, less random
                        "maxOutputTokens": 1024
                    }
                },
                timeout=30
            )

            if res.status_code == 429:
                wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s
                print(f"Rate limited, waiting {wait}s... (attempt {attempt + 1})")
                time.sleep(wait)
                continue

            print("Status Code:", res.status_code)
            if res.status_code != 200:
                print("❌ ERROR RESPONSE:")
                print(res.text)
                raise RuntimeError("Vertex API failed")

            # Parse the body once and reuse it (previously res.json() ran twice).
            data_json = res.json()
            print("✅ RAW RESPONSE:", str(data_json)[:500])

            raw = data_json["candidates"][0]["content"]["parts"][0]["text"]
            clean = clean_json_response(raw)
            data = json.loads(clean)
            validated = validate_schema(data, query)
            print(f"Gemini generated {sum(len(v) for v in validated.values())} attributes across {len(validated)} categories")
            return validated

        # KeyError/IndexError cover responses missing the expected
        # candidates/content/parts structure (e.g. safety-blocked output) —
        # previously these escaped the retry loop entirely.
        except (json.JSONDecodeError, ValueError, KeyError, IndexError) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt == 2:
                raise RuntimeError(f"Gemini failed after 3 attempts: {e}")

    raise RuntimeError("Gemini failed after 3 attempts: rate limit")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def bootstrap_domain(query: str):
    """Generate, validate, and persist evaluation criteria for *query*.

    Calls Gemini (with retries), applies a minimum-quality gate, skips
    near-duplicate domains (pgvector L2 distance < 0.15), then stores the
    domain, its dimension groups, and attribute embeddings in one
    transaction.

    Raises:
        RuntimeError: If Gemini fails after all retries or returns no data.
        ValueError: If fewer than 10 attributes survive validation.
    """
    data = None
    for attempt in range(3):
        try:
            data = call_gemini(query)
            print(f"Gemini response validated on attempt {attempt + 1}")
            break
        except (json.JSONDecodeError, ValueError) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt == 2:
                raise RuntimeError(f"Gemini failed after 3 attempts: {e}")

    if not data:
        raise RuntimeError("No data generated")

    # ✅ Quality gate — reject low quality bootstraps
    total_attrs = sum(len(v) for v in data.values())
    if total_attrs < 10:
        raise ValueError(f"Quality gate failed: only {total_attrs} attributes generated")

    # ✅ Duplicate domain detection — check similarity before storing.
    # Reuse the module-level SentenceTransformer: reloading
    # "all-MiniLM-L6-v2" on every call (as before) is slow and wastes memory.
    domain_embedding = model.encode(query).tolist()

    with engine.begin() as conn:
        # Check if a very similar domain already exists
        existing = conn.execute(text("""
            SELECT name, embedding <-> CAST(:emb AS vector) AS distance
            FROM domains
            ORDER BY distance
            LIMIT 1
        """), {"emb": str(domain_embedding)}).fetchone()

        if existing and existing.distance < 0.15:
            print(f"⚠️ Similar domain already exists: '{existing.name}' (distance: {existing.distance:.3f}) — skipping bootstrap")
            return  # ✅ Don't store duplicate

        # Store domain
        domain_id = conn.execute(text("""
            INSERT INTO domains (name, embedding)
            VALUES (:n, CAST(:e AS vector))
            ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
            RETURNING id
        """), {"n": query, "e": str(domain_embedding)}).scalar()

        for group, attrs in data.items():
            group_id = conn.execute(text("""
                INSERT INTO dimension_groups (domain_id, name)
                VALUES (:d, :g)
                ON CONFLICT (domain_id, name) DO UPDATE SET name = EXCLUDED.name
                RETURNING id
            """), {"d": domain_id, "g": group}).scalar()

            for attr in attrs:
                emb = model.encode(attr).tolist()
                conn.execute(text("""
                    INSERT INTO attributes (group_id, name, embedding)
                    VALUES (:gid, :name, CAST(:emb AS vector))
                    ON CONFLICT (group_id, name) DO NOTHING
                """), {"gid": group_id, "name": attr, "emb": str(emb)})

    print(f"✅ Domain bootstrapped: {query} ({total_attrs} attributes)")
|