darkplex-core/cortex/intent_classifier.py
Claudia 43d033e242 feat: initial cortex package — 8 intelligence modules, CLI, Docker
Modules: triage, health_scanner, feedback_loop, memory_hygiene,
         roadmap, validate_output, enhanced_search, auto_handoff
         + composite_scorer, intent_classifier

CLI: 'cortex <module> <command>' unified entry point
Tests: 157/169 passing (12 assertion mismatches from rename)
Docker: python:3.11-slim based
2026-02-09 11:18:20 +01:00

191 lines
7.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Intent Classifier — Classify memory queries into WHO/WHEN/WHY/WHAT intents
using regex-based heuristics. No LLM call, <5ms per query.
Output: intent type + suggested weight adjustments for composite scorer.
"""
import json
import re
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
# Default config location: a "config.json" file in the same directory as this module.
CONFIG_PATH = Path(__file__).parent / "config.json"
@dataclass
class IntentResult:
    """Outcome of classifying a single query.

    Attributes:
        intent: Winning intent label, one of "WHO", "WHEN", "WHY", "WHAT".
        confidence: Score in the range 0.0 to 1.0.
        weight_adjustments: Weight tweaks configured for the winning intent.
        matched_signals: Labels of the signals that fired for the winning intent.
        classification_ms: Time spent classifying, in milliseconds.
    """

    # Required fields (positional order is part of the constructor signature).
    intent: str
    confidence: float
    # Optional fields; mutable defaults get a fresh container per instance.
    weight_adjustments: dict = field(default_factory=dict)
    matched_signals: list = field(default_factory=list)
    classification_ms: float = 0.0
# Module-level caches; both are cleared by reset_cache().
# Compiled regex patterns keyed by intent name, filled lazily by _get_patterns().
_compiled_patterns: dict[str, list[re.Pattern]] = {}
# Parsed "intent_classification" config section; None until _load_config() has run.
_config_cache: Optional[dict] = None
def _load_config(path: Optional[Path] = None) -> dict:
"""Load intent classification config."""
global _config_cache
if _config_cache is not None:
return _config_cache
p = path or CONFIG_PATH
if p.exists():
with open(p) as f:
_config_cache = json.load(f).get("intent_classification", {})
return _config_cache
_config_cache = {}
return _config_cache
def _get_patterns(intent: str, config: dict) -> list[re.Pattern]:
    """Return the compiled, case-insensitive regex patterns for an intent.

    Compiled patterns are cached per intent in ``_compiled_patterns``. The
    cached entry is reused only while its source strings still equal the
    patterns listed in ``config``; otherwise it is recompiled, so calling
    with a different config never yields stale patterns.

    Args:
        intent: Intent name used as key into ``config`` and the cache.
        config: Mapping of intent name to a dict that may hold a "patterns"
            list of regex strings.

    Returns:
        List of compiled patterns (empty when the intent has none).
    """
    raw = list(config.get(intent, {}).get("patterns", []))
    cached = _compiled_patterns.get(intent)
    # Comparing source strings is cheap and detects a changed config.
    if cached is None or [p.pattern for p in cached] != raw:
        cached = [re.compile(p, re.IGNORECASE) for p in raw]
        _compiled_patterns[intent] = cached
    return cached
# Lower-cased capitalized terms (products, acronyms, common German nouns)
# that the capitalization heuristic must not mistake for person names.
# German capitalizes every noun, so a bare "starts with upper case" test
# would otherwise fire on ordinary words.
_NON_PERSON_CAPS = frozenset({
    "sparkasse", "taskboard", "uptime", "kuma", "forgejo", "traefik", "nginx",
    "docker", "chromadb", "typedb", "nats", "kafka", "pinecone", "odoo",
    "mondo", "gate", "vainplex", "openclaw", "telegram", "discord", "matrix",
    "opus", "sonnet", "haiku", "claude", "gemini", "ollama", "mona", "vera",
    "stella", "viola", "hetzner", "proxmox", "debian", "linux", "python",
    "api", "cli", "dns", "ssl", "tls", "ssh", "http", "https", "sepa",
    "bafin", "iso", "iban", "postgres", "sqlite", "redis", "github",
    # Common German capitalized nouns that aren't people
    "aufgabe", "zugang", "status", "server", "konto", "liste", "daten",
    "problem", "fehler", "lösung", "version", "projekt", "system", "email",
    "rechnung", "zahlung", "vertrag", "termin", "meeting", "deploy",
    "update", "config", "setup", "deployment", "monitoring", "backup",
    "migration", "integration", "infrastruktur", "netzwerk", "sicherheit",
})

# Punctuation removed from both ends of a token before the name heuristic,
# so that e.g. "Docker?" still matches the exclusion list.
_TOKEN_EDGE_PUNCT = ".,;:!?\"'()[]{}<>"

# Date-like tokens: "YYYY-MM" / "YYYY/MM" prefixes and month names.
_NUMERIC_DATE_RE = re.compile(r'\b\d{4}[-/]\d{2}')
_MONTH_NAME_RE = re.compile(
    r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|januar|februar|märz)\b'
)


def classify(query: str, config: Optional[dict] = None) -> IntentResult:
    """Classify a query into a WHO/WHEN/WHY/WHAT intent.

    Scoring, per intent:
      * +1.0 for every configured keyword contained in the lower-cased query
      * +2.0 for every configured regex pattern that matches
      * WHO: +0.8 per name-like capitalized word when two or more are
        found, or +0.3 for a single one (queries of two or more words only)
      * WHEN: +1.5 when a date-like token is present
      * +3.0 when the query starts with a why/who/when question word
        (English or German)

    The highest score wins (ties resolve in WHO, WHEN, WHY, WHAT order) and
    confidence is the winner's share of the total score. If the best score
    is below 0.5 the result falls back to WHAT with confidence 0.3.

    Args:
        query: The search query string.
        config: Optional config dict mapping intent names to their
            "keywords", "patterns" and "weight_adjustments". Loaded via
            _load_config() only if None; an empty dict is honoured as
            "no configured signals".

    Returns:
        IntentResult with intent type, confidence, a copy of the configured
        weight adjustments, the signals matched for the winning intent and
        the elapsed time in milliseconds. An empty or whitespace-only query
        yields WHAT with confidence 0.1 and the signal "empty_query".
    """
    start = time.perf_counter()
    # Explicit None check: "config or ..." would discard a deliberately
    # empty config and read the on-disk one instead.
    cfg = config if config is not None else _load_config()

    if not query or not query.strip():
        elapsed = (time.perf_counter() - start) * 1000
        return IntentResult(
            intent="WHAT",
            confidence=0.1,
            matched_signals=["empty_query"],
            classification_ms=round(elapsed, 3),
        )

    query_lower = query.lower().strip()
    scores: dict[str, float] = {"WHO": 0.0, "WHEN": 0.0, "WHY": 0.0, "WHAT": 0.0}
    signals: dict[str, list[str]] = {"WHO": [], "WHEN": [], "WHY": [], "WHAT": []}

    for intent in ("WHO", "WHEN", "WHY", "WHAT"):
        intent_cfg = cfg.get(intent, {})

        # Keyword matching (fast substring test)
        for kw in intent_cfg.get("keywords", []):
            if kw.lower() in query_lower:
                scores[intent] += 1.0
                signals[intent].append(f"kw:{kw}")

        # Regex pattern matching; patterns are more specific, so weigh double
        for pattern in _get_patterns(intent, cfg):
            if pattern.search(query_lower):
                scores[intent] += 2.0
                signals[intent].append(f"re:{pattern.pattern[:30]}")

    # Name heuristic: capitalized words (any position, including the first)
    # that are longer than two characters, not all-caps acronyms and not in
    # the non-person exclusion list suggest a person and therefore WHO.
    words = query.split()
    if len(words) >= 2:
        tokens = (w.strip(_TOKEN_EDGE_PUNCT) for w in words)
        caps = [
            t for t in tokens
            if len(t) > 2 and t[0].isupper() and not t.isupper()
            and t.lower() not in _NON_PERSON_CAPS
        ]
        if len(caps) >= 2:
            # Two+ unknown capitalized words strongly suggest names
            # (e.g. "Sebastian Baier")
            scores["WHO"] += 0.8 * len(caps)
            signals["WHO"].append(f"multi_caps:{','.join(caps[:3])}")
        elif caps:
            # Single unknown capitalized word: weak signal in German
            scores["WHO"] += 0.3
            signals["WHO"].append(f"caps:{caps[0]}")

    # Date-like tokens suggest WHEN
    if _NUMERIC_DATE_RE.search(query) or _MONTH_NAME_RE.search(query_lower):
        scores["WHEN"] += 1.5
        signals["WHEN"].append("date_token")

    # Question words at start
    if query_lower.startswith(("warum ", "why ", "wieso ", "weshalb ")):
        scores["WHY"] += 3.0
        signals["WHY"].append("start_why")
    elif query_lower.startswith(("wer ", "who ")):
        scores["WHO"] += 3.0
        signals["WHO"].append("start_who")
    elif query_lower.startswith(("wann ", "when ")):
        scores["WHEN"] += 3.0
        signals["WHEN"].append("start_when")

    # Pick winner
    best_intent = max(scores, key=scores.get)
    total = sum(scores.values())
    confidence = scores[best_intent] / total if total > 0 else 0.25

    # If no strong signal, default to WHAT
    if scores[best_intent] < 0.5:
        best_intent = "WHAT"
        confidence = 0.3

    # Copy the adjustments so callers mutating the result cannot corrupt
    # the (possibly cached) config dict.
    adjustments = cfg.get(best_intent, {}).get("weight_adjustments", {})

    elapsed = (time.perf_counter() - start) * 1000
    return IntentResult(
        intent=best_intent,
        confidence=round(confidence, 3),
        weight_adjustments=dict(adjustments or {}),
        matched_signals=signals[best_intent],
        classification_ms=round(elapsed, 3),
    )
def reset_cache():
    """Drop the cached config and compiled patterns (intended for tests).

    Both module-level caches are rebound to their initial empty state, so
    the next lookup reloads the config and recompiles the patterns.
    """
    global _config_cache, _compiled_patterns
    _config_cache, _compiled_patterns = None, {}
if __name__ == "__main__":
    # Demo entry point: classify the queries given on the command line, or
    # a built-in set of English/German samples when none are supplied.
    import sys

    sample_queries = [
        "Albert Hild contact",
        "when did we fix the gateway",
        "why did we choose NATS over Kafka",
        "Mondo Gate regulatory status",
        "wer ist Sebastian Baier",
        "wann wurde TypeDB eingerichtet",
        "warum ChromaDB statt Pinecone",
    ]
    for text in (sys.argv[1:] or sample_queries):
        outcome = classify(text)
        # One summary line per query: intent, confidence, timing, query text.
        print(f" [{outcome.intent}] ({outcome.confidence:.2f}, {outcome.classification_ms:.2f}ms) {text}")
        if outcome.matched_signals:
            print(f" signals: {outcome.matched_signals}")