# NOTE: scraped page metadata preserved as comments (not part of the script):
# Some checks failed — Tests / test (push) Failing after 2s
# Commit: Merged all unique darkplex-core modules into cortex: intelligence/
# subfolder (anticipator, collective, shared_memory, knowledge_cleanup,
# temporal, llm_extractor, loop); governance/ subfolder (policy engine, risk
# scorer, evidence, enforcer, report generator); entity_manager.py,
# knowledge_extractor.py. Fixed bare 'from intelligence.' imports to
# 'from cortex.intelligence.'. Added 'darkplex' CLI alias alongside 'cortex'.
# Package renamed to darkplex-core v0.2.0. 405 tests passing (was 234).
# 14 new test files covering all merged modules.
# File: 371 lines, 14 KiB, Python, executable file
#!/usr/bin/env python3
"""
Entity Manager — File-based knowledge graph for entity extraction and relationship mapping.
Part of Level 4.4 AGI Roadmap.

Usage:
    entity-manager.py bootstrap              — Bootstrap from life/areas/
    entity-manager.py extract "text"         — Extract entities from text
    entity-manager.py relate "A" "B" [type]  — Create/update relationship
    entity-manager.py query "entity"         — Query relationships for entity
    entity-manager.py graph                  — Output relationship summary
"""
import sys
import os
import json
import re
import time
from pathlib import Path

# On-disk knowledge graph: two flat JSON files under ~/.cortex/knowledge/.
KNOWLEDGE_DIR = Path.home() / ".cortex" / "knowledge"
# Entity registry: {name: {"type": ..., "source": ..., ...}}
ENTITIES_FILE = KNOWLEDGE_DIR / "entities.json"
# Undirected edges keyed "a::b" (sorted): {"a", "b", "types", "count", ...}
RELATIONSHIPS_FILE = KNOWLEDGE_DIR / "relationships.json"
# Per-entity folders (people/companies/projects) used to seed the registry.
LIFE_AREAS = Path.home() / "life" / "areas"
|
# Common words to skip during entity extraction
|
|
STOP_WORDS = {
|
|
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
|
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
|
"should", "may", "might", "shall", "can", "need", "must", "i", "you",
|
|
"he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
|
|
"my", "your", "his", "its", "our", "their", "this", "that", "these",
|
|
"those", "what", "which", "who", "whom", "where", "when", "why", "how",
|
|
"all", "each", "every", "both", "few", "more", "most", "other", "some",
|
|
"such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
|
|
"very", "just", "because", "as", "until", "while", "of", "at", "by",
|
|
"for", "with", "about", "against", "between", "through", "during",
|
|
"before", "after", "above", "below", "to", "from", "up", "down", "in",
|
|
"out", "on", "off", "over", "under", "again", "further", "then", "once",
|
|
"here", "there", "and", "but", "or", "if", "then", "else", "also",
|
|
"system", "cron", "heartbeat", "ok", "error", "warning", "info",
|
|
"message", "session", "agent", "main", "matrix", "telegram",
|
|
"read", "write", "check", "run", "send", "get", "set", "let", "see",
|
|
"know", "think", "want", "like", "make", "take", "come", "go", "say",
|
|
"tell", "ask", "try", "use", "find", "give", "new", "good", "first",
|
|
"last", "long", "great", "little", "right", "big", "high", "old",
|
|
"different", "small", "large", "next", "early", "young", "important",
|
|
"public", "bad", "sure", "sure", "yes", "no", "maybe", "ok", "okay",
|
|
"thanks", "thank", "please", "hello", "hi", "hey", "bye", "well",
|
|
"now", "today", "tomorrow", "yesterday", "monday", "tuesday",
|
|
"wednesday", "thursday", "friday", "saturday", "sunday",
|
|
"january", "february", "march", "april", "may", "june", "july",
|
|
"august", "september", "october", "november", "december",
|
|
"still", "already", "currently", "actually", "really", "right",
|
|
"look", "keep", "going", "based", "done", "work", "working",
|
|
}
|
|
|
|
|
|
def normalize(name):
|
|
"""Normalize entity name."""
|
|
return name.strip().lower().replace("_", "-")
|
|
|
|
|
|
def load_json(path):
|
|
"""Load JSON file, return empty dict if missing/invalid."""
|
|
try:
|
|
with open(path) as f:
|
|
return json.load(f)
|
|
except (FileNotFoundError, json.JSONDecodeError):
|
|
return {}
|
|
|
|
|
|
def save_json(path, data):
|
|
"""Save JSON file, creating directories as needed."""
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(path, "w") as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
|
|
|
|
def load_known_entities():
|
|
"""Load known entity names from life/areas/ and entities.json."""
|
|
known = {}
|
|
# From life/areas
|
|
for category in ["people", "companies", "projects"]:
|
|
area_dir = LIFE_AREAS / category
|
|
if not area_dir.exists():
|
|
continue
|
|
etype = category.rstrip("s") # person, company, project
|
|
if category == "people":
|
|
etype = "person"
|
|
for entry in area_dir.iterdir():
|
|
if entry.is_dir():
|
|
name = normalize(entry.name)
|
|
known[name] = {"type": etype, "source": f"life/areas/{category}"}
|
|
# From entities.json
|
|
entities = load_json(ENTITIES_FILE)
|
|
for name, info in entities.items():
|
|
if name not in known:
|
|
known[name] = info
|
|
return known
|
|
|
|
|
|
def extract_entities(text, known=None):
|
|
"""Extract entities from text using heuristics and known entity matching."""
|
|
if known is None:
|
|
known = load_known_entities()
|
|
|
|
found = {}
|
|
text_lower = text.lower()
|
|
|
|
# 1. Match known entities
|
|
for name, info in known.items():
|
|
# Check for name or slug in text
|
|
variants = [name, name.replace("-", " "), name.replace("-", "")]
|
|
for v in variants:
|
|
if v in text_lower and len(v) > 2:
|
|
found[name] = {"type": info.get("type", "unknown"), "match": "known"}
|
|
break
|
|
|
|
# 2. Extract @mentions
|
|
for m in re.finditer(r"@(\w+)", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS and len(name) > 2:
|
|
found[name] = {"type": "person", "match": "mention"}
|
|
|
|
# 3. Extract capitalized multi-word names (likely proper nouns)
|
|
for m in re.finditer(r"\b([A-Z][a-zäöüß]+(?:\s+[A-Z][a-zäöüß]+)+)\b", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS and len(name) > 3:
|
|
# Heuristic: if 2-3 words, likely person; if more, likely org/topic
|
|
words = name.split()
|
|
etype = "person" if len(words) <= 3 else "topic"
|
|
found[name] = {"type": etype, "match": "capitalized"}
|
|
|
|
# 4. Extract standalone capitalized words (potential entities)
|
|
for m in re.finditer(r"\b([A-Z][a-zäöüß]{2,})\b", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS:
|
|
found[name] = {"type": "unknown", "match": "capitalized_single"}
|
|
|
|
# 5. Extract ALL-CAPS acronyms (likely companies/products)
|
|
for m in re.finditer(r"\b([A-Z]{2,6})\b", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS and name not in {
|
|
"ok", "am", "pm", "gmt", "utc", "url", "api", "cli", "ssh", "dns",
|
|
"http", "https", "json", "html", "css", "js", "ts", "py", "md",
|
|
"id", "ui", "ux", "io", "os", "ip", "gb", "mb", "kb", "tb",
|
|
}:
|
|
found[name] = {"type": "organization", "match": "acronym"}
|
|
|
|
return found
|
|
|
|
|
|
def cmd_bootstrap():
|
|
"""Bootstrap entities from life/areas/."""
|
|
entities = load_json(ENTITIES_FILE)
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
count = 0
|
|
|
|
for category in ["people", "companies"]:
|
|
area_dir = LIFE_AREAS / category
|
|
if not area_dir.exists():
|
|
continue
|
|
etype = "person" if category == "people" else "company"
|
|
for entry in sorted(area_dir.iterdir()):
|
|
if not entry.is_dir():
|
|
continue
|
|
name = normalize(entry.name)
|
|
if name in entities:
|
|
continue
|
|
|
|
info = {"type": etype, "source": f"life/areas/{category}", "bootstrapped": True}
|
|
|
|
# Try to extract extra info from summary.md
|
|
summary_path = entry / "summary.md"
|
|
if summary_path.exists():
|
|
try:
|
|
summary = summary_path.read_text(errors="replace")[:2000]
|
|
# Extract email
|
|
em = re.search(r"\*\*Email:\*\*\s*(\S+)", summary)
|
|
if em:
|
|
info["email"] = em.group(1)
|
|
# Extract context
|
|
ctx = re.search(r"\*\*Kontext:\*\*\s*(.+)", summary)
|
|
if ctx:
|
|
info["context"] = ctx.group(1).strip()
|
|
except Exception:
|
|
pass
|
|
|
|
entities[name] = info
|
|
count += 1
|
|
|
|
save_json(ENTITIES_FILE, entities)
|
|
save_json(RELATIONSHIPS_FILE, relationships)
|
|
print(f"Bootstrapped {count} new entities. Total: {len(entities)}")
|
|
|
|
|
|
def cmd_extract(text):
|
|
"""Extract and display entities from text."""
|
|
known = load_known_entities()
|
|
found = extract_entities(text, known)
|
|
|
|
if not found:
|
|
print("No entities found.")
|
|
return
|
|
|
|
# Update entities.json with new discoveries
|
|
entities = load_json(ENTITIES_FILE)
|
|
new_count = 0
|
|
for name, info in found.items():
|
|
if name not in entities:
|
|
entities[name] = {
|
|
"type": info["type"],
|
|
"source": "extraction",
|
|
"first_seen": time.strftime("%Y-%m-%dT%H:%M:%S"),
|
|
}
|
|
new_count += 1
|
|
print(f" [{info['type']:12s}] {name} ({info['match']})")
|
|
|
|
if new_count:
|
|
save_json(ENTITIES_FILE, entities)
|
|
print(f"\n{new_count} new entities added to registry.")
|
|
|
|
|
|
def cmd_relate(entity_a, entity_b, rel_type="related"):
|
|
"""Create or update a relationship between two entities."""
|
|
a, b = normalize(entity_a), normalize(entity_b)
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
entities = load_json(ENTITIES_FILE)
|
|
|
|
key = f"{min(a,b)}::{max(a,b)}"
|
|
ts = time.strftime("%Y-%m-%dT%H:%M:%S")
|
|
|
|
if key in relationships:
|
|
rel = relationships[key]
|
|
rel["count"] = rel.get("count", 1) + 1
|
|
rel["last_seen"] = ts
|
|
if rel_type != "related" and rel_type not in rel.get("types", []):
|
|
rel.setdefault("types", []).append(rel_type)
|
|
print(f"Updated: {a} <-> {b} (seen {rel['count']}x)")
|
|
else:
|
|
relationships[key] = {
|
|
"a": a, "b": b,
|
|
"types": [rel_type],
|
|
"count": 1,
|
|
"first_seen": ts,
|
|
"last_seen": ts,
|
|
}
|
|
print(f"Created: {a} <-> {b} ({rel_type})")
|
|
|
|
# Ensure both entities exist
|
|
for name in [a, b]:
|
|
if name not in entities:
|
|
entities[name] = {"type": "unknown", "source": "relationship", "first_seen": ts}
|
|
|
|
save_json(RELATIONSHIPS_FILE, relationships)
|
|
save_json(ENTITIES_FILE, entities)
|
|
|
|
|
|
def cmd_query(entity_name):
|
|
"""Query all relationships for an entity."""
|
|
name = normalize(entity_name)
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
entities = load_json(ENTITIES_FILE)
|
|
|
|
# Entity info
|
|
if name in entities:
|
|
info = entities[name]
|
|
print(f"Entity: {name}")
|
|
print(f" Type: {info.get('type', 'unknown')}")
|
|
if info.get("email"):
|
|
print(f" Email: {info['email']}")
|
|
if info.get("context"):
|
|
print(f" Context: {info['context']}")
|
|
if info.get("source"):
|
|
print(f" Source: {info['source']}")
|
|
else:
|
|
print(f"Entity '{name}' not found in registry.")
|
|
|
|
# Relationships
|
|
rels = []
|
|
for key, rel in relationships.items():
|
|
if rel["a"] == name or rel["b"] == name:
|
|
other = rel["b"] if rel["a"] == name else rel["a"]
|
|
rels.append((other, rel))
|
|
|
|
if rels:
|
|
print(f"\nRelationships ({len(rels)}):")
|
|
for other, rel in sorted(rels, key=lambda x: -x[1].get("count", 1)):
|
|
types = ", ".join(rel.get("types", ["related"]))
|
|
print(f" {name} <-> {other} [{types}] (seen {rel.get('count', 1)}x)")
|
|
else:
|
|
print("\nNo relationships found.")
|
|
|
|
# Check life/areas/
|
|
for category in ["people", "companies", "projects"]:
|
|
area_path = LIFE_AREAS / category / name.replace(" ", "-")
|
|
if area_path.exists():
|
|
summary_path = area_path / "summary.md"
|
|
if summary_path.exists():
|
|
print(f"\nLife area ({category}): {area_path}")
|
|
content = summary_path.read_text(errors="replace")[:500]
|
|
print(content)
|
|
|
|
|
|
def cmd_graph():
|
|
"""Output a simple relationship graph summary."""
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
entities = load_json(ENTITIES_FILE)
|
|
|
|
if not relationships:
|
|
print("No relationships in knowledge graph.")
|
|
return
|
|
|
|
# Count connections per entity
|
|
connections = {}
|
|
for key, rel in relationships.items():
|
|
for name in [rel["a"], rel["b"]]:
|
|
connections[name] = connections.get(name, 0) + 1
|
|
|
|
# Sort by connections
|
|
top = sorted(connections.items(), key=lambda x: -x[1])
|
|
|
|
print(f"Knowledge Graph: {len(entities)} entities, {len(relationships)} relationships\n")
|
|
print("Top connected entities:")
|
|
for name, count in top[:20]:
|
|
etype = entities.get(name, {}).get("type", "?")
|
|
print(f" {name} ({etype}): {count} connections")
|
|
|
|
print(f"\nRecent relationships:")
|
|
recent = sorted(relationships.values(), key=lambda r: r.get("last_seen", ""), reverse=True)[:10]
|
|
for rel in recent:
|
|
types = ", ".join(rel.get("types", ["related"]))
|
|
print(f" {rel['a']} <-> {rel['b']} [{types}]")
|
|
|
|
|
|
def main():
|
|
if len(sys.argv) < 2:
|
|
print(__doc__)
|
|
sys.exit(1)
|
|
|
|
cmd = sys.argv[1]
|
|
|
|
if cmd == "bootstrap":
|
|
cmd_bootstrap()
|
|
elif cmd == "extract":
|
|
if len(sys.argv) < 3:
|
|
print("Usage: entity-manager.py extract \"text\"")
|
|
sys.exit(1)
|
|
cmd_extract(" ".join(sys.argv[2:]))
|
|
elif cmd == "relate":
|
|
if len(sys.argv) < 4:
|
|
print("Usage: entity-manager.py relate \"entity_a\" \"entity_b\" [type]")
|
|
sys.exit(1)
|
|
rel_type = sys.argv[4] if len(sys.argv) > 4 else "related"
|
|
cmd_relate(sys.argv[2], sys.argv[3], rel_type)
|
|
elif cmd == "query":
|
|
if len(sys.argv) < 3:
|
|
print("Usage: entity-manager.py query \"entity\"")
|
|
sys.exit(1)
|
|
cmd_query(" ".join(sys.argv[2:]))
|
|
elif cmd == "graph":
|
|
cmd_graph()
|
|
else:
|
|
print(f"Unknown command: {cmd}")
|
|
print(__doc__)
|
|
sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|