#!/usr/bin/env python3
"""
Entity Manager — File-based knowledge graph for entity extraction and relationship mapping.
Part of Level 4.4 AGI Roadmap.

Usage:
  entity-manager.py bootstrap              — Bootstrap from life/areas/
  entity-manager.py extract "text"         — Extract entities from text
  entity-manager.py relate "A" "B" [type]  — Create/update relationship
  entity-manager.py query "entity"         — Query relationships for entity
  entity-manager.py graph                  — Output relationship summary
"""

import sys
import os
import json
import re
import time
from collections import Counter
from pathlib import Path

KNOWLEDGE_DIR = Path.home() / ".cortex" / "knowledge"
ENTITIES_FILE = KNOWLEDGE_DIR / "entities.json"
RELATIONSHIPS_FILE = KNOWLEDGE_DIR / "relationships.json"
LIFE_AREAS = Path.home() / "life" / "areas"

# Directory name under life/areas/ -> entity type stored in the registry.
# An explicit mapping is used because naive de-pluralisation is wrong for
# "companies" (rstrip("s") yields "companie").
CATEGORY_TYPES = {
    "people": "person",
    "companies": "company",
    "projects": "project",
}

# Categories that `bootstrap` imports into entities.json.
BOOTSTRAP_CATEGORIES = ("people", "companies")

# Common words to skip during entity extraction
STOP_WORDS = {
    # Articles, auxiliaries and modals
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "shall", "can", "need", "must",
    # Pronouns and determiners
    "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us",
    "them", "my", "your", "his", "its", "our", "their", "this", "that",
    "these", "those", "what", "which", "who", "whom", "where", "when", "why",
    "how", "all", "each", "every", "both", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
    "too", "very", "just",
    # Conjunctions, prepositions and adverbs
    "because", "as", "until", "while", "of", "at", "by", "for", "with",
    "about", "against", "between", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there",
    "and", "but", "or", "if", "else", "also",
    # Log / infrastructure vocabulary
    "system", "cron", "heartbeat", "ok", "error", "warning", "info",
    "message", "session", "agent", "main", "matrix", "telegram",
    # Common verbs
    "read", "write", "check", "run", "send", "get", "set", "let", "see",
    "know", "think", "want", "like", "make", "take", "come", "go", "say",
    "tell", "ask", "try", "use", "find", "give",
    # Common adjectives
    "new", "good", "first", "last", "long", "great", "little", "right",
    "big", "high", "old", "different", "small", "large", "next", "early",
    "young", "important", "public", "bad", "sure",
    # Conversational filler
    "yes", "maybe", "okay", "thanks", "thank", "please", "hello", "hi",
    "hey", "bye", "well",
    # Time words, weekdays and months
    "now", "today", "tomorrow", "yesterday", "monday", "tuesday",
    "wednesday", "thursday", "friday", "saturday", "sunday", "january",
    "february", "march", "april", "june", "july", "august", "september",
    "october", "november", "december",
    # Miscellaneous
    "still", "already", "currently", "actually", "really", "look", "keep",
    "going", "based", "done", "work", "working",
}

# Upper-case tokens that are technical jargon or units rather than
# organisation names.
IGNORED_ACRONYMS = frozenset({
    "ok", "am", "pm", "gmt", "utc", "url", "api", "cli", "ssh", "dns",
    "http", "https", "json", "html", "css", "js", "ts", "py", "md", "id",
    "ui", "ux", "io", "os", "ip", "gb", "mb", "kb", "tb",
})

# Extraction patterns, compiled once at import time.
_MENTION_RE = re.compile(r"@(\w+)")
_PROPER_NAME_RE = re.compile(r"\b([A-Z][a-zäöüß]+(?:\s+[A-Z][a-zäöüß]+)+)\b")
_CAPITALIZED_RE = re.compile(r"\b([A-Z][a-zäöüß]{2,})\b")
_ACRONYM_RE = re.compile(r"\b([A-Z]{2,6})\b")


def normalize(name):
    """Return *name* stripped, lower-cased and with "_" replaced by "-"."""
    return name.strip().lower().replace("_", "-")


def load_json(path):
    """Load a JSON object from *path*.

    Returns an empty dict when the file is missing, is not valid UTF-8 JSON,
    or holds a top-level value that is not an object, so callers can always
    treat the result as a dict.
    """
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
        return {}
    return data if isinstance(data, dict) else {}


def save_json(path, data):
    """Write *data* to *path* as indented UTF-8 JSON, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned instead of depending on the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def _timestamp():
    """Return the current local time formatted as YYYY-MM-DDTHH:MM:SS."""
    return time.strftime("%Y-%m-%dT%H:%M:%S")


def load_known_entities():
    """Collect known entities from life/areas/ and entities.json.

    Returns a dict mapping normalized entity name to its info dict.
    Directory-derived entries take precedence over entities.json entries
    with the same name.
    """
    known = {}

    # From life/areas: every sub-directory of a category is one entity.
    for category, etype in CATEGORY_TYPES.items():
        area_dir = LIFE_AREAS / category
        if not area_dir.is_dir():
            continue
        for entry in area_dir.iterdir():
            if entry.is_dir():
                known[normalize(entry.name)] = {
                    "type": etype,
                    "source": f"life/areas/{category}",
                }

    # From entities.json: only fill in names not already seen above.
    for name, info in load_json(ENTITIES_FILE).items():
        known.setdefault(name, info)

    return known


def _mentions(text_lower, variant):
    """Return True if *variant* occurs in *text_lower* as a whole word.

    Variants of two characters or fewer are never matched. The substring test
    is a cheap pre-filter; the lookarounds then reject hits embedded in a
    longer word (e.g. "ann" inside "planning").
    """
    if len(variant) <= 2 or variant not in text_lower:
        return False
    pattern = rf"(?<!\w){re.escape(variant)}(?!\w)"
    return re.search(pattern, text_lower) is not None


def extract_entities(text, known=None):
    """Extract entities from *text* using known-name matching and heuristics.

    Args:
        text: Free text to scan.
        known: Optional mapping of normalized name -> info dict. When None,
            it is loaded via load_known_entities().

    Returns:
        Dict mapping normalized entity name to {"type": ..., "match": ...},
        where "match" names the rule that found it: "known", "mention",
        "capitalized", "capitalized_single" or "acronym". Earlier rules win
        when several rules produce the same name.
    """
    if known is None:
        known = load_known_entities()

    found = {}
    text_lower = text.lower()

    # 1. Match known entities by slug, by slug with spaces, or by slug with
    #    the hyphens removed.
    for name, info in known.items():
        variants = (name, name.replace("-", " "), name.replace("-", ""))
        if any(_mentions(text_lower, variant) for variant in variants):
            found[name] = {"type": info.get("type", "unknown"), "match": "known"}

    # 2. Extract @mentions
    for m in _MENTION_RE.finditer(text):
        name = normalize(m.group(1))
        if name not in found and name not in STOP_WORDS and len(name) > 2:
            found[name] = {"type": "person", "match": "mention"}

    # 3. Extract capitalized multi-word names (likely proper nouns)
    for m in _PROPER_NAME_RE.finditer(text):
        name = normalize(m.group(1))
        if name not in found and name not in STOP_WORDS and len(name) > 3:
            # Heuristic: if 2-3 words, likely person; if more, likely org/topic
            etype = "person" if len(name.split()) <= 3 else "topic"
            found[name] = {"type": etype, "match": "capitalized"}

    # 4. Extract standalone capitalized words (potential entities)
    for m in _CAPITALIZED_RE.finditer(text):
        name = normalize(m.group(1))
        if name not in found and name not in STOP_WORDS:
            found[name] = {"type": "unknown", "match": "capitalized_single"}

    # 5. Extract ALL-CAPS acronyms (likely companies/products)
    for m in _ACRONYM_RE.finditer(text):
        name = normalize(m.group(1))
        if (
            name not in found
            and name not in STOP_WORDS
            and name not in IGNORED_ACRONYMS
        ):
            found[name] = {"type": "organization", "match": "acronym"}

    return found


def _summary_details(summary_path):
    """Return the optional "email"/"context" fields parsed from a summary.md.

    Only the first 2000 characters are inspected. A missing or unreadable
    file yields an empty dict; read errors are reported on stderr so that
    bootstrap stays best-effort without failing silently.
    """
    if not summary_path.is_file():
        return {}
    try:
        summary = summary_path.read_text(encoding="utf-8", errors="replace")[:2000]
    except OSError as exc:
        print(f"Warning: could not read {summary_path}: {exc}", file=sys.stderr)
        return {}

    details = {}
    email = re.search(r"\*\*Email:\*\*\s*(\S+)", summary)
    if email:
        details["email"] = email.group(1)
    context = re.search(r"\*\*Kontext:\*\*\s*(.+)", summary)
    if context:
        details["context"] = context.group(1).strip()
    return details


def cmd_bootstrap():
    """Bootstrap entities.json from the people/companies directories in life/areas/.

    Entities already present in the registry are left untouched. Both
    entities.json and relationships.json are written afterwards, which also
    creates them if they do not exist yet.
    """
    entities = load_json(ENTITIES_FILE)
    relationships = load_json(RELATIONSHIPS_FILE)
    count = 0

    for category in BOOTSTRAP_CATEGORIES:
        area_dir = LIFE_AREAS / category
        if not area_dir.is_dir():
            continue
        for entry in sorted(area_dir.iterdir()):
            if not entry.is_dir():
                continue
            name = normalize(entry.name)
            if name in entities:
                continue

            info = {
                "type": CATEGORY_TYPES[category],
                "source": f"life/areas/{category}",
                "bootstrapped": True,
            }
            # Try to extract extra info from summary.md
            info.update(_summary_details(entry / "summary.md"))

            entities[name] = info
            count += 1

    save_json(ENTITIES_FILE, entities)
    save_json(RELATIONSHIPS_FILE, relationships)
    print(f"Bootstrapped {count} new entities. Total: {len(entities)}")


def cmd_extract(text):
    """Extract entities from *text*, print them and register any new ones."""
    known = load_known_entities()
    found = extract_entities(text, known)

    if not found:
        print("No entities found.")
        return

    # Update entities.json with new discoveries
    entities = load_json(ENTITIES_FILE)
    new_count = 0
    for name, info in found.items():
        if name not in entities:
            entities[name] = {
                "type": info["type"],
                "source": "extraction",
                "first_seen": _timestamp(),
            }
            new_count += 1
        print(f" [{info['type']:12s}] {name} ({info['match']})")

    if new_count:
        save_json(ENTITIES_FILE, entities)
        print(f"\n{new_count} new entities added to registry.")


def cmd_relate(entity_a, entity_b, rel_type="related"):
    """Create or update the undirected relationship between two entities.

    The pair is stored under an order-independent key, so relating A to B
    and B to A updates the same record. Entities missing from the registry
    are added with type "unknown".
    """
    a, b = normalize(entity_a), normalize(entity_b)
    relationships = load_json(RELATIONSHIPS_FILE)
    entities = load_json(ENTITIES_FILE)

    # Sorting the endpoints makes the key independent of argument order.
    key = f"{min(a, b)}::{max(a, b)}"
    ts = _timestamp()

    if key in relationships:
        rel = relationships[key]
        rel["count"] = rel.get("count", 1) + 1
        rel["last_seen"] = ts
        # The generic "related" label is never added to an existing record.
        if rel_type != "related" and rel_type not in rel.get("types", []):
            rel.setdefault("types", []).append(rel_type)
        print(f"Updated: {a} <-> {b} (seen {rel['count']}x)")
    else:
        relationships[key] = {
            "a": a,
            "b": b,
            "types": [rel_type],
            "count": 1,
            "first_seen": ts,
            "last_seen": ts,
        }
        print(f"Created: {a} <-> {b} ({rel_type})")

    # Ensure both entities exist
    for name in (a, b):
        if name not in entities:
            entities[name] = {
                "type": "unknown",
                "source": "relationship",
                "first_seen": ts,
            }

    save_json(RELATIONSHIPS_FILE, relationships)
    save_json(ENTITIES_FILE, entities)


def _valid_relationships(relationships):
    """Return the relationship records that carry both "a" and "b" endpoints.

    Records that are not dicts or lack an endpoint (e.g. after a manual edit
    of relationships.json) are skipped instead of raising KeyError.
    """
    return [
        rel
        for rel in relationships.values()
        if isinstance(rel, dict) and "a" in rel and "b" in rel
    ]


def cmd_query(entity_name):
    """Print registry info, relationships and life-area summary for an entity."""
    name = normalize(entity_name)
    relationships = load_json(RELATIONSHIPS_FILE)
    entities = load_json(ENTITIES_FILE)

    # Entity info
    if name in entities:
        info = entities[name]
        print(f"Entity: {name}")
        print(f" Type: {info.get('type', 'unknown')}")
        if info.get("email"):
            print(f" Email: {info['email']}")
        if info.get("context"):
            print(f" Context: {info['context']}")
        if info.get("source"):
            print(f" Source: {info['source']}")
    else:
        print(f"Entity '{name}' not found in registry.")

    # Relationships: pair each matching record with the entity on the other end.
    rels = []
    for rel in _valid_relationships(relationships):
        if rel["a"] == name:
            rels.append((rel["b"], rel))
        elif rel["b"] == name:
            rels.append((rel["a"], rel))

    if rels:
        print(f"\nRelationships ({len(rels)}):")
        # Most frequently seen relationships first.
        for other, rel in sorted(rels, key=lambda item: -item[1].get("count", 1)):
            types = ", ".join(rel.get("types", ["related"]))
            print(f" {name} <-> {other} [{types}] (seen {rel.get('count', 1)}x)")
    else:
        print("\nNo relationships found.")

    # Check life/areas/ for a matching directory with a summary.md
    slug = name.replace(" ", "-")
    for category in CATEGORY_TYPES:
        area_path = LIFE_AREAS / category / slug
        summary_path = area_path / "summary.md"
        if summary_path.is_file():
            print(f"\nLife area ({category}): {area_path}")
            print(summary_path.read_text(encoding="utf-8", errors="replace")[:500])


def cmd_graph():
    """Print a summary of the graph: top connected entities and recent relationships."""
    relationships = load_json(RELATIONSHIPS_FILE)
    entities = load_json(ENTITIES_FILE)

    if not relationships:
        print("No relationships in knowledge graph.")
        return

    records = _valid_relationships(relationships)

    # Count connections per entity
    connections = Counter()
    for rel in records:
        connections[rel["a"]] += 1
        connections[rel["b"]] += 1

    print(f"Knowledge Graph: {len(entities)} entities, {len(relationships)} relationships\n")
    print("Top connected entities:")
    # most_common keeps first-encountered order among equal counts.
    for name, count in connections.most_common(20):
        etype = entities.get(name, {}).get("type", "?")
        print(f" {name} ({etype}): {count} connections")

    print("\nRecent relationships:")
    recent = sorted(records, key=lambda r: r.get("last_seen", ""), reverse=True)[:10]
    for rel in recent:
        types = ", ".join(rel.get("types", ["related"]))
        print(f" {rel['a']} <-> {rel['b']} [{types}]")


def main():
    """Dispatch the sub-command given on the command line; exit 1 on usage errors."""
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    cmd = sys.argv[1]
    args = sys.argv[2:]

    if cmd == "bootstrap":
        cmd_bootstrap()
    elif cmd == "extract":
        if not args:
            print("Usage: entity-manager.py extract \"text\"")
            sys.exit(1)
        cmd_extract(" ".join(args))
    elif cmd == "relate":
        if len(args) < 2:
            print("Usage: entity-manager.py relate \"entity_a\" \"entity_b\" [type]")
            sys.exit(1)
        rel_type = args[2] if len(args) > 2 else "related"
        cmd_relate(args[0], args[1], rel_type)
    elif cmd == "query":
        if not args:
            print("Usage: entity-manager.py query \"entity\"")
            sys.exit(1)
        cmd_query(" ".join(args))
    elif cmd == "graph":
        cmd_graph()
    else:
        print(f"Unknown command: {cmd}")
        print(__doc__)
        sys.exit(1)


if __name__ == "__main__":
    main()