darkplex-core/cortex/entity_manager.py
Claudia fd7d75c0ed
Some checks failed
Tests / test (push) Failing after 2s
Merge darkplex-core into cortex — unified intelligence layer v0.2.0
- Merged all unique darkplex-core modules into cortex:
  - intelligence/ subfolder (anticipator, collective, shared_memory, knowledge_cleanup, temporal, llm_extractor, loop)
  - governance/ subfolder (policy engine, risk scorer, evidence, enforcer, report generator)
  - entity_manager.py, knowledge_extractor.py
- Fixed bare 'from intelligence.' imports to 'from cortex.intelligence.'
- Added 'darkplex' CLI alias alongside 'cortex'
- Package renamed to darkplex-core v0.2.0
- 405 tests passing (was 234)
- 14 new test files covering all merged modules
2026-02-12 08:43:02 +01:00

371 lines
14 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Entity Manager — File-based knowledge graph for entity extraction and relationship mapping.
Part of Level 4.4 AGI Roadmap.
Usage:
entity-manager.py bootstrap — Bootstrap from life/areas/
entity-manager.py extract "text" — Extract entities from text
entity-manager.py relate "A" "B" [type] — Create/update relationship
entity-manager.py query "entity" — Query relationships for entity
entity-manager.py graph — Output relationship summary
"""
import sys
import os
import json
import re
import time
from pathlib import Path
# Filesystem layout: the knowledge graph lives under ~/.cortex/knowledge.
KNOWLEDGE_DIR = Path.home() / ".cortex" / "knowledge"
ENTITIES_FILE = KNOWLEDGE_DIR / "entities.json"
RELATIONSHIPS_FILE = KNOWLEDGE_DIR / "relationships.json"
LIFE_AREAS = Path.home() / "life" / "areas"
# Common words to skip during entity extraction.
# Duplicate literals from the original set ("may", "then", "ok", "no",
# "sure", "right") removed — set membership is unchanged.
STOP_WORDS = {
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "shall", "can", "need", "must", "i", "you",
    "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
    "my", "your", "his", "its", "our", "their", "this", "that", "these",
    "those", "what", "which", "who", "whom", "where", "when", "why", "how",
    "all", "each", "every", "both", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "just", "because", "as", "until", "while", "of", "at", "by",
    "for", "with", "about", "against", "between", "through", "during",
    "before", "after", "above", "below", "to", "from", "up", "down", "in",
    "out", "on", "off", "over", "under", "again", "further", "then", "once",
    "here", "there", "and", "but", "or", "if", "else", "also",
    "system", "cron", "heartbeat", "ok", "error", "warning", "info",
    "message", "session", "agent", "main", "matrix", "telegram",
    "read", "write", "check", "run", "send", "get", "set", "let", "see",
    "know", "think", "want", "like", "make", "take", "come", "go", "say",
    "tell", "ask", "try", "use", "find", "give", "new", "good", "first",
    "last", "long", "great", "little", "right", "big", "high", "old",
    "different", "small", "large", "next", "early", "young", "important",
    "public", "bad", "sure", "yes", "maybe", "okay",
    "thanks", "thank", "please", "hello", "hi", "hey", "bye", "well",
    "now", "today", "tomorrow", "yesterday", "monday", "tuesday",
    "wednesday", "thursday", "friday", "saturday", "sunday",
    "january", "february", "march", "april", "june", "july",
    "august", "september", "october", "november", "december",
    "still", "already", "currently", "actually", "really",
    "look", "keep", "going", "based", "done", "work", "working",
}
def normalize(name):
    """Return the canonical form of an entity name.

    Trims surrounding whitespace, lowercases, and replaces underscores
    with hyphens so directory slugs and free text compare equal.
    """
    cleaned = name.strip().lower()
    return cleaned.replace("_", "-")
def load_json(path):
    """Load a JSON file and return its contents.

    Returns an empty dict when the file is missing or contains invalid
    JSON, so callers can always treat the result as a mapping.
    """
    try:
        # Explicit UTF-8: save_json writes with ensure_ascii=False, so the
        # file may contain non-ASCII bytes that the locale default encoding
        # (e.g. cp1252 on Windows) would fail to decode.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
def save_json(path, data):
    """Serialize *data* to *path* as pretty-printed JSON.

    Creates parent directories as needed. *path* must be a pathlib.Path
    (the `.parent` attribute is used).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so ensure_ascii=False output round-trips on every
    # platform regardless of the locale's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
def load_known_entities():
    """Load known entity names from life/areas/ and entities.json.

    Returns a dict mapping normalized entity name -> info dict with at
    least "type" and "source". Directory-derived entries take precedence
    over entities.json entries.
    """
    known = {}
    # Explicit singular type per category. The original used
    # category.rstrip("s"), which strips *characters*, not a suffix, and
    # produced "companie" for "companies".
    type_by_category = {
        "people": "person",
        "companies": "company",
        "projects": "project",
    }
    for category, etype in type_by_category.items():
        area_dir = LIFE_AREAS / category
        if not area_dir.exists():
            continue
        for entry in area_dir.iterdir():
            if entry.is_dir():
                name = normalize(entry.name)
                known[name] = {"type": etype, "source": f"life/areas/{category}"}
    # Merge previously registered entities without overriding area data.
    for name, info in load_json(ENTITIES_FILE).items():
        known.setdefault(name, info)
    return known
def extract_entities(text, known=None):
    """Extract entities from *text* using heuristics and known-entity matching.

    Returns a dict mapping normalized entity name -> {"type": ..., "match": ...},
    where "match" records which heuristic found the entity. *known* defaults
    to the registry loaded via load_known_entities().
    """
    if known is None:
        known = load_known_entities()
    found = {}
    text_lower = text.lower()
    # 1. Match known entities: try the slug itself, the spaced form, and
    #    the squashed form ("foo-bar" / "foo bar" / "foobar").
    for name, info in known.items():
        variants = [name, name.replace("-", " "), name.replace("-", "")]
        for variant in variants:
            if variant in text_lower and len(variant) > 2:
                found[name] = {"type": info.get("type", "unknown"), "match": "known"}
                break
    # 2. @mentions are assumed to be people.
    for m in re.finditer(r"@(\w+)", text):
        name = normalize(m.group(1))
        if name not in found and name not in STOP_WORDS and len(name) > 2:
            found[name] = {"type": "person", "match": "mention"}
    # 3. Capitalized multi-word runs (likely proper nouns).
    for m in re.finditer(r"\b([A-Z][a-zäöüß]+(?:\s+[A-Z][a-zäöüß]+)+)\b", text):
        name = normalize(m.group(1))
        if name not in found and name not in STOP_WORDS and len(name) > 3:
            # Heuristic: 2-3 words likely a person; longer runs likely org/topic.
            words = name.split()
            etype = "person" if len(words) <= 3 else "topic"
            found[name] = {"type": etype, "match": "capitalized"}
    # 4. Standalone capitalized words (potential entities, type unknown).
    for m in re.finditer(r"\b([A-Z][a-zäöüß]{2,})\b", text):
        name = normalize(m.group(1))
        if name not in found and name not in STOP_WORDS:
            found[name] = {"type": "unknown", "match": "capitalized_single"}
    # 5. ALL-CAPS acronyms (likely companies/products). The exclusion set is
    #    hoisted out of the loop — the original rebuilt the literal per match.
    tech_terms = {
        "ok", "am", "pm", "gmt", "utc", "url", "api", "cli", "ssh", "dns",
        "http", "https", "json", "html", "css", "js", "ts", "py", "md",
        "id", "ui", "ux", "io", "os", "ip", "gb", "mb", "kb", "tb",
    }
    for m in re.finditer(r"\b([A-Z]{2,6})\b", text):
        name = normalize(m.group(1))
        if name not in found and name not in STOP_WORDS and name not in tech_terms:
            found[name] = {"type": "organization", "match": "acronym"}
    return found
def cmd_bootstrap():
    """Seed the entity registry from the life/areas/ directory tree."""

    def summary_details(entry):
        # Best-effort scrape of email/context from an entry's summary.md;
        # any read/parse failure yields partial or empty details.
        details = {}
        summary_path = entry / "summary.md"
        if not summary_path.exists():
            return details
        try:
            summary = summary_path.read_text(errors="replace")[:2000]
            em = re.search(r"\*\*Email:\*\*\s*(\S+)", summary)
            if em:
                details["email"] = em.group(1)
            ctx = re.search(r"\*\*Kontext:\*\*\s*(.+)", summary)
            if ctx:
                details["context"] = ctx.group(1).strip()
        except Exception:
            pass
        return details

    entities = load_json(ENTITIES_FILE)
    relationships = load_json(RELATIONSHIPS_FILE)
    added = 0
    for category, etype in [("people", "person"), ("companies", "company")]:
        area_dir = LIFE_AREAS / category
        if not area_dir.exists():
            continue
        for entry in sorted(area_dir.iterdir()):
            if not entry.is_dir():
                continue
            name = normalize(entry.name)
            if name in entities:
                continue
            record = {"type": etype, "source": f"life/areas/{category}", "bootstrapped": True}
            record.update(summary_details(entry))
            entities[name] = record
            added += 1
    save_json(ENTITIES_FILE, entities)
    save_json(RELATIONSHIPS_FILE, relationships)
    print(f"Bootstrapped {added} new entities. Total: {len(entities)}")
def cmd_extract(text):
    """Extract entities from *text*, print them, and register new discoveries."""
    known = load_known_entities()
    found = extract_entities(text, known)
    if not found:
        print("No entities found.")
        return
    entities = load_json(ENTITIES_FILE)
    fresh = []  # names added to the registry during this run
    for name, info in found.items():
        if name not in entities:
            entities[name] = {
                "type": info["type"],
                "source": "extraction",
                "first_seen": time.strftime("%Y-%m-%dT%H:%M:%S"),
            }
            fresh.append(name)
        print(f" [{info['type']:12s}] {name} ({info['match']})")
    # Only persist when something actually changed.
    if fresh:
        save_json(ENTITIES_FILE, entities)
        print(f"\n{len(fresh)} new entities added to registry.")
def cmd_relate(entity_a, entity_b, rel_type="related"):
    """Create or update an undirected relationship between two entities."""
    a, b = normalize(entity_a), normalize(entity_b)
    relationships = load_json(RELATIONSHIPS_FILE)
    entities = load_json(ENTITIES_FILE)
    # The edge is undirected, so the key is order-independent.
    lo, hi = sorted([a, b])
    key = f"{lo}::{hi}"
    ts = time.strftime("%Y-%m-%dT%H:%M:%S")
    rel = relationships.get(key)
    if rel is not None:
        # Existing edge: bump the counter and record the extra type (the
        # generic "related" is never appended to an existing edge).
        rel["count"] = rel.get("count", 1) + 1
        rel["last_seen"] = ts
        if rel_type != "related" and rel_type not in rel.get("types", []):
            rel.setdefault("types", []).append(rel_type)
        print(f"Updated: {a} <-> {b} (seen {rel['count']}x)")
    else:
        relationships[key] = {
            "a": a, "b": b,
            "types": [rel_type],
            "count": 1,
            "first_seen": ts,
            "last_seen": ts,
        }
        print(f"Created: {a} <-> {b} ({rel_type})")
    # Make sure both endpoints exist in the entity registry.
    for name in (a, b):
        entities.setdefault(name, {"type": "unknown", "source": "relationship", "first_seen": ts})
    save_json(RELATIONSHIPS_FILE, relationships)
    save_json(ENTITIES_FILE, entities)
def cmd_query(entity_name):
    """Print registry info, relationships, and life-area notes for an entity."""
    name = normalize(entity_name)
    relationships = load_json(RELATIONSHIPS_FILE)
    entities = load_json(ENTITIES_FILE)
    # Registry record, if any.
    info = entities.get(name)
    if info is not None:
        print(f"Entity: {name}")
        print(f" Type: {info.get('type', 'unknown')}")
        for label, field in [("Email", "email"), ("Context", "context"), ("Source", "source")]:
            if info.get(field):
                print(f" {label}: {info[field]}")
    else:
        print(f"Entity '{name}' not found in registry.")
    # Collect every edge touching this entity, with the opposite endpoint.
    neighbors = []
    for rel in relationships.values():
        if name in (rel["a"], rel["b"]):
            other = rel["b"] if rel["a"] == name else rel["a"]
            neighbors.append((other, rel))
    if neighbors:
        print(f"\nRelationships ({len(neighbors)}):")
        for other, rel in sorted(neighbors, key=lambda pair: -pair[1].get("count", 1)):
            types = ", ".join(rel.get("types", ["related"]))
            print(f" {name} <-> {other} [{types}] (seen {rel.get('count', 1)}x)")
    else:
        print("\nNo relationships found.")
    # Surface any matching life/areas/ notes (first 500 chars of summary.md).
    for category in ["people", "companies", "projects"]:
        area_path = LIFE_AREAS / category / name.replace(" ", "-")
        if not area_path.exists():
            continue
        summary_path = area_path / "summary.md"
        if summary_path.exists():
            print(f"\nLife area ({category}): {area_path}")
            print(summary_path.read_text(errors="replace")[:500])
def cmd_graph():
    """Print a graph summary: top-connected entities and recent edges."""
    relationships = load_json(RELATIONSHIPS_FILE)
    entities = load_json(ENTITIES_FILE)
    if not relationships:
        print("No relationships in knowledge graph.")
        return
    # Degree (number of incident edges) per entity.
    degree = {}
    for rel in relationships.values():
        for endpoint in (rel["a"], rel["b"]):
            degree[endpoint] = degree.get(endpoint, 0) + 1
    ranked = sorted(degree.items(), key=lambda item: -item[1])
    print(f"Knowledge Graph: {len(entities)} entities, {len(relationships)} relationships\n")
    print("Top connected entities:")
    for name, count in ranked[:20]:
        etype = entities.get(name, {}).get("type", "?")
        print(f" {name} ({etype}): {count} connections")
    print(f"\nRecent relationships:")
    recent = sorted(relationships.values(), key=lambda r: r.get("last_seen", ""), reverse=True)
    for rel in recent[:10]:
        types = ", ".join(rel.get("types", ["related"]))
        print(f" {rel['a']} <-> {rel['b']} [{types}]")
def main():
    """CLI entry point: validate arguments and dispatch to a subcommand."""
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd == "bootstrap":
        cmd_bootstrap()
        return
    if cmd == "extract":
        if len(sys.argv) < 3:
            print("Usage: entity-manager.py extract \"text\"")
            sys.exit(1)
        cmd_extract(" ".join(sys.argv[2:]))
        return
    if cmd == "relate":
        if len(sys.argv) < 4:
            print("Usage: entity-manager.py relate \"entity_a\" \"entity_b\" [type]")
            sys.exit(1)
        # Optional fourth argument selects the relationship type.
        rel_type = sys.argv[4] if len(sys.argv) > 4 else "related"
        cmd_relate(sys.argv[2], sys.argv[3], rel_type)
        return
    if cmd == "query":
        if len(sys.argv) < 3:
            print("Usage: entity-manager.py query \"entity\"")
            sys.exit(1)
        cmd_query(" ".join(sys.argv[2:]))
        return
    if cmd == "graph":
        cmd_graph()
        return
    print(f"Unknown command: {cmd}")
    print(__doc__)
    sys.exit(1)
if __name__ == "__main__":
    main()