# NOTE: scraped page metadata preserved as comments (not part of the script):
# Some checks failed — Tests / test (push) Failing after 2s
# Commit: Merged all unique darkplex-core modules into cortex: intelligence/
# subfolder (anticipator, collective, shared_memory, knowledge_cleanup,
# temporal, llm_extractor, loop); governance/ subfolder (policy engine, risk
# scorer, evidence, enforcer, report generator); entity_manager.py,
# knowledge_extractor.py. Fixed bare 'from intelligence.' imports to
# 'from cortex.intelligence.'. Added 'darkplex' CLI alias alongside 'cortex'.
# Package renamed to darkplex-core v0.2.0. 405 tests passing (was 234).
# 14 new test files covering all merged modules.
# File: 371 lines, 14 KiB, Python, executable file
#!/usr/bin/env python3
"""
Entity Manager — File-based knowledge graph for entity extraction and relationship mapping.
Part of Level 4.4 AGI Roadmap.

Usage:
    entity-manager.py bootstrap              — Bootstrap from life/areas/
    entity-manager.py extract "text"         — Extract entities from text
    entity-manager.py relate "A" "B" [type]  — Create/update relationship
    entity-manager.py query "entity"         — Query relationships for entity
    entity-manager.py graph                  — Output relationship summary
"""
import sys
import os
import json
import re
import time
from pathlib import Path

# On-disk knowledge graph: two flat JSON files under ~/.cortex/knowledge/.
KNOWLEDGE_DIR = Path.home() / ".cortex" / "knowledge"
# Entity registry: {name: {"type": ..., "source": ..., ...}}
ENTITIES_FILE = KNOWLEDGE_DIR / "entities.json"
# Undirected edges keyed "a::b" (sorted): {"a", "b", "types", "count", ...}
RELATIONSHIPS_FILE = KNOWLEDGE_DIR / "relationships.json"
# Per-entity folders (people/companies/projects) used to seed the registry.
LIFE_AREAS = Path.home() / "life" / "areas"
|
# Common words to skip during entity extraction
|
|
STOP_WORDS = {
|
|
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
|
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
|
"should", "may", "might", "shall", "can", "need", "must", "i", "you",
|
|
"he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
|
|
"my", "your", "his", "its", "our", "their", "this", "that", "these",
|
|
"those", "what", "which", "who", "whom", "where", "when", "why", "how",
|
|
"all", "each", "every", "both", "few", "more", "most", "other", "some",
|
|
"such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
|
|
"very", "just", "because", "as", "until", "while", "of", "at", "by",
|
|
"for", "with", "about", "against", "between", "through", "during",
|
|
"before", "after", "above", "below", "to", "from", "up", "down", "in",
|
|
"out", "on", "off", "over", "under", "again", "further", "then", "once",
|
|
"here", "there", "and", "but", "or", "if", "then", "else", "also",
|
|
"system", "cron", "heartbeat", "ok", "error", "warning", "info",
|
|
"message", "session", "agent", "main", "matrix", "telegram",
|
|
"read", "write", "check", "run", "send", "get", "set", "let", "see",
|
|
"know", "think", "want", "like", "make", "take", "come", "go", "say",
|
|
"tell", "ask", "try", "use", "find", "give", "new", "good", "first",
|
|
"last", "long", "great", "little", "right", "big", "high", "old",
|
|
"different", "small", "large", "next", "early", "young", "important",
|
|
"public", "bad", "sure", "sure", "yes", "no", "maybe", "ok", "okay",
|
|
"thanks", "thank", "please", "hello", "hi", "hey", "bye", "well",
|
|
"now", "today", "tomorrow", "yesterday", "monday", "tuesday",
|
|
"wednesday", "thursday", "friday", "saturday", "sunday",
|
|
"january", "february", "march", "april", "may", "june", "july",
|
|
"august", "september", "october", "november", "december",
|
|
"still", "already", "currently", "actually", "really", "right",
|
|
"look", "keep", "going", "based", "done", "work", "working",
|
|
}
|
|
|
|
|
|
def normalize(name):
|
|
"""Normalize entity name."""
|
|
return name.strip().lower().replace("_", "-")
|
|
|
|
|
|
def load_json(path):
|
|
"""Load JSON file, return empty dict if missing/invalid."""
|
|
try:
|
|
with open(path) as f:
|
|
return json.load(f)
|
|
except (FileNotFoundError, json.JSONDecodeError):
|
|
return {}
|
|
|
|
|
|
def save_json(path, data):
|
|
"""Save JSON file, creating directories as needed."""
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
with open(path, "w") as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
|
|
|
|
def load_known_entities():
|
|
"""Load known entity names from life/areas/ and entities.json."""
|
|
known = {}
|
|
# From life/areas
|
|
for category in ["people", "companies", "projects"]:
|
|
area_dir = LIFE_AREAS / category
|
|
if not area_dir.exists():
|
|
continue
|
|
etype = category.rstrip("s") # person, company, project
|
|
if category == "people":
|
|
etype = "person"
|
|
for entry in area_dir.iterdir():
|
|
if entry.is_dir():
|
|
name = normalize(entry.name)
|
|
known[name] = {"type": etype, "source": f"life/areas/{category}"}
|
|
# From entities.json
|
|
entities = load_json(ENTITIES_FILE)
|
|
for name, info in entities.items():
|
|
if name not in known:
|
|
known[name] = info
|
|
return known
|
|
|
|
|
|
def extract_entities(text, known=None):
|
|
"""Extract entities from text using heuristics and known entity matching."""
|
|
if known is None:
|
|
known = load_known_entities()
|
|
|
|
found = {}
|
|
text_lower = text.lower()
|
|
|
|
# 1. Match known entities
|
|
for name, info in known.items():
|
|
# Check for name or slug in text
|
|
variants = [name, name.replace("-", " "), name.replace("-", "")]
|
|
for v in variants:
|
|
if v in text_lower and len(v) > 2:
|
|
found[name] = {"type": info.get("type", "unknown"), "match": "known"}
|
|
break
|
|
|
|
# 2. Extract @mentions
|
|
for m in re.finditer(r"@(\w+)", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS and len(name) > 2:
|
|
found[name] = {"type": "person", "match": "mention"}
|
|
|
|
# 3. Extract capitalized multi-word names (likely proper nouns)
|
|
for m in re.finditer(r"\b([A-Z][a-zäöüß]+(?:\s+[A-Z][a-zäöüß]+)+)\b", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS and len(name) > 3:
|
|
# Heuristic: if 2-3 words, likely person; if more, likely org/topic
|
|
words = name.split()
|
|
etype = "person" if len(words) <= 3 else "topic"
|
|
found[name] = {"type": etype, "match": "capitalized"}
|
|
|
|
# 4. Extract standalone capitalized words (potential entities)
|
|
for m in re.finditer(r"\b([A-Z][a-zäöüß]{2,})\b", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS:
|
|
found[name] = {"type": "unknown", "match": "capitalized_single"}
|
|
|
|
# 5. Extract ALL-CAPS acronyms (likely companies/products)
|
|
for m in re.finditer(r"\b([A-Z]{2,6})\b", text):
|
|
name = normalize(m.group(1))
|
|
if name not in found and name not in STOP_WORDS and name not in {
|
|
"ok", "am", "pm", "gmt", "utc", "url", "api", "cli", "ssh", "dns",
|
|
"http", "https", "json", "html", "css", "js", "ts", "py", "md",
|
|
"id", "ui", "ux", "io", "os", "ip", "gb", "mb", "kb", "tb",
|
|
}:
|
|
found[name] = {"type": "organization", "match": "acronym"}
|
|
|
|
return found
|
|
|
|
|
|
def cmd_bootstrap():
|
|
"""Bootstrap entities from life/areas/."""
|
|
entities = load_json(ENTITIES_FILE)
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
count = 0
|
|
|
|
for category in ["people", "companies"]:
|
|
area_dir = LIFE_AREAS / category
|
|
if not area_dir.exists():
|
|
continue
|
|
etype = "person" if category == "people" else "company"
|
|
for entry in sorted(area_dir.iterdir()):
|
|
if not entry.is_dir():
|
|
continue
|
|
name = normalize(entry.name)
|
|
if name in entities:
|
|
continue
|
|
|
|
info = {"type": etype, "source": f"life/areas/{category}", "bootstrapped": True}
|
|
|
|
# Try to extract extra info from summary.md
|
|
summary_path = entry / "summary.md"
|
|
if summary_path.exists():
|
|
try:
|
|
summary = summary_path.read_text(errors="replace")[:2000]
|
|
# Extract email
|
|
em = re.search(r"\*\*Email:\*\*\s*(\S+)", summary)
|
|
if em:
|
|
info["email"] = em.group(1)
|
|
# Extract context
|
|
ctx = re.search(r"\*\*Kontext:\*\*\s*(.+)", summary)
|
|
if ctx:
|
|
info["context"] = ctx.group(1).strip()
|
|
except Exception:
|
|
pass
|
|
|
|
entities[name] = info
|
|
count += 1
|
|
|
|
save_json(ENTITIES_FILE, entities)
|
|
save_json(RELATIONSHIPS_FILE, relationships)
|
|
print(f"Bootstrapped {count} new entities. Total: {len(entities)}")
|
|
|
|
|
|
def cmd_extract(text):
|
|
"""Extract and display entities from text."""
|
|
known = load_known_entities()
|
|
found = extract_entities(text, known)
|
|
|
|
if not found:
|
|
print("No entities found.")
|
|
return
|
|
|
|
# Update entities.json with new discoveries
|
|
entities = load_json(ENTITIES_FILE)
|
|
new_count = 0
|
|
for name, info in found.items():
|
|
if name not in entities:
|
|
entities[name] = {
|
|
"type": info["type"],
|
|
"source": "extraction",
|
|
"first_seen": time.strftime("%Y-%m-%dT%H:%M:%S"),
|
|
}
|
|
new_count += 1
|
|
print(f" [{info['type']:12s}] {name} ({info['match']})")
|
|
|
|
if new_count:
|
|
save_json(ENTITIES_FILE, entities)
|
|
print(f"\n{new_count} new entities added to registry.")
|
|
|
|
|
|
def cmd_relate(entity_a, entity_b, rel_type="related"):
|
|
"""Create or update a relationship between two entities."""
|
|
a, b = normalize(entity_a), normalize(entity_b)
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
entities = load_json(ENTITIES_FILE)
|
|
|
|
key = f"{min(a,b)}::{max(a,b)}"
|
|
ts = time.strftime("%Y-%m-%dT%H:%M:%S")
|
|
|
|
if key in relationships:
|
|
rel = relationships[key]
|
|
rel["count"] = rel.get("count", 1) + 1
|
|
rel["last_seen"] = ts
|
|
if rel_type != "related" and rel_type not in rel.get("types", []):
|
|
rel.setdefault("types", []).append(rel_type)
|
|
print(f"Updated: {a} <-> {b} (seen {rel['count']}x)")
|
|
else:
|
|
relationships[key] = {
|
|
"a": a, "b": b,
|
|
"types": [rel_type],
|
|
"count": 1,
|
|
"first_seen": ts,
|
|
"last_seen": ts,
|
|
}
|
|
print(f"Created: {a} <-> {b} ({rel_type})")
|
|
|
|
# Ensure both entities exist
|
|
for name in [a, b]:
|
|
if name not in entities:
|
|
entities[name] = {"type": "unknown", "source": "relationship", "first_seen": ts}
|
|
|
|
save_json(RELATIONSHIPS_FILE, relationships)
|
|
save_json(ENTITIES_FILE, entities)
|
|
|
|
|
|
def cmd_query(entity_name):
|
|
"""Query all relationships for an entity."""
|
|
name = normalize(entity_name)
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
entities = load_json(ENTITIES_FILE)
|
|
|
|
# Entity info
|
|
if name in entities:
|
|
info = entities[name]
|
|
print(f"Entity: {name}")
|
|
print(f" Type: {info.get('type', 'unknown')}")
|
|
if info.get("email"):
|
|
print(f" Email: {info['email']}")
|
|
if info.get("context"):
|
|
print(f" Context: {info['context']}")
|
|
if info.get("source"):
|
|
print(f" Source: {info['source']}")
|
|
else:
|
|
print(f"Entity '{name}' not found in registry.")
|
|
|
|
# Relationships
|
|
rels = []
|
|
for key, rel in relationships.items():
|
|
if rel["a"] == name or rel["b"] == name:
|
|
other = rel["b"] if rel["a"] == name else rel["a"]
|
|
rels.append((other, rel))
|
|
|
|
if rels:
|
|
print(f"\nRelationships ({len(rels)}):")
|
|
for other, rel in sorted(rels, key=lambda x: -x[1].get("count", 1)):
|
|
types = ", ".join(rel.get("types", ["related"]))
|
|
print(f" {name} <-> {other} [{types}] (seen {rel.get('count', 1)}x)")
|
|
else:
|
|
print("\nNo relationships found.")
|
|
|
|
# Check life/areas/
|
|
for category in ["people", "companies", "projects"]:
|
|
area_path = LIFE_AREAS / category / name.replace(" ", "-")
|
|
if area_path.exists():
|
|
summary_path = area_path / "summary.md"
|
|
if summary_path.exists():
|
|
print(f"\nLife area ({category}): {area_path}")
|
|
content = summary_path.read_text(errors="replace")[:500]
|
|
print(content)
|
|
|
|
|
|
def cmd_graph():
|
|
"""Output a simple relationship graph summary."""
|
|
relationships = load_json(RELATIONSHIPS_FILE)
|
|
entities = load_json(ENTITIES_FILE)
|
|
|
|
if not relationships:
|
|
print("No relationships in knowledge graph.")
|
|
return
|
|
|
|
# Count connections per entity
|
|
connections = {}
|
|
for key, rel in relationships.items():
|
|
for name in [rel["a"], rel["b"]]:
|
|
connections[name] = connections.get(name, 0) + 1
|
|
|
|
# Sort by connections
|
|
top = sorted(connections.items(), key=lambda x: -x[1])
|
|
|
|
print(f"Knowledge Graph: {len(entities)} entities, {len(relationships)} relationships\n")
|
|
print("Top connected entities:")
|
|
for name, count in top[:20]:
|
|
etype = entities.get(name, {}).get("type", "?")
|
|
print(f" {name} ({etype}): {count} connections")
|
|
|
|
print(f"\nRecent relationships:")
|
|
recent = sorted(relationships.values(), key=lambda r: r.get("last_seen", ""), reverse=True)[:10]
|
|
for rel in recent:
|
|
types = ", ".join(rel.get("types", ["related"]))
|
|
print(f" {rel['a']} <-> {rel['b']} [{types}]")
|
|
|
|
|
|
def main():
|
|
if len(sys.argv) < 2:
|
|
print(__doc__)
|
|
sys.exit(1)
|
|
|
|
cmd = sys.argv[1]
|
|
|
|
if cmd == "bootstrap":
|
|
cmd_bootstrap()
|
|
elif cmd == "extract":
|
|
if len(sys.argv) < 3:
|
|
print("Usage: entity-manager.py extract \"text\"")
|
|
sys.exit(1)
|
|
cmd_extract(" ".join(sys.argv[2:]))
|
|
elif cmd == "relate":
|
|
if len(sys.argv) < 4:
|
|
print("Usage: entity-manager.py relate \"entity_a\" \"entity_b\" [type]")
|
|
sys.exit(1)
|
|
rel_type = sys.argv[4] if len(sys.argv) > 4 else "related"
|
|
cmd_relate(sys.argv[2], sys.argv[3], rel_type)
|
|
elif cmd == "query":
|
|
if len(sys.argv) < 3:
|
|
print("Usage: entity-manager.py query \"entity\"")
|
|
sys.exit(1)
|
|
cmd_query(" ".join(sys.argv[2:]))
|
|
elif cmd == "graph":
|
|
cmd_graph()
|
|
else:
|
|
print(f"Unknown command: {cmd}")
|
|
print(__doc__)
|
|
sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|