Some checks failed
Tests / test (push) Failing after 2s
- Merged all unique darkplex-core modules into cortex: - intelligence/ subfolder (anticipator, collective, shared_memory, knowledge_cleanup, temporal, llm_extractor, loop) - governance/ subfolder (policy engine, risk scorer, evidence, enforcer, report generator) - entity_manager.py, knowledge_extractor.py - Fixed bare 'from intelligence.' imports to 'from cortex.intelligence.' - Added 'darkplex' CLI alias alongside 'cortex' - Package renamed to darkplex-core v0.2.0 - 405 tests passing (was 234) - 14 new test files covering all merged modules
420 lines
15 KiB
Python
420 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""Knowledge graph cleanup: classify unknowns, deduplicate entities, score relationships.
|
|
|
|
Usage:
|
|
darkplex cleanup [--classify] [--dedupe] [--score] [--dry-run]
|
|
|
|
If no flags given, runs all three steps.
|
|
"""
|
|
|
|
import argparse
|
|
import copy
|
|
import json
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import sys
|
|
import time
|
|
from collections import defaultdict
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
log = logging.getLogger("knowledge_cleanup")

# On-disk knowledge store locations (written/read by this script's load/atomic_write helpers).
KNOWLEDGE_DIR = Path.home() / ".cortex" / "knowledge"
ENTITIES_PATH = KNOWLEDGE_DIR / "entities.json"
RELATIONSHIPS_PATH = KNOWLEDGE_DIR / "relationships.json"

# Local Ollama endpoint and model used by ollama_generate() for classification.
OLLAMA_URL = "http://localhost:11434"
OLLAMA_MODEL = "qwen2.5:7b"

# The only entity types the LLM classifier is allowed to assign (see classify_unknowns).
VALID_TYPES = {"person", "organization", "company", "project", "technology",
               "location", "event", "concept", "product"}
|
|
|
|
|
|
def backup(path: Path) -> Path:
    """Copy *path* to a sibling file with a timestamped suffix and return it.

    Raises FileNotFoundError if *path* does not exist.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    dest = path.with_suffix(f".backup_{stamp}.json")
    shutil.copy2(path, dest)
    log.info(f"Backed up {path.name} → {dest.name}")
    return dest
|
|
|
|
|
|
def atomic_write(path: Path, data):
    """Write *data* as JSON to *path* atomically.

    Dumps to a sibling ``.tmp`` file, flushes it to disk, then renames it
    over the target so concurrent readers never see a half-written file.
    """
    tmp = path.with_suffix(".tmp")
    # Explicit UTF-8: the dump uses ensure_ascii=False, so non-ASCII content
    # would raise UnicodeEncodeError under a non-UTF-8 locale default.
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        # Flush + fsync before the rename, otherwise a crash right after
        # replace() could leave an empty/truncated file despite the "atomic"
        # rename.
        f.flush()
        os.fsync(f.fileno())
    tmp.replace(path)
    log.info(f"Wrote {path.name}")
|
|
|
|
|
|
def load_entities() -> dict:
    """Load and return the entities store from ENTITIES_PATH.

    Raises FileNotFoundError if the store has not been created yet.
    """
    # Explicit UTF-8: the store is written with ensure_ascii=False, so a
    # non-UTF-8 locale default encoding could fail to decode it.
    with open(ENTITIES_PATH, encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
def load_relationships() -> dict:
    """Load and return the relationships store from RELATIONSHIPS_PATH.

    Raises FileNotFoundError if the store has not been created yet.
    """
    # Explicit UTF-8 to match the ensure_ascii=False writer (atomic_write).
    with open(RELATIONSHIPS_PATH, encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
def ollama_generate(prompt: str, timeout: int = 120) -> str:
    """Send *prompt* to the local Ollama /api/generate endpoint.

    Returns the model's response text (empty string if the reply carries
    no "response" field). Raises requests.HTTPError on a non-2xx status.
    """
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        # Low temperature for deterministic-ish classification output.
        "options": {"temperature": 0.1, "num_predict": 2000},
    }
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=timeout)
    resp.raise_for_status()
    body = resp.json()
    return body.get("response", "")
|
|
|
|
|
|
# ─── Task 1: Classify Unknowns ───────────────────────────────────────────────
|
|
|
|
def classify_unknowns(entities: dict, dry_run: bool = False) -> dict:
    """Classify entities with type='unknown' using LLM.

    Sends unknown entity names to Ollama in batches of 50 and applies any
    returned category that is in VALID_TYPES. Mutates *entities* in place
    (unless dry_run) and returns it.
    """
    unknowns = {k: v for k, v in entities.items()
                if isinstance(v, dict) and v.get("type") == "unknown"}

    if not unknowns:
        log.info("No unknown entities to classify.")
        return entities

    log.info(f"Classifying {len(unknowns)} unknown entities...")

    names = list(unknowns.keys())
    batch_size = 50
    results = {}  # entity name → validated category string

    for i in range(0, len(names), batch_size):
        batch = names[i:i + batch_size]
        batch_num = i // batch_size + 1
        total_batches = (len(names) + batch_size - 1) // batch_size
        log.info(f"Batch {batch_num}/{total_batches} ({len(batch)} entities)")

        # 1-based numbering so the model's JSON keys map back to the batch.
        numbered = "\n".join(f"{j+1}. {name}" for j, name in enumerate(batch))
        prompt = f"""Classify each entity name into exactly one category.
Categories: person, organization, company, project, technology, location, event, concept, product

If a name looks like a person's first name only (e.g. "sarah", "thomas"), classify as person.
If it's a common word that isn't clearly an entity (e.g. "ahnung", "wir", "evtl", "schau"), classify as concept.
If unsure, classify as concept.

Respond with ONLY a JSON object mapping the number to the category. Example:
{{"1": "person", "2": "company", "3": "concept"}}

Entities:
{numbered}

JSON:"""

        try:
            response = ollama_generate(prompt)
            # Extract JSON from response: take the outermost {...} span, since
            # the model may wrap the JSON in extra prose.
            start = response.find("{")
            end = response.rfind("}") + 1
            if start >= 0 and end > start:
                parsed = json.loads(response[start:end])
                for idx_str, category in parsed.items():
                    idx = int(idx_str) - 1  # back to 0-based batch index
                    if 0 <= idx < len(batch):
                        cat = category.strip().lower()
                        # Silently drop categories outside the allowed set.
                        if cat in VALID_TYPES:
                            results[batch[idx]] = cat
        except Exception as e:
            # Best-effort: one failed batch must not abort the whole run.
            log.warning(f"Batch {batch_num} failed: {e}")
            continue

        time.sleep(0.5)  # Be nice to Ollama

    # Apply results
    stats = defaultdict(int)  # "old → new" transition → count, for reporting
    for name, new_type in results.items():
        old_type = entities[name].get("type", "unknown")
        if old_type != new_type:
            stats[f"{old_type} → {new_type}"] += 1
        if not dry_run:
            entities[name]["type"] = new_type
            # Provenance markers so later audits can tell LLM-assigned types apart.
            entities[name]["classified_by"] = "llm_cleanup"
            entities[name]["classified_at"] = datetime.now().isoformat()

    log.info(f"Classified {len(results)}/{len(unknowns)} unknowns:")
    for transition, count in sorted(stats.items(), key=lambda x: -x[1]):
        log.info(f"  {transition}: {count}")

    remaining = sum(1 for k, v in entities.items()
                    if isinstance(v, dict) and v.get("type") == "unknown")
    log.info(f"Remaining unknowns: {remaining}")

    return entities
|
|
|
|
|
|
# ─── Task 2: Deduplicate ─────────────────────────────────────────────────────
|
|
|
|
def find_duplicates(entities: dict) -> list:
    """Find duplicate entity groups via case-insensitive matching.

    Two signals feed the grouping:
      * exact case-insensitive duplicates ("Foo" vs "foo"), and
      * word-boundary prefix containment ("mondo gate" vs "mondo gate ag"),
        skipping prefixes shorter than 3 characters.

    Returns a list of groups (lists of original names). Every name appears
    in at most one group.
    """
    # Group by normalized (stripped, lowercased) name.
    groups = defaultdict(list)
    for name in entities:
        groups[name.strip().lower()].append(name)

    normalized = {name: name.strip().lower() for name in entities}
    # Shortest first, so each pair is (shorter, longer).
    by_length = sorted(normalized.items(), key=lambda item: len(item[1]))

    # Pairs where the shorter name is a word-boundary prefix of the longer.
    substring_pairs = []
    for i, (name_a, low_a) in enumerate(by_length):
        if len(low_a) < 3:
            continue  # too short to be a meaningful prefix
        for name_b, low_b in by_length[i + 1:]:
            if low_a == low_b:
                continue  # exact duplicates are already grouped above
            if low_b.startswith(low_a + " ") or low_b.startswith(low_a + "-"):
                substring_pairs.append((name_a, name_b))

    # Seed merge groups with the exact case duplicates.
    merge_groups = [names for names in groups.values() if len(names) > 1]

    # Fold each substring pair in. BUG FIX: previously, when the pair's two
    # names sat in *different* existing groups, only the first matching group
    # was extended and the name remained in the second group too, so one
    # entity could appear in two merge groups (and later be merged/deleted
    # inconsistently). Now all touched groups are unioned into one.
    for shorter, longer in substring_pairs:
        touched = [g for g in merge_groups if shorter in g or longer in g]
        if not touched:
            merge_groups.append([shorter, longer])
            continue
        target = touched[0]
        for other in touched[1:]:
            for n in other:
                if n not in target:
                    target.append(n)
            merge_groups.remove(other)
        if shorter not in target:
            target.append(shorter)
        if longer not in target:
            target.append(longer)

    return merge_groups
|
|
|
|
|
|
def pick_canonical(names: list, entities: dict) -> str:
    """Pick the most detailed entity name as canonical.

    Ranking (descending): names containing an uppercase letter beat
    all-lowercase ones, then entries with more stored fields, then longer
    names. Ties keep the first name in *names*.
    """
    def rank(candidate):
        entry = entities.get(candidate, {})
        field_count = len(entry) if isinstance(entry, dict) else 0
        mixed_case = int(any(ch.isupper() for ch in candidate))
        return (mixed_case, field_count, len(candidate))

    return max(names, key=rank)
|
|
|
|
|
|
def deduplicate(entities: dict, relationships: dict, dry_run: bool = False) -> tuple:
    """Deduplicate entities and update relationships.

    Merges each duplicate group (from find_duplicates) into a canonical
    entry, records merged names under "aliases", then rewrites relationship
    endpoints to the canonical names, merging relationships that collapse
    onto the same key.

    Returns (entities, relationships); the inputs are mutated in place
    unless dry_run is True (dry runs only log what would happen).
    """
    groups = find_duplicates(entities)

    if not groups:
        log.info("No duplicates found.")
        return entities, relationships

    log.info(f"Found {len(groups)} duplicate groups:")

    alias_map = {}  # old_name → canonical_name

    for group in groups:
        canonical = pick_canonical(group, entities)
        aliases = [n for n in group if n != canonical]

        if not aliases:
            continue

        log.info(f"  Canonical: '{canonical}' ← aliases: {aliases}")

        for alias in aliases:
            alias_map[alias] = canonical

        if not dry_run:
            # Merge alias fields into the canonical entry.
            canonical_entry = entities.get(canonical, {})
            if not isinstance(canonical_entry, dict):
                canonical_entry = {}

            existing_aliases = canonical_entry.get("aliases", [])
            for alias in aliases:
                if alias not in existing_aliases:
                    existing_aliases.append(alias)
                alias_entry = entities.get(alias, {})
                if isinstance(alias_entry, dict):
                    # Copy fields the canonical lacks; "type" and "aliases"
                    # always come from the canonical side.
                    for k, v in alias_entry.items():
                        if k not in canonical_entry and k not in ("type", "aliases"):
                            canonical_entry[k] = v

            canonical_entry["aliases"] = existing_aliases
            entities[canonical] = canonical_entry

            # Remove merged aliases from the store.
            for alias in aliases:
                if alias in entities:
                    del entities[alias]

    # Rewrite relationship endpoints to canonical names.
    if not dry_run and alias_map:
        updated_rels = {}
        remapped = 0
        for key, rel in relationships.items():
            a = rel.get("a", "")
            b = rel.get("b", "")
            new_a = alias_map.get(a, a)
            new_b = alias_map.get(b, b)

            if new_a != a or new_b != b:
                remapped += 1
                rel["a"] = new_a
                rel["b"] = new_b

            new_key = f"{new_a}::{new_b}"

            if new_key in updated_rels:
                # Merge: sum counts, keep latest last_seen / earliest first_seen
                existing = updated_rels[new_key]
                existing["count"] = existing.get("count", 0) + rel.get("count", 0)
                rel_last = rel.get("last_seen", "")
                if rel_last and rel_last > existing.get("last_seen", ""):
                    existing["last_seen"] = rel_last
                # BUG FIX: the old code compared rel.get("first_seen", "")
                # but then read rel["first_seen"], raising KeyError whenever
                # the key was missing (empty string sorts before any ISO
                # timestamp). It also never let an existing entry without a
                # first_seen adopt one from the merged relationship.
                rel_first = rel.get("first_seen", "")
                exist_first = existing.get("first_seen", "")
                if rel_first and (not exist_first or rel_first < exist_first):
                    existing["first_seen"] = rel_first
                # Union the relationship type tags.
                existing_types = set(existing.get("types", []))
                existing_types.update(rel.get("types", []))
                existing["types"] = list(existing_types)
            else:
                updated_rels[new_key] = rel

        log.info(f"Remapped {remapped} relationships, merged {len(relationships) - len(updated_rels)} duplicates")
        relationships = updated_rels

    log.info(f"Merged {len(alias_map)} aliases into {len(set(alias_map.values()))} canonical entities")

    return entities, relationships
|
|
|
|
|
|
# ─── Task 3: Relationship Scoring ────────────────────────────────────────────
|
|
|
|
def score_relationships(relationships: dict, dry_run: bool = False) -> dict:
    """Add strength scores and decay old relationships.

    strength = 0.4 * count score (log scale, saturating near count 100)
             + 0.3 * type diversity (saturating at ~3 distinct types)
             + 0.3 * recency (full for 30 days, then linear decay to 0
               over the following 180 days)

    Relationships with strength < 0.1 are removed. Mutates and returns
    *relationships*; in dry_run mode nothing is written or removed, only
    counted and logged.
    """
    import math  # hoisted: was re-executed inside the loop on every iteration

    now = datetime.now()
    # (removed unused locals: decay_threshold, first_seen_str)

    removed = 0
    scored = 0
    decayed = 0

    to_remove = []  # keys to delete after the loop (no mutation mid-iteration)

    for key, rel in relationships.items():
        count = rel.get("count", 1)
        last_seen_str = rel.get("last_seen", "")
        types = rel.get("types", [])

        # Base strength from count (log scale, capped at 1)
        count_score = min(1.0, math.log(count + 1) / math.log(100))

        # Context diversity: more relationship types = stronger
        diversity_score = min(1.0, len(types) * 0.3)

        # Recency score: full until 30 idle days, then linear decay
        recency_score = 1.0
        if last_seen_str:
            try:
                last_seen = datetime.fromisoformat(last_seen_str)
                days_ago = (now - last_seen).days
                if days_ago > 30:
                    recency_score = max(0.0, 1.0 - (days_ago - 30) / 180)
                    decayed += 1
            except (ValueError, TypeError):
                pass  # unparsable timestamp → treat as fresh

        # Combined strength
        strength = round(
            count_score * 0.4 + diversity_score * 0.3 + recency_score * 0.3,
            3
        )

        if strength < 0.1:
            to_remove.append(key)
            removed += 1
        else:
            if not dry_run:
                rel["strength"] = strength
            scored += 1

    if not dry_run:
        for key in to_remove:
            del relationships[key]

    log.info(f"Scored {scored} relationships, decayed {decayed}, removed {removed} (strength < 0.1)")

    return relationships
|
|
|
|
|
|
# ─── Main ────────────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """CLI entry point: parse flags, run the requested cleanup steps, persist results."""
    parser = argparse.ArgumentParser(description="Knowledge graph cleanup")
    parser.add_argument("--classify", action="store_true", help="Classify unknown entities")
    parser.add_argument("--dedupe", action="store_true", help="Deduplicate entities")
    parser.add_argument("--score", action="store_true", help="Score relationships")
    parser.add_argument("--dry-run", action="store_true", help="Show changes without writing")
    args = parser.parse_args()

    dry = args.dry_run
    # No step flags at all means "run everything".
    run_everything = not any((args.classify, args.dedupe, args.score))

    entities = load_entities()
    relationships = load_relationships()

    log.info(f"Loaded {len(entities)} entities, {len(relationships)} relationships")

    if dry:
        log.info("═══ DRY RUN — no files will be modified ═══")
    else:
        # Backup both stores before any modifications.
        backup(ENTITIES_PATH)
        backup(RELATIONSHIPS_PATH)

    if run_everything or args.classify:
        log.info("─── Step 1: Classify Unknowns ───")
        entities = classify_unknowns(entities, dry_run=dry)

    if run_everything or args.dedupe:
        log.info("─── Step 2: Deduplicate Entities ───")
        entities, relationships = deduplicate(entities, relationships, dry_run=dry)

    if run_everything or args.score:
        log.info("─── Step 3: Score Relationships ───")
        relationships = score_relationships(relationships, dry_run=dry)

    if dry:
        log.info(f"Dry run complete. Would result in: {len(entities)} entities, {len(relationships)} relationships")
    else:
        atomic_write(ENTITIES_PATH, entities)
        atomic_write(RELATIONSHIPS_PATH, relationships)
        log.info(f"Done. Final: {len(entities)} entities, {len(relationships)} relationships")
|
|
|
|
|
|
if __name__ == "__main__":
    # Configure root logging for CLI use before running the pipeline;
    # all step output goes through the module logger.
    logging.basicConfig(
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        level=logging.INFO,
    )
    main()
|