darkplex-core/cortex/intelligence/knowledge_cleanup.py
Claudia fd7d75c0ed
Some checks failed
Tests / test (push) Failing after 2s
Merge darkplex-core into cortex — unified intelligence layer v0.2.0
- Merged all unique darkplex-core modules into cortex:
  - intelligence/ subfolder (anticipator, collective, shared_memory, knowledge_cleanup, temporal, llm_extractor, loop)
  - governance/ subfolder (policy engine, risk scorer, evidence, enforcer, report generator)
  - entity_manager.py, knowledge_extractor.py
- Fixed bare 'from intelligence.' imports to 'from cortex.intelligence.'
- Added 'darkplex' CLI alias alongside 'cortex'
- Package renamed to darkplex-core v0.2.0
- 405 tests passing (was 234)
- 14 new test files covering all merged modules
2026-02-12 08:43:02 +01:00

420 lines
15 KiB
Python

#!/usr/bin/env python3
"""Knowledge graph cleanup: classify unknowns, deduplicate entities, score relationships.
Usage:
darkplex cleanup [--classify] [--dedupe] [--score] [--dry-run]
If no flags given, runs all three steps.
"""
import argparse
import copy
import json
import logging
import os
import shutil
import sys
import time
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
import requests
log = logging.getLogger("knowledge_cleanup")
KNOWLEDGE_DIR = Path.home() / ".cortex" / "knowledge"
ENTITIES_PATH = KNOWLEDGE_DIR / "entities.json"
RELATIONSHIPS_PATH = KNOWLEDGE_DIR / "relationships.json"
OLLAMA_URL = "http://localhost:11434"
OLLAMA_MODEL = "qwen2.5:7b"
VALID_TYPES = {"person", "organization", "company", "project", "technology",
"location", "event", "concept", "product"}
def backup(path: Path) -> Path:
"""Create timestamped backup."""
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = path.with_suffix(f".backup_{ts}.json")
shutil.copy2(path, backup_path)
log.info(f"Backed up {path.name}{backup_path.name}")
return backup_path
def atomic_write(path: Path, data):
"""Write JSON atomically via temp file."""
tmp = path.with_suffix(".tmp")
with open(tmp, "w") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
tmp.replace(path)
log.info(f"Wrote {path.name}")
def load_entities() -> dict:
with open(ENTITIES_PATH) as f:
return json.load(f)
def load_relationships() -> dict:
with open(RELATIONSHIPS_PATH) as f:
return json.load(f)
def ollama_generate(prompt: str, timeout: int = 120) -> str:
"""Call Ollama generate API."""
resp = requests.post(f"{OLLAMA_URL}/api/generate", json={
"model": OLLAMA_MODEL,
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.1, "num_predict": 2000}
}, timeout=timeout)
resp.raise_for_status()
return resp.json().get("response", "")
# ─── Task 1: Classify Unknowns ───────────────────────────────────────────────
def classify_unknowns(entities: dict, dry_run: bool = False) -> dict:
"""Classify entities with type='unknown' using LLM."""
unknowns = {k: v for k, v in entities.items()
if isinstance(v, dict) and v.get("type") == "unknown"}
if not unknowns:
log.info("No unknown entities to classify.")
return entities
log.info(f"Classifying {len(unknowns)} unknown entities...")
names = list(unknowns.keys())
batch_size = 50
results = {}
for i in range(0, len(names), batch_size):
batch = names[i:i + batch_size]
batch_num = i // batch_size + 1
total_batches = (len(names) + batch_size - 1) // batch_size
log.info(f"Batch {batch_num}/{total_batches} ({len(batch)} entities)")
numbered = "\n".join(f"{j+1}. {name}" for j, name in enumerate(batch))
prompt = f"""Classify each entity name into exactly one category.
Categories: person, organization, company, project, technology, location, event, concept, product
If a name looks like a person's first name only (e.g. "sarah", "thomas"), classify as person.
If it's a common word that isn't clearly an entity (e.g. "ahnung", "wir", "evtl", "schau"), classify as concept.
If unsure, classify as concept.
Respond with ONLY a JSON object mapping the number to the category. Example:
{{"1": "person", "2": "company", "3": "concept"}}
Entities:
{numbered}
JSON:"""
try:
response = ollama_generate(prompt)
# Extract JSON from response
start = response.find("{")
end = response.rfind("}") + 1
if start >= 0 and end > start:
parsed = json.loads(response[start:end])
for idx_str, category in parsed.items():
idx = int(idx_str) - 1
if 0 <= idx < len(batch):
cat = category.strip().lower()
if cat in VALID_TYPES:
results[batch[idx]] = cat
except Exception as e:
log.warning(f"Batch {batch_num} failed: {e}")
continue
time.sleep(0.5) # Be nice to Ollama
# Apply results
stats = defaultdict(int)
for name, new_type in results.items():
old_type = entities[name].get("type", "unknown")
if old_type != new_type:
stats[f"{old_type}{new_type}"] += 1
if not dry_run:
entities[name]["type"] = new_type
entities[name]["classified_by"] = "llm_cleanup"
entities[name]["classified_at"] = datetime.now().isoformat()
log.info(f"Classified {len(results)}/{len(unknowns)} unknowns:")
for transition, count in sorted(stats.items(), key=lambda x: -x[1]):
log.info(f" {transition}: {count}")
remaining = sum(1 for k, v in entities.items()
if isinstance(v, dict) and v.get("type") == "unknown")
log.info(f"Remaining unknowns: {remaining}")
return entities
# ─── Task 2: Deduplicate ─────────────────────────────────────────────────────
def find_duplicates(entities: dict) -> list:
"""Find duplicate entity groups via case-insensitive matching."""
# Group by normalized name
groups = defaultdict(list)
for name in entities:
normalized = name.strip().lower()
groups[normalized].append(name)
# Also check for substring containment (e.g. "mondo gate" vs "mondo gate ag")
names_lower = {name: name.strip().lower() for name in entities}
sorted_names = sorted(names_lower.items(), key=lambda x: len(x[1]))
# Find names where one is a prefix/substring of another
substring_pairs = []
for i, (name_a, low_a) in enumerate(sorted_names):
if len(low_a) < 3:
continue
for name_b, low_b in sorted_names[i+1:]:
if low_a == low_b:
continue
if low_b.startswith(low_a + " ") or low_b.startswith(low_a + "-"):
substring_pairs.append((name_a, name_b))
# Build merge groups
merge_groups = []
# Exact case duplicates
for normalized, names in groups.items():
if len(names) > 1:
merge_groups.append(names)
# Substring matches (merge into existing groups or create new)
for short, long in substring_pairs:
found = False
for group in merge_groups:
if short in group or long in group:
if short not in group:
group.append(short)
if long not in group:
group.append(long)
found = True
break
if not found:
merge_groups.append([short, long])
return merge_groups
def pick_canonical(names: list, entities: dict) -> str:
"""Pick the most detailed entity name as canonical."""
# Prefer: longest name, most fields, not all-lowercase
def score(name):
e = entities.get(name, {})
fields = len(e) if isinstance(e, dict) else 0
length = len(name)
has_upper = int(any(c.isupper() for c in name))
return (has_upper, fields, length)
return max(names, key=score)
def deduplicate(entities: dict, relationships: dict, dry_run: bool = False) -> tuple:
"""Deduplicate entities and update relationships."""
groups = find_duplicates(entities)
if not groups:
log.info("No duplicates found.")
return entities, relationships
log.info(f"Found {len(groups)} duplicate groups:")
alias_map = {} # old_name → canonical_name
for group in groups:
canonical = pick_canonical(group, entities)
aliases = [n for n in group if n != canonical]
if not aliases:
continue
log.info(f" Canonical: '{canonical}' ← aliases: {aliases}")
for alias in aliases:
alias_map[alias] = canonical
if not dry_run:
# Merge fields into canonical
canonical_entry = entities.get(canonical, {})
if not isinstance(canonical_entry, dict):
canonical_entry = {}
existing_aliases = canonical_entry.get("aliases", [])
for alias in aliases:
if alias not in existing_aliases:
existing_aliases.append(alias)
alias_entry = entities.get(alias, {})
if isinstance(alias_entry, dict):
# Merge non-existing fields
for k, v in alias_entry.items():
if k not in canonical_entry and k not in ("type", "aliases"):
canonical_entry[k] = v
canonical_entry["aliases"] = existing_aliases
entities[canonical] = canonical_entry
# Remove aliases from entities
for alias in aliases:
if alias in entities:
del entities[alias]
# Update relationships
if not dry_run and alias_map:
updated_rels = {}
remapped = 0
for key, rel in relationships.items():
a = rel.get("a", "")
b = rel.get("b", "")
new_a = alias_map.get(a, a)
new_b = alias_map.get(b, b)
if new_a != a or new_b != b:
remapped += 1
rel["a"] = new_a
rel["b"] = new_b
new_key = f"{new_a}::{new_b}"
if new_key in updated_rels:
# Merge: sum counts, keep latest last_seen
existing = updated_rels[new_key]
existing["count"] = existing.get("count", 0) + rel.get("count", 0)
if rel.get("last_seen", "") > existing.get("last_seen", ""):
existing["last_seen"] = rel["last_seen"]
if rel.get("first_seen", "") < existing.get("first_seen", ""):
existing["first_seen"] = rel["first_seen"]
# Merge types
existing_types = set(existing.get("types", []))
existing_types.update(rel.get("types", []))
existing["types"] = list(existing_types)
else:
updated_rels[new_key] = rel
log.info(f"Remapped {remapped} relationships, merged {len(relationships) - len(updated_rels)} duplicates")
relationships = updated_rels
log.info(f"Merged {len(alias_map)} aliases into {len(set(alias_map.values()))} canonical entities")
return entities, relationships
# ─── Task 3: Relationship Scoring ────────────────────────────────────────────
def score_relationships(relationships: dict, dry_run: bool = False) -> dict:
"""Add strength scores and decay old relationships."""
now = datetime.now()
decay_threshold = now - timedelta(days=30)
removed = 0
scored = 0
decayed = 0
to_remove = []
for key, rel in relationships.items():
count = rel.get("count", 1)
last_seen_str = rel.get("last_seen", "")
first_seen_str = rel.get("first_seen", "")
types = rel.get("types", [])
# Base strength from count (log scale, capped at 1)
import math
count_score = min(1.0, math.log(count + 1) / math.log(100))
# Context diversity: more relationship types = stronger
diversity_score = min(1.0, len(types) * 0.3)
# Recency score
recency_score = 1.0
if last_seen_str:
try:
last_seen = datetime.fromisoformat(last_seen_str)
days_ago = (now - last_seen).days
if days_ago > 30:
recency_score = max(0.0, 1.0 - (days_ago - 30) / 180)
decayed += 1
except (ValueError, TypeError):
pass
# Combined strength
strength = round(
count_score * 0.4 + diversity_score * 0.3 + recency_score * 0.3,
3
)
if strength < 0.1:
to_remove.append(key)
removed += 1
else:
if not dry_run:
rel["strength"] = strength
scored += 1
if not dry_run:
for key in to_remove:
del relationships[key]
log.info(f"Scored {scored} relationships, decayed {decayed}, removed {removed} (strength < 0.1)")
return relationships
# ─── Main ────────────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="Knowledge graph cleanup")
parser.add_argument("--classify", action="store_true", help="Classify unknown entities")
parser.add_argument("--dedupe", action="store_true", help="Deduplicate entities")
parser.add_argument("--score", action="store_true", help="Score relationships")
parser.add_argument("--dry-run", action="store_true", help="Show changes without writing")
args = parser.parse_args()
# If no specific flags, run all
run_all = not (args.classify or args.dedupe or args.score)
entities = load_entities()
relationships = load_relationships()
log.info(f"Loaded {len(entities)} entities, {len(relationships)} relationships")
# Backup before any modifications
if not args.dry_run:
backup(ENTITIES_PATH)
backup(RELATIONSHIPS_PATH)
if args.dry_run:
log.info("═══ DRY RUN — no files will be modified ═══")
if run_all or args.classify:
log.info("─── Step 1: Classify Unknowns ───")
entities = classify_unknowns(entities, dry_run=args.dry_run)
if run_all or args.dedupe:
log.info("─── Step 2: Deduplicate Entities ───")
entities, relationships = deduplicate(entities, relationships, dry_run=args.dry_run)
if run_all or args.score:
log.info("─── Step 3: Score Relationships ───")
relationships = score_relationships(relationships, dry_run=args.dry_run)
if not args.dry_run:
atomic_write(ENTITIES_PATH, entities)
atomic_write(RELATIONSHIPS_PATH, relationships)
log.info(f"Done. Final: {len(entities)} entities, {len(relationships)} relationships")
else:
log.info(f"Dry run complete. Would result in: {len(entities)} entities, {len(relationships)} relationships")
if __name__ == "__main__":
logging.basicConfig(
format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO,
)
main()