#!/usr/bin/env python3
"""Knowledge graph cleanup: classify unknowns, deduplicate entities, score relationships.

Usage: darkplex cleanup [--classify] [--dedupe] [--score] [--dry-run]

If no flags given, runs all three steps.
"""
import argparse
import json
import logging
import math
import shutil
import time
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path

import requests

log = logging.getLogger("knowledge_cleanup")

KNOWLEDGE_DIR = Path.home() / ".cortex" / "knowledge"
ENTITIES_PATH = KNOWLEDGE_DIR / "entities.json"
RELATIONSHIPS_PATH = KNOWLEDGE_DIR / "relationships.json"

OLLAMA_URL = "http://localhost:11434"
OLLAMA_MODEL = "qwen2.5:7b"

VALID_TYPES = {"person", "organization", "company", "project", "technology",
               "location", "event", "concept", "product"}


def backup(path: Path) -> Path:
    """Create timestamped backup."""
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = path.with_suffix(f".backup_{ts}.json")
    shutil.copy2(path, backup_path)
    log.info(f"Backed up {path.name} → {backup_path.name}")
    return backup_path


def atomic_write(path: Path, data):
    """Write JSON atomically via temp file."""
    tmp = path.with_suffix(".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    tmp.replace(path)
    log.info(f"Wrote {path.name}")


def load_entities() -> dict:
    with open(ENTITIES_PATH, encoding="utf-8") as f:
        return json.load(f)


def load_relationships() -> dict:
    with open(RELATIONSHIPS_PATH, encoding="utf-8") as f:
        return json.load(f)
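

# The cleanup steps below assume entity and relationship records shaped roughly
# like the following (inferred from the field accesses in this script, not a
# formal schema; the example names and types are made up):
#
#   entities.json:      {"Mondo Gate AG": {"type": "company", "aliases": ["mondo gate"], ...}, ...}
#   relationships.json: {"Sarah::Mondo Gate AG": {"a": "Sarah", "b": "Mondo Gate AG",
#                         "count": 3, "types": ["works_at"],
#                         "first_seen": "...", "last_seen": "..."}, ...}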


def ollama_generate(prompt: str, timeout: int = 120) -> str:
    """Call Ollama generate API."""
    resp = requests.post(f"{OLLAMA_URL}/api/generate", json={
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2000}
    }, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get("response", "")


# ─── Task 1: Classify Unknowns ───────────────────────────────────────────────

def classify_unknowns(entities: dict, dry_run: bool = False) -> dict:
    """Classify entities with type='unknown' using LLM."""
    unknowns = {k: v for k, v in entities.items()
                if isinstance(v, dict) and v.get("type") == "unknown"}
    if not unknowns:
        log.info("No unknown entities to classify.")
        return entities

    log.info(f"Classifying {len(unknowns)} unknown entities...")

    names = list(unknowns.keys())
    batch_size = 50
    results = {}

    for i in range(0, len(names), batch_size):
        batch = names[i:i + batch_size]
        batch_num = i // batch_size + 1
        total_batches = (len(names) + batch_size - 1) // batch_size
        log.info(f"Batch {batch_num}/{total_batches} ({len(batch)} entities)")

        numbered = "\n".join(f"{j+1}. {name}" for j, name in enumerate(batch))
        prompt = f"""Classify each entity name into exactly one category.
Categories: person, organization, company, project, technology, location, event, concept, product

If a name looks like a person's first name only (e.g. "sarah", "thomas"), classify as person.
If it's a common word that isn't clearly an entity (e.g. "ahnung", "wir", "evtl", "schau"), classify as concept.
If unsure, classify as concept.

Respond with ONLY a JSON object mapping the number to the category.
Example: {{"1": "person", "2": "company", "3": "concept"}}

Entities:
{numbered}

JSON:"""

        try:
            response = ollama_generate(prompt)
            # Extract JSON from response
            start = response.find("{")
            end = response.rfind("}") + 1
            if start >= 0 and end > start:
                parsed = json.loads(response[start:end])
                for idx_str, category in parsed.items():
                    idx = int(idx_str) - 1
                    if 0 <= idx < len(batch):
                        cat = category.strip().lower()
                        if cat in VALID_TYPES:
                            results[batch[idx]] = cat
        except Exception as e:
            log.warning(f"Batch {batch_num} failed: {e}")
            continue

        time.sleep(0.5)  # Be nice to Ollama

    # Apply results
    stats = defaultdict(int)
    for name, new_type in results.items():
        old_type = entities[name].get("type", "unknown")
        if old_type != new_type:
            stats[f"{old_type} → {new_type}"] += 1
            if not dry_run:
                entities[name]["type"] = new_type
                entities[name]["classified_by"] = "llm_cleanup"
                entities[name]["classified_at"] = datetime.now().isoformat()

    log.info(f"Classified {len(results)}/{len(unknowns)} unknowns:")
    for transition, count in sorted(stats.items(), key=lambda x: -x[1]):
        log.info(f"  {transition}: {count}")

    remaining = sum(1 for k, v in entities.items()
                    if isinstance(v, dict) and v.get("type") == "unknown")
    log.info(f"Remaining unknowns: {remaining}")
    return entities


# ─── Task 2: Deduplicate ─────────────────────────────────────────────────────

def find_duplicates(entities: dict) -> list:
    """Find duplicate entity groups via case-insensitive matching."""
    # Group by normalized name
    groups = defaultdict(list)
    for name in entities:
        normalized = name.strip().lower()
        groups[normalized].append(name)

    # Also check for substring containment (e.g. "mondo gate" vs "mondo gate ag")
    names_lower = {name: name.strip().lower() for name in entities}
    sorted_names = sorted(names_lower.items(), key=lambda x: len(x[1]))

    # Find names where one is a prefix/substring of another
    substring_pairs = []
    for i, (name_a, low_a) in enumerate(sorted_names):
        if len(low_a) < 3:
            continue
        for name_b, low_b in sorted_names[i+1:]:
            if low_a == low_b:
                continue
            if low_b.startswith(low_a + " ") or low_b.startswith(low_a + "-"):
                substring_pairs.append((name_a, name_b))

    # Build merge groups
    merge_groups = []

    # Exact case duplicates
    for normalized, names in groups.items():
        if len(names) > 1:
            merge_groups.append(names)

    # Substring matches (merge into existing groups or create new)
    for short, long in substring_pairs:
        found = False
        for group in merge_groups:
            if short in group or long in group:
                if short not in group:
                    group.append(short)
                if long not in group:
                    group.append(long)
                found = True
                break
        if not found:
            merge_groups.append([short, long])

    return merge_groups


def pick_canonical(names: list, entities: dict) -> str:
    """Pick the most detailed entity name as canonical."""
    # Prefer: has uppercase, then most fields, then longest name
    def score(name):
        e = entities.get(name, {})
        fields = len(e) if isinstance(e, dict) else 0
        length = len(name)
        has_upper = int(any(c.isupper() for c in name))
        return (has_upper, fields, length)
    return max(names, key=score)
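

# For instance, given a duplicate group like ["mondo gate", "Mondo Gate AG"]
# (illustrative names), "Mondo Gate AG" would be kept as canonical: the score
# tuple ranks uppercase presence first, then number of stored fields, then
# name length.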


def deduplicate(entities: dict, relationships: dict, dry_run: bool = False) -> tuple:
    """Deduplicate entities and update relationships."""
    groups = find_duplicates(entities)
    if not groups:
        log.info("No duplicates found.")
        return entities, relationships

    log.info(f"Found {len(groups)} duplicate groups:")

    alias_map = {}  # old_name → canonical_name
    for group in groups:
        canonical = pick_canonical(group, entities)
        aliases = [n for n in group if n != canonical]
        if not aliases:
            continue

        log.info(f"  Canonical: '{canonical}' ← aliases: {aliases}")
        for alias in aliases:
            alias_map[alias] = canonical

        if not dry_run:
            # Merge fields into canonical
            canonical_entry = entities.get(canonical, {})
            if not isinstance(canonical_entry, dict):
                canonical_entry = {}
            existing_aliases = canonical_entry.get("aliases", [])
            for alias in aliases:
                if alias not in existing_aliases:
                    existing_aliases.append(alias)
                alias_entry = entities.get(alias, {})
                if isinstance(alias_entry, dict):
                    # Merge non-existing fields
                    for k, v in alias_entry.items():
                        if k not in canonical_entry and k not in ("type", "aliases"):
                            canonical_entry[k] = v
            canonical_entry["aliases"] = existing_aliases
            entities[canonical] = canonical_entry

            # Remove aliases from entities
            for alias in aliases:
                if alias in entities:
                    del entities[alias]

    # Update relationships
    if not dry_run and alias_map:
        updated_rels = {}
        remapped = 0
        for key, rel in relationships.items():
            a = rel.get("a", "")
            b = rel.get("b", "")
            new_a = alias_map.get(a, a)
            new_b = alias_map.get(b, b)
            if new_a != a or new_b != b:
                remapped += 1
                rel["a"] = new_a
                rel["b"] = new_b
            new_key = f"{new_a}::{new_b}"
            if new_key in updated_rels:
                # Merge: sum counts, keep latest last_seen
                existing = updated_rels[new_key]
                existing["count"] = existing.get("count", 0) + rel.get("count", 0)
                if rel.get("last_seen", "") > existing.get("last_seen", ""):
                    existing["last_seen"] = rel["last_seen"]
                if rel.get("first_seen", "") < existing.get("first_seen", ""):
                    existing["first_seen"] = rel["first_seen"]
                # Merge types
                existing_types = set(existing.get("types", []))
                existing_types.update(rel.get("types", []))
                existing["types"] = list(existing_types)
            else:
                updated_rels[new_key] = rel

        log.info(f"Remapped {remapped} relationships, "
                 f"merged {len(relationships) - len(updated_rels)} duplicates")
        relationships = updated_rels

    log.info(f"Merged {len(alias_map)} aliases into "
             f"{len(set(alias_map.values()))} canonical entities")
    return entities, relationships


# ─── Task 3: Relationship Scoring ────────────────────────────────────────────

def score_relationships(relationships: dict, dry_run: bool = False) -> dict:
    """Add strength scores and decay old relationships."""
    now = datetime.now()
    decay_threshold = now - timedelta(days=30)
    removed = 0
    scored = 0
    decayed = 0
    to_remove = []

    for key, rel in relationships.items():
        count = rel.get("count", 1)
        last_seen_str = rel.get("last_seen", "")
        first_seen_str = rel.get("first_seen", "")
        types = rel.get("types", [])

        # Base strength from count (log scale, capped at 1)
        count_score = min(1.0, math.log(count + 1) / math.log(100))

        # Context diversity: more relationship types = stronger
        diversity_score = min(1.0, len(types) * 0.3)

        # Recency score
        recency_score = 1.0
        if last_seen_str:
            try:
                last_seen = datetime.fromisoformat(last_seen_str)
                days_ago = (now - last_seen).days
                if days_ago > 30:
                    recency_score = max(0.0, 1.0 - (days_ago - 30) / 180)
                    decayed += 1
            except (ValueError, TypeError):
                pass

        # Combined strength
        strength = round(
            count_score * 0.4 + diversity_score * 0.3 + recency_score * 0.3, 3
        )

        if strength < 0.1:
            to_remove.append(key)
            removed += 1
        else:
            if not dry_run:
                rel["strength"] = strength
            scored += 1

    if not dry_run:
        for key in to_remove:
            del relationships[key]

    log.info(f"Scored {scored} relationships, decayed {decayed}, "
             f"removed {removed} (strength < 0.1)")
    return relationships
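

# Rough sanity check of the weighting (illustrative numbers): a relationship with
# count=10, two types, and last_seen 90 days ago scores
#   count_score     = log(11)/log(100) ≈ 0.52
#   diversity_score = min(1.0, 2 * 0.3) = 0.6
#   recency_score   = 1.0 - (90 - 30)/180 ≈ 0.67
#   strength        ≈ 0.52*0.4 + 0.6*0.3 + 0.67*0.3 ≈ 0.59
# A relationship only falls below the 0.1 removal cutoff when count, type
# diversity, and recency are all low.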


# ─── Main ────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="Knowledge graph cleanup")
    parser.add_argument("--classify", action="store_true", help="Classify unknown entities")
    parser.add_argument("--dedupe", action="store_true", help="Deduplicate entities")
    parser.add_argument("--score", action="store_true", help="Score relationships")
    parser.add_argument("--dry-run", action="store_true", help="Show changes without writing")
    args = parser.parse_args()

    # If no specific flags, run all
    run_all = not (args.classify or args.dedupe or args.score)

    entities = load_entities()
    relationships = load_relationships()
    log.info(f"Loaded {len(entities)} entities, {len(relationships)} relationships")

    # Backup before any modifications
    if not args.dry_run:
        backup(ENTITIES_PATH)
        backup(RELATIONSHIPS_PATH)

    if args.dry_run:
        log.info("═══ DRY RUN — no files will be modified ═══")

    if run_all or args.classify:
        log.info("─── Step 1: Classify Unknowns ───")
        entities = classify_unknowns(entities, dry_run=args.dry_run)

    if run_all or args.dedupe:
        log.info("─── Step 2: Deduplicate Entities ───")
        entities, relationships = deduplicate(entities, relationships, dry_run=args.dry_run)

    if run_all or args.score:
        log.info("─── Step 3: Score Relationships ───")
        relationships = score_relationships(relationships, dry_run=args.dry_run)

    if not args.dry_run:
        atomic_write(ENTITIES_PATH, entities)
        atomic_write(RELATIONSHIPS_PATH, relationships)
        log.info(f"Done. Final: {len(entities)} entities, {len(relationships)} relationships")
    else:
        log.info(f"Dry run complete. Would result in: {len(entities)} entities, "
                 f"{len(relationships)} relationships")


if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        level=logging.INFO,
    )
    main()