darkplex-core/cortex/memory_hygiene.py
Claudia 43d033e242 feat: initial cortex package — 8 intelligence modules, CLI, Docker
Modules: triage, health_scanner, feedback_loop, memory_hygiene,
         roadmap, validate_output, enhanced_search, auto_handoff
         + composite_scorer, intent_classifier

CLI: 'cortex <module> <command>' unified entry point
Tests: 157/169 passing (12 assertion mismatches from rename)
Docker: python:3.11-slim based
2026-02-09 11:18:20 +01:00

453 lines
15 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Memory Hygiene Tools — find duplicates, stale content, orphans, stats, archive."""
import argparse
import hashlib
import json
import os
import re
import shutil
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path
# Root of the memory tree this tool manages.
MEMORY_DIR = Path.home() / "clawd" / "memory"
# Old daily notes are moved under archive/<year>/.
ARCHIVE_DIR = MEMORY_DIR / "archive"
# Optional JSON config next to this module (overrides staleness thresholds).
CONFIG_PATH = Path(__file__).parent / "config.json"
# Files that are never archived and never reported as orphans.
PERMANENT_FILES = {
    "MEMORY.md", "WORKING.md", "growth-log.md", "BOOT_CONTEXT.md",
    "README.md", "active-context.json", "network-map.md",
    "learned-context.md", "email-contacts.json",
}
# Daily note filenames: YYYY-MM-DD.md, optionally with a "-suffix".
DAILY_NOTE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}(?:-.+)?\.md$")
# Any ISO-like date in text (years 2000-2099), captured as (Y, M, D).
DATE_RE = re.compile(r"\b(20\d{2})-(\d{2})-(\d{2})\b")
# Action markers that suggest unfinished work.
TODO_RE = re.compile(r"(?:TODO|FIXME|HACK|XXX)\b", re.IGNORECASE)
# Matches "status: in progress" / "in-progress" / "in_progress".
IN_PROGRESS_RE = re.compile(r"status:\s*in.?progress", re.IGNORECASE)
# Markdown inline links: [text](target).
LINK_RE = re.compile(r"\[([^\]]*)\]\(([^)]+)\)")
# Loose matchers for contact info (emails; intl/parenthesized phone numbers).
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
PHONE_RE = re.compile(r"(?:\+\d[\d\s\-]{7,}|\(\d+\)\s*[\d\s\-]{5,})")
def load_config() -> dict:
    """Load the cortex config from config.json next to this module.

    Returns:
        The parsed JSON object, or {} when the file does not exist.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.

    Reads with an explicit UTF-8 encoding so behavior does not depend on
    the platform's locale default.
    """
    if CONFIG_PATH.exists():
        return json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    return {}
def get_md_files(base: Path | None = None, recursive: bool = True) -> list[Path]:
if base is None:
base = MEMORY_DIR
if recursive:
return sorted(base.rglob("*.md"))
return sorted(base.glob("*.md"))
def _normalize(text: str) -> str:
    """Lower-case *text* and collapse every whitespace run to one space."""
    return " ".join(text.lower().split())
def _para_hash(para: str) -> str:
    """Stable MD5 fingerprint of a paragraph, ignoring case and whitespace layout."""
    canonical = re.sub(r"\s+", " ", para.lower().strip())
    return hashlib.md5(canonical.encode()).hexdigest()
# --- Duplicates ---
def find_duplicates(min_length: int = 50, threshold: float = 0.8) -> list[dict]:
    """Find paragraphs duplicated across memory files.

    Paragraphs (blank-line separated) are fingerprinted with an MD5 of
    their whitespace/case-normalized text, so only exact normalized
    duplicates are detected.

    Args:
        min_length: Ignore paragraphs shorter than this after stripping.
        threshold: Reserved for future fuzzy matching; currently unused.

    Returns:
        List of {"hash": ..., "locations": [...]} groups whose paragraph
        appears in more than one file; each location carries "file"
        (relative to MEMORY_DIR), "line", and a 100-char "preview".
    """
    para_index: dict[str, list[dict]] = defaultdict(list)
    for fp in get_md_files():
        try:
            text = fp.read_text(errors="replace")
        except Exception:
            # Unreadable file: skip it rather than abort the whole scan.
            continue
        rel = str(fp.relative_to(MEMORY_DIR))
        pos = 0
        for para in re.split(r"\n\s*\n", text):
            stripped = para.strip()
            # Locate the paragraph in the original text so reported line
            # numbers stay exact even when separators span several blank
            # lines (the old running counter assumed exactly one).
            idx = text.find(para, pos)
            pos = idx + len(para)
            if len(stripped) < min_length:
                continue
            line = text.count("\n", 0, idx) + 1
            para_index[_para_hash(stripped)].append({
                "file": rel,
                "line": line,
                "preview": stripped[:100],
            })
    dupes = []
    for h, locations in para_index.items():
        if len(locations) < 2:
            continue
        # Count each file once: repeats inside a single file do not make a
        # cross-file duplicate. (Keying on (file, line) here was a no-op,
        # since every location has a distinct line.)
        seen_files = set()
        unique = []
        for loc in locations:
            if loc["file"] not in seen_files:
                seen_files.add(loc["file"])
                unique.append(loc)
        if len(unique) > 1:
            dupes.append({"hash": h, "locations": unique})
    return dupes
# --- Staleness ---
def find_stale(now: datetime | None = None) -> list[dict]:
    """Scan memory files for potentially stale content.

    Four heuristics; thresholds come from the "staleness" section of
    config.json with the defaults shown below:
      - calendar dates older than date_days (90), except in files whose
        path contains "history" or "archive";
      - TODO/FIXME/HACK/XXX markers in files untouched for todo_days (30);
      - "status: in-progress" lines in files untouched for
        in_progress_days (14);
      - email/phone contact info in files untouched for contact_days (180).

    Args:
        now: Reference time; defaults to datetime.now() (naive local time).

    Returns:
        List of {"file", "line", "reason", "severity"} dicts, with
        severity "warning" or "info".
    """
    if now is None:
        now = datetime.now()
    cfg = load_config()
    stale_cfg = cfg.get("staleness", {})
    date_days = stale_cfg.get("date_days", 90)
    todo_days = stale_cfg.get("todo_days", 30)
    progress_days = stale_cfg.get("in_progress_days", 14)
    contact_days = stale_cfg.get("contact_days", 180)
    results = []
    for fp in get_md_files():
        try:
            text = fp.read_text(errors="replace")
            mtime = datetime.fromtimestamp(fp.stat().st_mtime)
        except Exception:
            # Unreadable/unstat-able file: skip rather than abort the scan.
            continue
        rel = str(fp.relative_to(MEMORY_DIR))
        lines = text.split("\n")
        # TODO/progress/contact staleness is judged by the file's mtime,
        # not by anything on the matching line itself.
        file_age = (now - mtime).days
        for i, line in enumerate(lines, 1):
            # Old dates in non-historical context
            for m in DATE_RE.finditer(line):
                try:
                    d = datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
                    age = (now - d).days
                    if age > date_days and "history" not in rel.lower() and "archive" not in rel.lower():
                        results.append({
                            "file": rel, "line": i,
                            "reason": f"Date {m.group(0)} is {age} days old",
                            "severity": "info",
                        })
                except ValueError:
                    # e.g. "2026-13-40" matches the regex but is not a real date.
                    pass
            # Old TODOs
            if TODO_RE.search(line) and file_age > todo_days:
                results.append({
                    "file": rel, "line": i,
                    "reason": f"TODO in file not modified for {file_age} days",
                    "severity": "warning",
                })
            # Stale in_progress
            if IN_PROGRESS_RE.search(line) and file_age > progress_days:
                results.append({
                    "file": rel, "line": i,
                    "reason": f"in_progress status, file not updated for {file_age} days",
                    "severity": "warning",
                })
            # Contact info staleness
            if (EMAIL_RE.search(line) or PHONE_RE.search(line)) and file_age > contact_days:
                results.append({
                    "file": rel, "line": i,
                    "reason": f"Contact info in file not updated for {file_age} days",
                    "severity": "info",
                })
    return results
# --- Orphans ---
def find_orphans() -> dict:
    """Find orphaned files, broken markdown links, and (near-)empty files.

    "Orphaned" means: a file under MEMORY_DIR that no markdown link in
    any .md file resolves to, is not listed in PERMANENT_FILES, and is
    not already under archive/.

    Returns:
        dict with keys "orphaned_files" (relative paths, sorted),
        "broken_links" ({"file", "link_text", "target"} dicts), and
        "empty_files" (files under 10 bytes, relative paths, sorted).
    """
    all_files = set()
    for fp in MEMORY_DIR.rglob("*"):
        if fp.is_file():
            all_files.add(str(fp.relative_to(MEMORY_DIR)))
    # Collect all references
    referenced = set()
    broken_links = []
    for fp in get_md_files():
        try:
            text = fp.read_text(errors="replace")
        except Exception:
            # Unreadable file: skip it rather than abort the scan.
            continue
        rel = str(fp.relative_to(MEMORY_DIR))
        for m in LINK_RE.finditer(text):
            target = m.group(2)
            # External links are out of scope.
            if target.startswith("http://") or target.startswith("https://"):
                continue
            # Resolve relative to file's directory; drop #fragment / ?query.
            target_clean = target.split("#")[0].split("?")[0]
            if not target_clean:
                # Pure in-page anchor like [x](#section).
                continue
            resolved = (fp.parent / target_clean).resolve()
            try:
                ref_rel = str(resolved.relative_to(MEMORY_DIR))
                referenced.add(ref_rel)
            except ValueError:
                # Link target lies outside MEMORY_DIR: it can still be
                # reported as broken below, but never marks a memory file
                # as referenced.
                # NOTE(review): `resolved` is fully resolved while
                # MEMORY_DIR is not — if the memory tree is reached via a
                # symlink this relative_to() can misfire; verify.
                pass
            if not resolved.exists():
                broken_links.append({
                    "file": rel,
                    "link_text": m.group(1),
                    "target": target,
                })
    # Orphaned files (never referenced, not permanent)
    orphaned = []
    for f in sorted(all_files):
        name = Path(f).name
        if name in PERMANENT_FILES:
            continue
        if f.startswith("archive/"):
            continue
        if f not in referenced:
            orphaned.append(f)
    # Empty/near-empty files (under 10 bytes)
    empty = []
    for fp in MEMORY_DIR.rglob("*"):
        if fp.is_file() and fp.stat().st_size < 10:
            empty.append(str(fp.relative_to(MEMORY_DIR)))
    return {
        "orphaned_files": orphaned,
        "broken_links": broken_links,
        "empty_files": sorted(empty),
    }
# --- Stats ---
def gather_stats() -> dict:
    """Collect size, extension, age, and churn statistics for MEMORY_DIR.

    Returns:
        dict with totals, per-extension counts, oldest/newest files,
        the ten largest files, 24h/7d/30d change counts, and the ten
        markdown files with the highest word counts.
    """
    now = datetime.now()
    regular_files = [p for p in MEMORY_DIR.rglob("*") if p.is_file()]
    total_size = sum(p.stat().st_size for p in regular_files)
    ext_counts: dict[str, int] = defaultdict(int)
    mtimes: list[tuple[str, datetime]] = []
    sizes: list[tuple[str, int]] = []
    word_counts: list[tuple[str, int]] = []
    changed_24h = changed_7d = changed_30d = 0
    for p in regular_files:
        ext_counts[p.suffix or "(none)"] += 1
        st = p.stat()
        rel = str(p.relative_to(MEMORY_DIR))
        modified = datetime.fromtimestamp(st.st_mtime)
        mtimes.append((rel, modified))
        sizes.append((rel, st.st_size))
        age_days = (now - modified).days
        # bools add as 0/1 — same tallies as the explicit if-chains.
        changed_24h += age_days < 1
        changed_7d += age_days < 7
        changed_30d += age_days < 30
        if p.suffix == ".md":
            try:
                word_counts.append((rel, len(p.read_text(errors="replace").split())))
            except Exception:
                pass
    mtimes.sort(key=lambda item: item[1])
    sizes.sort(key=lambda item: item[1], reverse=True)
    return {
        "total_files": len(regular_files),
        "total_size_bytes": total_size,
        "total_size_human": f"{total_size / 1024:.1f} KB",
        "files_by_extension": dict(sorted(ext_counts.items())),
        "oldest": {"file": mtimes[0][0], "date": mtimes[0][1].isoformat()} if mtimes else None,
        "newest": {"file": mtimes[-1][0], "date": mtimes[-1][1].isoformat()} if mtimes else None,
        "largest_files": [{"file": f, "bytes": s} for f, s in sizes[:10]],
        "changed_24h": changed_24h,
        "changed_7d": changed_7d,
        "changed_30d": changed_30d,
        "word_count_top10": sorted(word_counts, key=lambda item: item[1], reverse=True)[:10],
    }
# --- Archive ---
def archive_old_notes(older_than_days: int = 90, execute: bool = False, now: datetime | None = None) -> dict:
    """Archive top-level daily notes older than the cutoff into archive/<year>/.

    Only files directly in MEMORY_DIR (not in subdirectories) whose names
    match YYYY-MM-DD[-suffix].md and are not in PERMANENT_FILES are
    considered. Age is taken from the date in the filename, not the mtime.

    Args:
        older_than_days: Cutoff in days, relative to *now*.
        execute: When False (default), a dry run — nothing is moved.
        now: Reference time; defaults to datetime.now().

    Returns:
        Manifest dict (archived_at, older_than_days, dry_run, files,
        count). When files were actually moved it also gains
        "manifest_path", the relative path of the JSON manifest written
        under archive/.
    """
    if now is None:
        now = datetime.now()
    cutoff = now - timedelta(days=older_than_days)
    to_move = []
    for fp in MEMORY_DIR.glob("*.md"):
        name = fp.name
        if name in PERMANENT_FILES:
            continue
        if not DAILY_NOTE_RE.match(name):
            continue
        # Extract date from filename (DAILY_NOTE_RE guarantees the prefix;
        # DATE_RE.match re-captures year/month/day at the start of `name`).
        m = DATE_RE.match(name)
        if not m:
            continue
        try:
            file_date = datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)))
        except ValueError:
            # e.g. "2026-13-40.md" — looks like a date but is not one.
            continue
        if file_date < cutoff:
            year = m.group(1)
            dest_dir = ARCHIVE_DIR / year
            to_move.append({
                "source": str(fp.relative_to(MEMORY_DIR)),
                "dest": str(dest_dir.relative_to(MEMORY_DIR) / name),
                "date": file_date.isoformat(),
            })
    manifest = {
        "archived_at": now.isoformat(),
        "older_than_days": older_than_days,
        "dry_run": not execute,
        "files": to_move,
        "count": len(to_move),
    }
    if execute and to_move:
        for item in to_move:
            src = MEMORY_DIR / item["source"]
            dst = MEMORY_DIR / item["dest"]
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(src), str(dst))
        # Write manifest — only for real runs, so dry runs leave no trace.
        manifest_path = ARCHIVE_DIR / f"manifest-{now.strftime('%Y%m%d-%H%M%S')}.json"
        manifest_path.parent.mkdir(parents=True, exist_ok=True)
        manifest_path.write_text(json.dumps(manifest, indent=2))
        manifest["manifest_path"] = str(manifest_path.relative_to(MEMORY_DIR))
    return manifest
# --- Report ---
def generate_report() -> tuple[str, bool]:
    """Build the combined markdown hygiene report.

    Returns:
        (report_text, has_critical): has_critical is True when any stale
        "warning" item or any broken link was found.
    """
    out = ["# Memory Hygiene Report", f"Generated: {datetime.now().isoformat()}", ""]
    critical = False

    # Stats
    stats = gather_stats()
    out.extend([
        "## Stats",
        f"- **Files:** {stats['total_files']} ({stats['total_size_human']})",
        f"- **Changed 24h/7d/30d:** {stats['changed_24h']}/{stats['changed_7d']}/{stats['changed_30d']}",
        "",
    ])

    # Duplicates (cap at 10 groups)
    dupes = find_duplicates()
    out.append(f"## Duplicates ({len(dupes)} found)")
    for group in dupes[:10]:
        where = ", ".join(f"`{loc['file']}:{loc['line']}`" for loc in group["locations"])
        out.append(f"- {where}: {group['locations'][0]['preview'][:60]}...")
    out.append("")

    # Staleness (cap at 20 items; warnings make the report critical)
    stale = find_stale()
    warnings = [item for item in stale if item["severity"] == "warning"]
    out.append(f"## Stale Items ({len(stale)} total, {len(warnings)} warnings)")
    if warnings:
        critical = True
    for item in stale[:20]:
        marker = "⚠️" if item["severity"] == "warning" else ""
        out.append(f"- {marker} `{item['file']}:{item['line']}` — {item['reason']}")
    out.append("")

    # Orphans (broken links also make the report critical)
    orph = find_orphans()
    broken = orph["broken_links"]
    out.extend([
        "## Orphans",
        f"- **Orphaned files:** {len(orph['orphaned_files'])}",
        f"- **Broken links:** {len(broken)}",
        f"- **Empty files:** {len(orph['empty_files'])}",
    ])
    if broken:
        critical = True
        for item in broken[:10]:
            out.append(f" - `{item['file']}` → `{item['target']}` (broken)")
    out.append("")

    # Archive candidates (dry run only)
    archive = archive_old_notes(older_than_days=90, execute=False)
    out.append(f"## Archive Candidates ({archive['count']} files older than 90 days)")
    for entry in archive["files"][:10]:
        out.append(f"- `{entry['source']}` → `{entry['dest']}`")
    out.append("")

    return "\n".join(out), critical
def main():
    """CLI entry point: dispatch `memory_hygiene <command>` subcommands.

    Commands print JSON to stdout (human summaries go to stderr so piped
    stdout stays machine-readable). `report` exits non-zero when the
    report contains critical findings.
    """
    parser = argparse.ArgumentParser(description="Memory Hygiene Tools")
    sub = parser.add_subparsers(dest="command")
    sub.add_parser("dupes", help="Find duplicate content")
    sub.add_parser("stale", help="Find stale content")
    sub.add_parser("orphans", help="Find orphaned files and broken links")
    sub.add_parser("stats", help="Memory statistics")
    arc = sub.add_parser("archive", help="Archive old daily notes")
    arc.add_argument("--older-than", default="90d", help="Age threshold (e.g., 90d)")
    arc.add_argument("--execute", action="store_true", help="Actually move files (default: dry-run)")
    sub.add_parser("report", help="Full hygiene report")
    args = parser.parse_args()
    if not args.command:
        # No subcommand given: show usage and exit with an error status.
        parser.print_help()
        sys.exit(1)
    if args.command == "dupes":
        dupes = find_duplicates()
        print(json.dumps(dupes, indent=2, ensure_ascii=False))
        print(f"\n{len(dupes)} duplicate groups found.", file=sys.stderr)
    elif args.command == "stale":
        stale = find_stale()
        print(json.dumps(stale, indent=2, ensure_ascii=False))
        print(f"\n{len(stale)} stale items found.", file=sys.stderr)
    elif args.command == "orphans":
        orph = find_orphans()
        print(json.dumps(orph, indent=2, ensure_ascii=False))
    elif args.command == "stats":
        stats = gather_stats()
        print(json.dumps(stats, indent=2, ensure_ascii=False))
    elif args.command == "archive":
        # "--older-than 90d" -> 90; the trailing "d" is optional.
        days = int(args.older_than.rstrip("d"))
        result = archive_old_notes(older_than_days=days, execute=args.execute)
        print(json.dumps(result, indent=2, ensure_ascii=False))
        if not args.execute and result["count"] > 0:
            print(f"\nDry run: {result['count']} files would be archived. Use --execute to proceed.", file=sys.stderr)
    elif args.command == "report":
        report, has_critical = generate_report()
        print(report)
        if has_critical:
            # Non-zero exit lets cron/CI flag reports with critical items.
            sys.exit(1)


if __name__ == "__main__":
    main()