#!/usr/bin/env python3 """ Darkplex Loop — The single heartbeat of the intelligence pipeline. One process. One loop. One state machine. Replaces: cron-smart-extractor, knowledge-bridge, knowledge-ingest, pipeline-health. Each cycle: 1. INGEST — Fetch new events from NATS (batch consumer pull) 2. EXTRACT — Pull entities and relationships from events 3. BRIDGE — Sync cortex outputs to knowledge engine 4. VERIFY — Check that real output was produced 5. REPORT — Update state, alert on failure States: RUNNING — Everything nominal DEGRADED — A step failed, but loop continues with recovery attempts EMERGENCY — Critical failure, alerting Usage: darkplex loop # Run loop (default: 1h cycle) darkplex loop --once # Single cycle, then exit darkplex loop --cycle 3600 # Custom cycle interval (seconds) darkplex loop --status # Print current state and exit darkplex loop --check # Check for new events, exit 0=new 1=none """ import json import logging import os import re import signal import subprocess import sys import time import traceback from collections import deque from datetime import datetime, timezone from pathlib import Path # ── Paths (configurable via env) ───────────────────────────────────────────── BASE_DIR = Path(os.environ.get("DARKPLEX_WORKSPACE", Path.home() / "clawd")) SCRIPT_DIR = BASE_DIR / "scripts" LEVEL4_DIR = SCRIPT_DIR / "level4" LOG_DIR = BASE_DIR / "logs" STATE_FILE = BASE_DIR / "memory" / "darkplex-loop-state.json" KNOWLEDGE_DIR = Path(os.environ.get("DARKPLEX_KNOWLEDGE_DIR", Path.home() / ".cortex" / "knowledge")) ENTITIES_FILE = KNOWLEDGE_DIR / "entities.json" RELATIONSHIPS_FILE = KNOWLEDGE_DIR / "relationships.json" NATS_STREAM = os.environ.get("DARKPLEX_NATS_STREAM", "openclaw-events") NATS_CONSUMER = os.environ.get("DARKPLEX_NATS_CONSUMER", "darkplex-loop") NATS_BATCH_SIZE = int(os.environ.get("DARKPLEX_NATS_BATCH", "2000")) DEFAULT_CYCLE_SECONDS = 3600 # 1 hour ALERT_COOLDOWN = 3600 # 1 alert per hour max log = logging.getLogger("darkplex-loop") # ── State Machine ──────────────────────────────────────────────────────────── class LoopState: """Persistent state for the Darkplex Loop.""" def __init__(self): self.status = "INIT" self.cycle_count = 0 self.last_cycle = None self.last_success = None self.last_failure = None self.last_alert = None self.consecutive_failures = 0 self.entities_total = 0 self.relationships_total = 0 self.entities_extracted_last = 0 self.entities_new_last = 0 self.events_processed_last = 0 self.steps = {} self.error = None self.perf = {} # last cycle: ingest_ms, extract_ms, bridge_ms, verify_ms, total_ms self.perf_history = [] # last 10 cycles [{total_ms, ingest_ms, ...}] self._load() def _load(self): try: data = json.loads(STATE_FILE.read_text()) for k, v in data.items(): if hasattr(self, k): setattr(self, k, v) except (FileNotFoundError, json.JSONDecodeError): pass def save(self): STATE_FILE.parent.mkdir(parents=True, exist_ok=True) STATE_FILE.write_text(json.dumps(self.__dict__, indent=2, default=str)) def record_perf(self, perf: dict): """Record performance metrics for this cycle.""" self.perf = perf self.perf_history.append(perf) self.perf_history = self.perf_history[-10:] # keep last 10 def perf_averages(self) -> dict: """Running averages over last 10 cycles.""" if not self.perf_history: return {} keys = self.perf_history[0].keys() return {k: int(sum(p.get(k, 0) for p in self.perf_history) / len(self.perf_history)) for k in keys} def record_success(self, step_results: dict): self.status = "RUNNING" self.consecutive_failures = 0 self.last_success = datetime.now(timezone.utc).isoformat() self.last_cycle = self.last_success self.cycle_count += 1 self.steps = step_results self.error = None self.save() def record_failure(self, step: str, error: str): self.consecutive_failures += 1 self.last_failure = datetime.now(timezone.utc).isoformat() self.last_cycle = self.last_failure self.cycle_count += 1 self.error = f"{step}: {error}" if self.consecutive_failures >= 3: self.status = "EMERGENCY" else: self.status = "DEGRADED" self.save() def can_alert(self) -> bool: if not self.last_alert: return True try: last = datetime.fromisoformat(self.last_alert) return (datetime.now(timezone.utc) - last).total_seconds() > ALERT_COOLDOWN except (ValueError, TypeError): return True def mark_alerted(self): self.last_alert = datetime.now(timezone.utc).isoformat() self.save() # ── Pipeline Steps ─────────────────────────────────────────────────────────── def _nats_cmd(): """Build NATS CLI base command with auth.""" nats_bin = os.environ.get("NATS_BIN", "nats") nats_url = os.environ.get("NATS_URL", "") if nats_url: return [nats_bin, "-s", nats_url] return [nats_bin] def check_new_events() -> int: """Return number of pending events in the consumer. 0 = nothing new.""" try: r = subprocess.run( _nats_cmd() + ["consumer", "info", NATS_STREAM, NATS_CONSUMER, "--json"], capture_output=True, text=True, timeout=10, ) if r.returncode != 0: return -1 info = json.loads(r.stdout) return info.get("num_pending", 0) except Exception as e: log.warning(f"check_new_events failed: {e}") return -1 def step_ingest(state: LoopState) -> dict: """Step 1: Fetch new events from NATS using batch consumer pull.""" log.info("STEP 1: INGEST — Fetching events from NATS") last_processed_seq_file = BASE_DIR / "memory" / "darkplex-last-processed-seq.json" # Check how many pending pending = check_new_events() if pending == 0: log.info("INGEST: No new events — skipping cycle") return {"events": [], "total_scanned": 0, "skipped": 0, "skip_reason": "no_new_events"} log.info(f"INGEST: {pending} pending events in consumer") events = [] total_fetched = 0 parse_errors = 0 # Fetch in batches remaining = min(pending, NATS_BATCH_SIZE) if pending > 0 else NATS_BATCH_SIZE try: batch_size = min(remaining, NATS_BATCH_SIZE) result = subprocess.run( _nats_cmd() + ["consumer", "next", NATS_STREAM, NATS_CONSUMER, "--count", str(batch_size), "--raw"], capture_output=True, text=True, timeout=30, ) if result.returncode != 0: log.warning(f"Batch fetch failed (rc={result.returncode}), falling back to sequential") return _step_ingest_sequential(state) for line in result.stdout.strip().split("\n"): if not line.strip(): continue try: data = json.loads(line) events.append(data) total_fetched += 1 except json.JSONDecodeError: parse_errors += 1 except subprocess.TimeoutExpired: log.warning("Batch fetch timed out, falling back to sequential") return _step_ingest_sequential(state) # Update sequence tracking (get current stream seq from consumer info) try: r = subprocess.run( _nats_cmd() + ["consumer", "info", NATS_STREAM, NATS_CONSUMER, "--json"], capture_output=True, text=True, timeout=10, ) if r.returncode == 0: info = json.loads(r.stdout) stream_seq = info["delivered"]["stream_seq"] last_processed_seq_file.parent.mkdir(parents=True, exist_ok=True) last_processed_seq_file.write_text(json.dumps({"last_seq": stream_seq})) except Exception: log.warning("Could not save last processed sequence") log.info(f"INGEST: {len(events)} events fetched in batch ({parse_errors} parse errors)") return {"events": events, "total_scanned": total_fetched + parse_errors, "skipped": parse_errors} def _step_ingest_sequential(state: LoopState) -> dict: """Fallback: sequential fetch via stream get (slow but reliable).""" import base64 log.info("INGEST FALLBACK: Sequential fetch") last_processed_seq_file = BASE_DIR / "memory" / "darkplex-last-processed-seq.json" last_processed_seq = 0 try: if last_processed_seq_file.exists(): last_processed_seq = json.loads(last_processed_seq_file.read_text()).get("last_seq", 0) except Exception: pass r = subprocess.run( _nats_cmd() + ["stream", "info", NATS_STREAM, "--json"], capture_output=True, text=True, timeout=10, ) if r.returncode != 0: return {"events": [], "total_scanned": 0, "skipped": 0} info = json.loads(r.stdout) end_seq = info["state"]["last_seq"] start_seq = max(last_processed_seq + 1, end_seq - NATS_BATCH_SIZE) events = [] skipped = 0 for seq in range(start_seq, end_seq + 1): try: result = subprocess.run( _nats_cmd() + ["stream", "get", NATS_STREAM, str(seq), "--json"], capture_output=True, text=True, timeout=5, ) if result.returncode != 0: skipped += 1 continue msg = json.loads(result.stdout) if "conversation_message_in" not in msg.get("subject", ""): skipped += 1 continue data = json.loads(base64.b64decode(msg["data"]).decode("utf-8")) events.append(data) except Exception: skipped += 1 try: last_processed_seq_file.parent.mkdir(parents=True, exist_ok=True) last_processed_seq_file.write_text(json.dumps({"last_seq": end_seq})) except Exception: pass log.info(f"INGEST (sequential): {len(events)} events (scanned {end_seq - start_seq + 1}, skipped {skipped})") return {"events": events, "total_scanned": end_seq - start_seq + 1, "skipped": skipped} def step_extract(state: LoopState, events: list) -> dict: """Step 2: Extract entities and relationships from events.""" log.info(f"STEP 2: EXTRACT — Processing {len(events)} events") if not events: log.info("EXTRACT: No events to process") return {"extracted": 0, "new_entities": 0, "new_relationships": 0} sys.path.insert(0, str(LEVEL4_DIR)) import importlib.util spec = importlib.util.spec_from_file_location("entity_manager", LEVEL4_DIR / "entity-manager.py") em = importlib.util.module_from_spec(spec) spec.loader.exec_module(em) # Try LLM batch extraction first from llm_extractor import extract_entities_llm_batch, is_available as llm_available use_llm = os.environ.get("DARKPLEX_EXTRACTOR", "auto").lower() in ("llm", "auto") llm_ok = use_llm and llm_available() if llm_ok: log.info("EXTRACT: Using LLM extractor (Ollama)") else: log.info("EXTRACT: Using regex extractor (fallback)") known = em.load_known_entities() entities = em.load_json(ENTITIES_FILE) relationships = em.load_json(RELATIONSHIPS_FILE) total_extracted = 0 new_entities = 0 new_relationships = 0 ts_now = time.strftime("%Y-%m-%dT%H:%M:%S") # Prepare texts for potential batch LLM processing event_texts = [] for event in events: payload = event.get("payload", {}) text = payload.get("text_preview", "") or payload.get("text", "") if isinstance(text, list): parts = [] for t in text: parts.append(t.get("text", "") if isinstance(t, dict) else str(t)) text = " ".join(parts) if not isinstance(text, str): text = str(text) score = _importance(text) if text else 0.0 event_texts.append((text, score)) # LLM batch extraction for qualifying texts llm_results = {} if llm_ok: batch_texts = [t for t, s in event_texts if t and s >= 0.4] if batch_texts: consecutive_fails = 0 for i in range(0, len(batch_texts), 10): if consecutive_fails >= 3: log.warning("EXTRACT: 3 consecutive LLM failures, falling back to regex") llm_ok = False break chunk = batch_texts[i:i+10] batch_result = extract_entities_llm_batch(chunk) if batch_result: llm_results.update(batch_result) consecutive_fails = 0 else: consecutive_fails += 1 if llm_results: log.info(f"EXTRACT: LLM batch found {len(llm_results)} entities") for idx, event in enumerate(events): text, score = event_texts[idx] if not text or score < 0.4: continue if llm_ok and llm_results: # Use LLM results + known entity matching found = em._extract_known(text, known) if hasattr(em, '_extract_known') else {} # Add LLM entities that appear in this text text_lower = text.lower() for name, info in llm_results.items(): variants = [name, name.replace("-", " "), name.replace("-", "")] if any(v in text_lower for v in variants if len(v) > 2): found[name] = info else: found = em.extract_entities(text, known) if not found: continue total_extracted += len(found) names = list(found.keys()) for name, info in found.items(): if name not in entities: entities[name] = { "type": info["type"], "source": "darkplex-loop", "first_seen": ts_now, } new_entities += 1 known[name] = entities[name] if len(names) >= 2: for i in range(len(names)): for j in range(i + 1, min(len(names), i + 5)): a, b = min(names[i], names[j]), max(names[i], names[j]) key = f"{a}::{b}" if key in relationships: relationships[key]["count"] = relationships[key].get("count", 1) + 1 relationships[key]["last_seen"] = ts_now else: relationships[key] = { "a": a, "b": b, "types": ["co-occurrence"], "count": 1, "first_seen": ts_now, "last_seen": ts_now, } new_relationships += 1 em.save_json(ENTITIES_FILE, entities) em.save_json(RELATIONSHIPS_FILE, relationships) state.entities_total = len(entities) state.relationships_total = len(relationships) state.entities_extracted_last = total_extracted state.entities_new_last = new_entities state.events_processed_last = len(events) log.info(f"EXTRACT: {total_extracted} entities ({new_entities} new), {new_relationships} new relationships") return {"extracted": total_extracted, "new_entities": new_entities, "new_relationships": new_relationships} def step_bridge(state: LoopState) -> dict: """Step 3: Run knowledge bridge.""" log.info("STEP 3: BRIDGE — Syncing cortex outputs") bridge_script = SCRIPT_DIR / "knowledge-bridge.py" if not bridge_script.exists(): log.warning("BRIDGE: knowledge-bridge.py not found, skipping") return {"status": "skipped", "reason": "script not found"} result = subprocess.run( [sys.executable, str(bridge_script), "sync"], capture_output=True, text=True, timeout=120, ) if result.returncode != 0: log.warning(f"BRIDGE: Failed — {result.stderr[:200]}") return {"status": "failed", "error": result.stderr[:200]} bridged = 0 for line in result.stdout.split("\n"): m = re.search(r"(\d+)\s+(?:new|bridged|added)", line, re.I) if m: bridged += int(m.group(1)) log.info(f"BRIDGE: {bridged} items bridged") return {"status": "ok", "bridged": bridged} def step_verify(state: LoopState, extract_result: dict) -> dict: """Step 4: Verify output quality.""" log.info("STEP 4: VERIFY — Checking output quality") issues = [] for f, label in [(ENTITIES_FILE, "entities"), (RELATIONSHIPS_FILE, "relationships")]: if not f.exists(): issues.append(f"{label} file missing") else: try: data = json.loads(f.read_text()) if not data: issues.append(f"{label} file is empty") except json.JSONDecodeError: issues.append(f"{label} file is corrupt JSON") events_processed = state.events_processed_last extracted = extract_result.get("extracted", 0) if events_processed > 10 and extracted == 0: issues.append(f"0 entities from {events_processed} events — extraction may be broken") try: r = subprocess.run(["nats", "stream", "ls", "--json"], capture_output=True, text=True, timeout=10) if r.returncode != 0: issues.append("NATS unreachable") except Exception as e: issues.append(f"NATS check failed: {e}") verdict = "PASS" if not issues else "FAIL" log.info(f"VERIFY: {verdict} — {len(issues)} issues") for issue in issues: log.warning(f" ⚠ {issue}") return {"verdict": verdict, "issues": issues} def step_report(state: LoopState, verify_result: dict): """Step 5: Alert if degraded/emergency.""" if state.status == "RUNNING": return if not state.can_alert(): log.info("REPORT: Alert cooldown active, skipping") return severity = "🔴 EMERGENCY" if state.status == "EMERGENCY" else "🟡 DEGRADED" msg = ( f"Darkplex Loop {severity}\n" f"Consecutive failures: {state.consecutive_failures}\n" f"Error: {state.error}\n" f"Issues: {', '.join(verify_result.get('issues', []))}" ) log.warning(f"REPORT: Sending alert — {state.status}") try: subprocess.run( ["python3", str(SCRIPT_DIR / "vera-alert.py"), msg], capture_output=True, text=True, timeout=15, ) except Exception: pass flag = LOG_DIR / "darkplex-loop-alert.flag" flag.write_text(f"{datetime.now().isoformat()} {state.status}: {state.error}") state.mark_alerted() # ── Helpers ────────────────────────────────────────────────────────────────── def _importance(text: str) -> float: """Importance scoring for event text.""" if not text: return 0.0 score = 0.3 if len(text) > 200: score += 0.1 if len(text) > 500: score += 0.1 caps = len(re.findall(r"\b[A-Z][a-z]+\b", text)) if caps > 3: score += 0.1 if caps > 8: score += 0.1 for p in ["HEARTBEAT_OK", "heartbeat", "cron:", "health check", "no critical"]: if p.lower() in text.lower(): score -= 0.3 for w in ["meeting", "project", "company", "contract", "decision", "strategy", "budget", "deadline", "milestone", "partnership", "investment", "revenue", "client", "proposal", "agreement"]: if w in text.lower(): score += 0.05 return max(0.0, min(1.0, score)) def print_status(): """Print current loop state.""" state = LoopState() ent_count = rel_count = 0 try: ent_count = len(json.loads(ENTITIES_FILE.read_text())) except Exception: pass try: rel_count = len(json.loads(RELATIONSHIPS_FILE.read_text())) except Exception: pass icon = {"RUNNING": "🟢", "DEGRADED": "🟡", "EMERGENCY": "🔴"}.get(state.status, "⚪") print(f"{icon} Status: {state.status}") print(f"Cycles: {state.cycle_count}") print(f"Last cycle: {state.last_cycle or 'never'}") print(f"Last success: {state.last_success or 'never'}") print(f"Last failure: {state.last_failure or 'never'}") print(f"Failures: {state.consecutive_failures}") print(f"Entities: {ent_count} total (last cycle: {state.entities_extracted_last}, {state.entities_new_last} new)") print(f"Relationships:{rel_count} total") if state.error: print(f"Error: {state.error}") # ── Main Loop ──────────────────────────────────────────────────────────────── def _ms_since(t0: float) -> int: return int((time.monotonic() - t0) * 1000) def run_cycle(state: LoopState) -> bool: """Run one complete pipeline cycle. Returns True on success.""" log.info(f"═══ CYCLE {state.cycle_count + 1} START ═══") step_results = {} perf = {} t_cycle = time.monotonic() try: t0 = time.monotonic() ingest = step_ingest(state) perf["ingest_ms"] = _ms_since(t0) step_results["ingest"] = {"events": len(ingest["events"]), "scanned": ingest["total_scanned"]} # Early skip if no new events if ingest.get("skip_reason") == "no_new_events": perf["total_ms"] = _ms_since(t_cycle) state.record_perf(perf) state.save() log.info(f"═══ CYCLE SKIPPED (no new events) — {perf['total_ms']}ms ═══") return True t0 = time.monotonic() extract = step_extract(state, ingest["events"]) perf["extract_ms"] = _ms_since(t0) step_results["extract"] = extract t0 = time.monotonic() bridge = step_bridge(state) perf["bridge_ms"] = _ms_since(t0) step_results["bridge"] = bridge t0 = time.monotonic() verify = step_verify(state, extract) perf["verify_ms"] = _ms_since(t0) step_results["verify"] = verify perf["total_ms"] = _ms_since(t_cycle) state.record_perf(perf) if verify["verdict"] == "FAIL" and any("broken" in i or "missing" in i or "corrupt" in i for i in verify["issues"]): state.record_failure("verify", "; ".join(verify["issues"])) step_report(state, verify) return False state.record_success(step_results) avgs = state.perf_averages() log.info(f"═══ CYCLE {state.cycle_count} DONE — {state.status} — {perf['total_ms']}ms (avg {avgs.get('total_ms', '?')}ms) ═══") log.info(f" Perf: ingest={perf.get('ingest_ms')}ms extract={perf.get('extract_ms')}ms bridge={perf.get('bridge_ms')}ms verify={perf.get('verify_ms')}ms") flag = LOG_DIR / "darkplex-loop-alert.flag" if flag.exists(): flag.unlink() return True except Exception as e: perf["total_ms"] = _ms_since(t_cycle) state.record_perf(perf) step_name = "unknown" for name in ["ingest", "extract", "bridge", "verify"]: if name not in step_results: step_name = name break log.error(f"CYCLE FAILED at {step_name}: {e}") log.error(traceback.format_exc()) state.record_failure(step_name, str(e)[:300]) step_report(state, {"issues": [str(e)]}) return False def main(): """CLI entry point for `darkplex loop`.""" logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[ logging.FileHandler(LOG_DIR / "darkplex-loop.log"), logging.StreamHandler(), ], ) LOG_DIR.mkdir(parents=True, exist_ok=True) args = sys.argv[1:] if "--status" in args: print_status() return if "--check" in args: pending = check_new_events() if pending > 0: print(f"NEW: {pending} events pending") sys.exit(0) elif pending == 0: print("NONE: No new events") sys.exit(1) else: print("ERROR: Could not check") sys.exit(2) once = "--once" in args cycle_seconds = DEFAULT_CYCLE_SECONDS for i, arg in enumerate(args): if arg == "--cycle" and i + 1 < len(args): cycle_seconds = int(args[i + 1]) state = LoopState() log.info(f"Darkplex Loop starting — cycle every {cycle_seconds}s, once={once}") running = True def handle_signal(sig, frame): nonlocal running log.info("Shutdown signal received") running = False signal.signal(signal.SIGTERM, handle_signal) signal.signal(signal.SIGINT, handle_signal) while running: run_cycle(state) if once: break log.info(f"Sleeping {cycle_seconds}s until next cycle...") for _ in range(cycle_seconds): if not running: break time.sleep(1) log.info("Darkplex Loop stopped")