Some checks failed
Tests / test (push) Failing after 5s
New cortex/memory/ module that provides: - boot_assembler: builds BOOTSTRAP.md from threads, decisions, narrative - thread_tracker: tracks conversation threads across sessions via NATS - narrative_generator: daily narrative with Ollama LLM (fallback: structured) - pre_compaction: snapshot pipeline before context compaction CLI commands: - cortex memory bootstrap [--dry-run] [--workspace DIR] - cortex memory snapshot [--workspace DIR] - cortex memory threads [--summary] [--hours N] All paths configurable via WORKSPACE_DIR, NATS_URL, AGENT_NAME env vars. No hardcoded paths. Works with any OpenClaw agent. Fixes array/dict handling for empty threads.json and decisions.json.
830 lines
30 KiB
Python
830 lines
30 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Darkplex Loop — The single heartbeat of the intelligence pipeline.
|
|
|
|
One process. One loop. One state machine.
|
|
Replaces: cron-smart-extractor, knowledge-bridge, knowledge-ingest, pipeline-health.
|
|
|
|
Each cycle:
|
|
1. INGEST — Fetch new events from NATS (batch consumer pull)
|
|
2. EXTRACT — Pull entities and relationships from events
|
|
3. BRIDGE — Sync cortex outputs to knowledge engine
|
|
4. VERIFY — Check that real output was produced
|
|
5. REPORT — Update state, alert on failure
|
|
|
|
States:
|
|
RUNNING — Everything nominal
|
|
DEGRADED — A step failed, but loop continues with recovery attempts
|
|
EMERGENCY — Critical failure, alerting
|
|
|
|
Usage:
|
|
darkplex loop # Run loop (default: 1h cycle)
|
|
darkplex loop --once # Single cycle, then exit
|
|
darkplex loop --cycle 3600 # Custom cycle interval (seconds)
|
|
darkplex loop --status # Print current state and exit
|
|
darkplex loop --check # Check for new events, exit 0=new 1=none
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import traceback
|
|
import urllib.request
|
|
from collections import deque
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# ── Paths (configurable via env) ─────────────────────────────────────────────

# Workspace root; all workspace-relative paths below derive from it.
BASE_DIR = Path(os.environ.get("DARKPLEX_WORKSPACE", Path.home() / "clawd"))
SCRIPT_DIR = BASE_DIR / "scripts"
LEVEL4_DIR = SCRIPT_DIR / "level4"  # location of entity-manager.py (see step_extract)
LOG_DIR = BASE_DIR / "logs"
STATE_FILE = BASE_DIR / "memory" / "darkplex-loop-state.json"  # LoopState persistence
KNOWLEDGE_DIR = Path(os.environ.get("DARKPLEX_KNOWLEDGE_DIR", Path.home() / ".cortex" / "knowledge"))
ENTITIES_FILE = KNOWLEDGE_DIR / "entities.json"
RELATIONSHIPS_FILE = KNOWLEDGE_DIR / "relationships.json"

# NATS stream/consumer names used by the ingest step.
NATS_STREAM = os.environ.get("DARKPLEX_NATS_STREAM", "openclaw-events")
NATS_CONSUMER = os.environ.get("DARKPLEX_NATS_CONSUMER", "darkplex-loop")
NATS_BATCH_SIZE = int(os.environ.get("DARKPLEX_NATS_BATCH", "2000"))  # max events pulled per cycle
DEFAULT_CYCLE_SECONDS = 3600  # 1 hour
ALERT_COOLDOWN = 3600  # 1 alert per hour max

log = logging.getLogger("darkplex-loop")
|
|
|
|
|
|
# ── State Machine ────────────────────────────────────────────────────────────
|
|
|
|
class LoopState:
    """Persistent state for the Darkplex Loop.

    Loaded from / saved to STATE_FILE as JSON. Tracks the loop status
    (RUNNING/DEGRADED/EMERGENCY), cycle counters, last-cycle metrics,
    rolling performance and quality histories, and the alert cooldown.
    """

    def __init__(self):
        self.status = "INIT"
        self.cycle_count = 0
        self.last_cycle = None
        self.last_success = None
        self.last_failure = None
        self.last_alert = None
        self.consecutive_failures = 0
        self.entities_total = 0
        self.relationships_total = 0
        self.entities_extracted_last = 0
        self.entities_new_last = 0
        self.events_processed_last = 0
        self.steps = {}
        self.error = None
        self.perf = {}  # last cycle: ingest_ms, extract_ms, bridge_ms, verify_ms, total_ms
        self.perf_history = []  # last 10 cycles [{total_ms, ingest_ms, ...}]
        self.quality_metrics = {}  # {unknown_rate, llm_success_rate, avg_entities_per_event}
        self.quality_history = []  # last 10: [{cycle, unknown_rate, llm_success_rate}]
        self.ollama_status = "unknown"  # healthy|degraded|down
        self._load()

    def _load(self):
        """Restore persisted fields from STATE_FILE, ignoring unknown keys.

        A missing, unreadable, or corrupt state file is treated as a fresh
        start — the defaults from __init__ stand.
        """
        try:
            data = json.loads(STATE_FILE.read_text())
            for k, v in data.items():
                if hasattr(self, k):
                    setattr(self, k, v)
        except (OSError, json.JSONDecodeError):
            # OSError covers FileNotFoundError plus permission/IO errors,
            # which previously escaped __init__ and crashed startup.
            pass

    def save(self):
        """Persist all public attributes to STATE_FILE as JSON."""
        STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
        STATE_FILE.write_text(json.dumps(self.__dict__, indent=2, default=str))

    def record_perf(self, perf: dict):
        """Record performance metrics for this cycle (keeps last 10)."""
        # Copy so we never mutate the caller's dict nor alias it into history.
        perf = dict(perf)
        # Include unknown_rate in perf_history if available
        if self.quality_metrics:
            perf["unknown_rate"] = self.quality_metrics.get("unknown_rate", 0)
        self.perf = perf
        self.perf_history.append(perf)
        self.perf_history = self.perf_history[-10:]  # keep last 10

    def perf_averages(self) -> dict:
        """Running averages (ints, ms) over the retained perf history.

        Keys come from the oldest retained entry; entries missing a key
        contribute 0 for it.
        """
        if not self.perf_history:
            return {}
        keys = self.perf_history[0].keys()
        n = len(self.perf_history)
        return {k: int(sum(p.get(k, 0) for p in self.perf_history) / n) for k in keys}

    def record_success(self, step_results: dict):
        """Mark a successful cycle: reset the failure streak and persist."""
        self.status = "RUNNING"
        self.consecutive_failures = 0
        self.last_success = datetime.now(timezone.utc).isoformat()
        self.last_cycle = self.last_success
        self.cycle_count += 1
        self.steps = step_results
        self.error = None
        self.save()

    def record_failure(self, step: str, error: str):
        """Mark a failed cycle; escalate to EMERGENCY after 3 in a row."""
        self.consecutive_failures += 1
        self.last_failure = datetime.now(timezone.utc).isoformat()
        self.last_cycle = self.last_failure
        self.cycle_count += 1
        self.error = f"{step}: {error}"
        if self.consecutive_failures >= 3:
            self.status = "EMERGENCY"
        else:
            self.status = "DEGRADED"
        self.save()

    def can_alert(self) -> bool:
        """True if no alert was sent within the ALERT_COOLDOWN window."""
        if not self.last_alert:
            return True
        try:
            last = datetime.fromisoformat(self.last_alert)
            return (datetime.now(timezone.utc) - last).total_seconds() > ALERT_COOLDOWN
        except (ValueError, TypeError):
            # Unparseable timestamp: fail open so alerts still go out.
            return True

    def mark_alerted(self):
        """Record the alert timestamp (starts the cooldown) and persist."""
        self.last_alert = datetime.now(timezone.utc).isoformat()
        self.save()
|
|
|
|
|
|
# ── Pipeline Steps ───────────────────────────────────────────────────────────
|
|
|
|
def _nats_cmd():
|
|
"""Build NATS CLI base command with auth."""
|
|
nats_bin = os.environ.get("NATS_BIN", "nats")
|
|
nats_url = os.environ.get("NATS_URL", "")
|
|
if nats_url:
|
|
return [nats_bin, "-s", nats_url]
|
|
return [nats_bin]
|
|
|
|
|
|
def check_new_events() -> int:
    """Return number of pending events in the consumer. 0 = nothing new.

    A return of -1 means the consumer info could not be fetched or parsed.
    """
    cmd = _nats_cmd() + ["consumer", "info", NATS_STREAM, NATS_CONSUMER, "--json"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
        if proc.returncode != 0:
            return -1
        return json.loads(proc.stdout).get("num_pending", 0)
    except Exception as e:
        log.warning(f"check_new_events failed: {e}")
        return -1
|
|
|
|
|
|
def step_ingest(state: LoopState) -> dict:
    """Step 1: Fetch new events from NATS using batch consumer pull.

    Returns {"events": [...], "total_scanned": int, "skipped": int}, plus
    "skip_reason": "no_new_events" when the consumer reported nothing
    pending. Falls back to _step_ingest_sequential() when the batch pull
    fails or times out.
    """
    log.info("STEP 1: INGEST — Fetching events from NATS")

    # Sequence checkpoint shared with the sequential fallback path.
    last_processed_seq_file = BASE_DIR / "memory" / "darkplex-last-processed-seq.json"

    # Check how many pending
    pending = check_new_events()
    if pending == 0:
        log.info("INGEST: No new events — skipping cycle")
        return {"events": [], "total_scanned": 0, "skipped": 0, "skip_reason": "no_new_events"}
    log.info(f"INGEST: {pending} pending events in consumer")

    events = []
    total_fetched = 0
    parse_errors = 0

    # Fetch in batches
    # NOTE(review): pending == -1 (check failed) falls through to a full-size
    # batch attempt rather than aborting — presumably intentional best-effort.
    remaining = min(pending, NATS_BATCH_SIZE) if pending > 0 else NATS_BATCH_SIZE
    try:
        batch_size = min(remaining, NATS_BATCH_SIZE)
        result = subprocess.run(
            _nats_cmd() + ["consumer", "next", NATS_STREAM, NATS_CONSUMER,
                           "--count", str(batch_size), "--raw"],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode != 0:
            log.warning(f"Batch fetch failed (rc={result.returncode}), falling back to sequential")
            return _step_ingest_sequential(state)

        # With --raw each non-empty stdout line is expected to be one JSON
        # event payload; lines that fail to parse are counted, not fatal.
        for line in result.stdout.strip().split("\n"):
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                events.append(data)
                total_fetched += 1
            except json.JSONDecodeError:
                parse_errors += 1

    except subprocess.TimeoutExpired:
        log.warning("Batch fetch timed out, falling back to sequential")
        return _step_ingest_sequential(state)

    # Update sequence tracking (get current stream seq from consumer info)
    try:
        r = subprocess.run(
            _nats_cmd() + ["consumer", "info", NATS_STREAM, NATS_CONSUMER, "--json"],
            capture_output=True, text=True, timeout=10,
        )
        if r.returncode == 0:
            info = json.loads(r.stdout)
            stream_seq = info["delivered"]["stream_seq"]
            last_processed_seq_file.parent.mkdir(parents=True, exist_ok=True)
            last_processed_seq_file.write_text(json.dumps({"last_seq": stream_seq}))
    except Exception:
        # Best effort — losing the checkpoint only affects the fallback path.
        log.warning("Could not save last processed sequence")

    log.info(f"INGEST: {len(events)} events fetched in batch ({parse_errors} parse errors)")
    return {"events": events, "total_scanned": total_fetched + parse_errors, "skipped": parse_errors}
|
|
|
|
|
|
def _step_ingest_sequential(state: LoopState) -> dict:
    """Fallback: sequential fetch via stream get (slow but reliable).

    Walks stream sequence numbers one by one from the saved checkpoint
    (bounded to the last NATS_BATCH_SIZE messages) and keeps only events
    whose subject contains "conversation_message_in". Returns the same
    dict shape as step_ingest().
    """
    import base64  # local import: only this fallback decodes payloads
    log.info("INGEST FALLBACK: Sequential fetch")

    # Resume from the last checkpointed stream sequence, if readable.
    last_processed_seq_file = BASE_DIR / "memory" / "darkplex-last-processed-seq.json"
    last_processed_seq = 0
    try:
        if last_processed_seq_file.exists():
            last_processed_seq = json.loads(last_processed_seq_file.read_text()).get("last_seq", 0)
    except Exception:
        pass  # corrupt/unreadable checkpoint → fall back to the window below

    r = subprocess.run(
        _nats_cmd() + ["stream", "info", NATS_STREAM, "--json"],
        capture_output=True, text=True, timeout=10,
    )
    if r.returncode != 0:
        # Stream info unavailable — give up quietly with an empty batch.
        return {"events": [], "total_scanned": 0, "skipped": 0}

    info = json.loads(r.stdout)
    end_seq = info["state"]["last_seq"]
    # Never scan more than NATS_BATCH_SIZE messages back from the head.
    start_seq = max(last_processed_seq + 1, end_seq - NATS_BATCH_SIZE)

    events = []
    skipped = 0
    for seq in range(start_seq, end_seq + 1):
        try:
            result = subprocess.run(
                _nats_cmd() + ["stream", "get", NATS_STREAM, str(seq), "--json"],
                capture_output=True, text=True, timeout=5,
            )
            if result.returncode != 0:
                skipped += 1  # e.g. message deleted/expired at this seq
                continue
            msg = json.loads(result.stdout)
            # Only conversation events are of interest downstream.
            if "conversation_message_in" not in msg.get("subject", ""):
                skipped += 1
                continue
            # `stream get --json` returns the payload base64-encoded.
            data = json.loads(base64.b64decode(msg["data"]).decode("utf-8"))
            events.append(data)
        except Exception:
            skipped += 1  # any timeout/parse error: skip this one message

    # Checkpoint how far we got so the next run resumes from here.
    try:
        last_processed_seq_file.parent.mkdir(parents=True, exist_ok=True)
        last_processed_seq_file.write_text(json.dumps({"last_seq": end_seq}))
    except Exception:
        pass  # best effort — a stale checkpoint only causes a re-scan

    log.info(f"INGEST (sequential): {len(events)} events (scanned {end_seq - start_seq + 1}, skipped {skipped})")
    return {"events": events, "total_scanned": end_seq - start_seq + 1, "skipped": skipped}
|
|
|
|
|
|
def step_extract(state: LoopState, events: list) -> dict:
    """Step 2: Extract entities and relationships from events.

    Uses the LLM extractor (Ollama) when available, falling back to the
    regex extractor provided by entity-manager.py. Persists updated
    entities and co-occurrence relationships, and updates the per-cycle
    counters on *state*. Returns {"extracted", "new_entities",
    "new_relationships"} counts.
    """
    log.info(f"STEP 2: EXTRACT — Processing {len(events)} events")

    if not events:
        log.info("EXTRACT: No events to process")
        return {"extracted": 0, "new_entities": 0, "new_relationships": 0}

    # entity-manager.py has a dash in its filename, so it cannot be imported
    # with a normal import statement — load it via importlib instead.
    sys.path.insert(0, str(LEVEL4_DIR))
    import importlib.util
    spec = importlib.util.spec_from_file_location("entity_manager", LEVEL4_DIR / "entity-manager.py")
    em = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(em)

    # Try LLM batch extraction first
    from cortex.llm_extractor import extract_entities_llm_batch, is_available as llm_available
    # DARKPLEX_EXTRACTOR: "llm"/"auto" allow LLM use; anything else forces regex.
    use_llm = os.environ.get("DARKPLEX_EXTRACTOR", "auto").lower() in ("llm", "auto")
    llm_ok = use_llm and llm_available()
    if llm_ok:
        log.info("EXTRACT: Using LLM extractor (Ollama)")
    else:
        log.info("EXTRACT: Using regex extractor (fallback)")

    known = em.load_known_entities()
    entities = em.load_json(ENTITIES_FILE)
    relationships = em.load_json(RELATIONSHIPS_FILE)

    total_extracted = 0
    new_entities = 0
    new_relationships = 0
    ts_now = time.strftime("%Y-%m-%dT%H:%M:%S")

    # Prepare texts for potential batch LLM processing
    event_texts = []
    for event in events:
        payload = event.get("payload", {})
        text = payload.get("text_preview", "") or payload.get("text", "")
        # Some payloads carry text as a list of parts (dicts or plain strings).
        if isinstance(text, list):
            parts = []
            for t in text:
                parts.append(t.get("text", "") if isinstance(t, dict) else str(t))
            text = " ".join(parts)
        if not isinstance(text, str):
            text = str(text)
        score = _importance(text) if text else 0.0
        event_texts.append((text, score))

    # LLM batch extraction for qualifying texts (cap at 50 to keep cycle time reasonable)
    llm_results = {}
    if llm_ok:
        # Only texts scoring >= 0.4 qualify; highest importance goes first.
        batch_texts = [t for t, s in sorted(
            [(t, s) for t, s in event_texts if t and s >= 0.4],
            key=lambda x: -x[1]  # highest importance first
        )][:50]
        if batch_texts:
            consecutive_fails = 0
            # Process in chunks of 10; bail out to regex after 3 failed chunks.
            for i in range(0, len(batch_texts), 10):
                if consecutive_fails >= 3:
                    log.warning("EXTRACT: 3 consecutive LLM failures, falling back to regex")
                    llm_ok = False
                    break
                chunk = batch_texts[i:i+10]
                batch_result = extract_entities_llm_batch(chunk)
                if batch_result:
                    llm_results.update(batch_result)
                    consecutive_fails = 0
                else:
                    consecutive_fails += 1
        if llm_results:
            log.info(f"EXTRACT: LLM batch found {len(llm_results)} entities")

    for idx, event in enumerate(events):
        text, score = event_texts[idx]
        if not text or score < 0.4:
            continue  # below the importance threshold

        if llm_ok and llm_results:
            # Use LLM results + known entity matching
            found = em._extract_known(text, known) if hasattr(em, '_extract_known') else {}
            # Add LLM entities that appear in this text
            text_lower = text.lower()
            for name, info in llm_results.items():
                # Match hyphen variants too; len > 2 avoids trivial substrings.
                variants = [name, name.replace("-", " "), name.replace("-", "")]
                if any(v in text_lower for v in variants if len(v) > 2):
                    found[name] = info
        else:
            found = em.extract_entities(text, known)
        if not found:
            continue

        total_extracted += len(found)
        names = list(found.keys())

        # Register any entities we have not seen before.
        for name, info in found.items():
            if name not in entities:
                entities[name] = {
                    "type": info["type"],
                    "source": "darkplex-loop",
                    "first_seen": ts_now,
                }
                new_entities += 1
                known[name] = entities[name]

        # Record co-occurrence relationships between entities found in the
        # same event; each entity pairs with at most the next 4 names.
        if len(names) >= 2:
            for i in range(len(names)):
                for j in range(i + 1, min(len(names), i + 5)):
                    # Canonical sorted pair key so (a,b) and (b,a) collapse.
                    a, b = min(names[i], names[j]), max(names[i], names[j])
                    key = f"{a}::{b}"
                    if key in relationships:
                        relationships[key]["count"] = relationships[key].get("count", 1) + 1
                        relationships[key]["last_seen"] = ts_now
                    else:
                        relationships[key] = {
                            "a": a, "b": b, "types": ["co-occurrence"],
                            "count": 1, "first_seen": ts_now, "last_seen": ts_now,
                        }
                        new_relationships += 1

    em.save_json(ENTITIES_FILE, entities)
    em.save_json(RELATIONSHIPS_FILE, relationships)

    # Expose per-cycle counters for step_verify and --status.
    state.entities_total = len(entities)
    state.relationships_total = len(relationships)
    state.entities_extracted_last = total_extracted
    state.entities_new_last = new_entities
    state.events_processed_last = len(events)

    log.info(f"EXTRACT: {total_extracted} entities ({new_entities} new), {new_relationships} new relationships")
    return {"extracted": total_extracted, "new_entities": new_entities, "new_relationships": new_relationships}
|
|
|
|
|
|
def step_bridge(state: LoopState) -> dict:
    """Step 3: Run knowledge bridge.

    Invokes knowledge-bridge.py with the current interpreter and tallies
    the item counts it reports on stdout. Returns a status dict.
    """
    log.info("STEP 3: BRIDGE — Syncing cortex outputs")

    bridge_script = SCRIPT_DIR / "knowledge-bridge.py"
    if not bridge_script.exists():
        log.warning("BRIDGE: knowledge-bridge.py not found, skipping")
        return {"status": "skipped", "reason": "script not found"}

    result = subprocess.run(
        [sys.executable, str(bridge_script), "sync"],
        capture_output=True, text=True, timeout=120,
    )
    if result.returncode != 0:
        log.warning(f"BRIDGE: Failed — {result.stderr[:200]}")
        return {"status": "failed", "error": result.stderr[:200]}

    # Count items the script reports as bridged, one match per output line.
    pattern = re.compile(r"(\d+)\s+(?:new|bridged|added)", re.I)
    matches = (pattern.search(line) for line in result.stdout.split("\n"))
    bridged = sum(int(m.group(1)) for m in matches if m)

    log.info(f"BRIDGE: {bridged} items bridged")
    return {"status": "ok", "bridged": bridged}
|
|
|
|
|
|
def _check_quality(state: LoopState, extract_result: dict) -> list:
    """Check entity quality metrics. Returns list of issues/warnings.

    Updates state.quality_metrics and appends to state.quality_history
    (capped at 10 entries) as a side effect.
    """
    issues = []

    # Unknown-type rate across the whole persisted entity store.
    try:
        entities = json.loads(ENTITIES_FILE.read_text()) if ENTITIES_FILE.exists() else {}
    except (json.JSONDecodeError, OSError):
        entities = {}

    total = len(entities)
    unknown_count = sum(1 for ent in entities.values() if ent.get("type") == "unknown")
    unknown_rate = (unknown_count / total * 100) if total > 0 else 0.0

    events_processed = state.events_processed_last or 1
    extracted = extract_result.get("extracted", 0)
    avg_entities_per_event = extracted / events_processed if events_processed > 0 else 0.0

    # Proxy for LLM health: many events but zero extractions looks like a
    # broken extractor; otherwise assume success.
    llm_success_rate = 0.0 if (events_processed > 10 and extracted == 0) else 100.0

    state.quality_metrics = {
        "unknown_rate": round(unknown_rate, 1),
        "llm_success_rate": round(llm_success_rate, 1),
        "avg_entities_per_event": round(avg_entities_per_event, 2),
    }

    if unknown_rate > 30:
        issues.append(f"High unknown entity rate: {unknown_rate:.1f}% ({unknown_count}/{total})")

    # Keep a rolling window of the last 10 cycles for trend detection.
    state.quality_history.append({
        "cycle": state.cycle_count + 1,
        "unknown_rate": round(unknown_rate, 1),
        "llm_success_rate": round(llm_success_rate, 1),
    })
    state.quality_history = state.quality_history[-10:]

    # A strictly rising unknown_rate over three cycles means degradation.
    if len(state.quality_history) >= 3:
        last3 = [entry["unknown_rate"] for entry in state.quality_history[-3:]]
        if last3[0] < last3[1] < last3[2]:
            issues.append(f"Entity quality degrading — unknown_rate rising: {last3}")

    log.info(f"VERIFY/QUALITY: unknown_rate={unknown_rate:.1f}%, avg_entities/event={avg_entities_per_event:.2f}")
    return issues
|
|
|
|
|
|
def _check_ollama(state: LoopState) -> list:
    """Check Ollama health via its HTTP API. Returns list of issues.

    Sets state.ollama_status to healthy|degraded|down. The endpoint is
    configurable via OLLAMA_HOST (default http://localhost:11434, matching
    the previous hard-coded value) instead of being baked in.
    """
    issues = []
    model = os.environ.get("DARKPLEX_OLLAMA_MODEL", os.environ.get("OLLAMA_MODEL", ""))
    # OLLAMA_HOST is the conventional env var for the ollama client tools.
    host = os.environ.get("OLLAMA_HOST", "http://localhost:11434").rstrip("/")

    try:
        req = urllib.request.Request(f"{host}/api/tags", method="GET")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        models = [m.get("name", "") for m in data.get("models", [])]
        if model and not any(model in m for m in models):
            # Server answers but the configured model is not loaded.
            state.ollama_status = "degraded"
            issues.append(f"Ollama up but model '{model}' not loaded (available: {models[:5]})")
            log.warning(f"VERIFY/OLLAMA: degraded — model '{model}' not in {models[:5]}")
        else:
            state.ollama_status = "healthy"
            log.info(f"VERIFY/OLLAMA: healthy ({len(models)} models)")
    except Exception as e:
        # Connection refused, timeout, or bad JSON all mean "down" here.
        state.ollama_status = "down"
        issues.append(f"Ollama down: {e}")
        log.warning(f"VERIFY/OLLAMA: down — {e}")

    return issues
|
|
|
|
|
|
def _check_performance(state: LoopState) -> list:
    """Check for performance regressions. Returns list of issues."""
    issues = []

    # Need at least two cycles of history before comparisons mean anything.
    if len(state.perf_history) < 2:
        return issues

    current = state.perf
    avgs = state.perf_averages()

    curr_total = current.get("total_ms", 0)
    avg_total = avgs.get("total_ms", 0)

    # More than double the rolling average counts as a regression.
    if avg_total > 0 and curr_total > 2 * avg_total:
        issues.append(f"Performance regression detected: {curr_total}ms vs avg {avg_total}ms")

    # Extraction has a hard two-minute budget.
    extract_ms = current.get("extract_ms", 0)
    if extract_ms > 120000:
        issues.append(f"Extraction too slow: {extract_ms}ms (>2min)")

    if not issues:
        log.info(f"VERIFY/PERF: OK (total={curr_total}ms, avg={avg_total}ms)")
    else:
        for problem in issues:
            log.warning(f"VERIFY/PERF: {problem}")

    return issues
|
|
|
|
|
|
def step_verify(state: LoopState, extract_result: dict) -> dict:
    """Step 4: Verify output quality.

    Runs integrity checks on the knowledge files, sanity-checks extraction
    yield, pings NATS, then runs the quality/Ollama/performance monitors.
    Returns {"verdict": "PASS"|"FAIL", "issues": [...]}.
    """
    log.info("STEP 4: VERIFY — Checking output quality")

    issues = []

    # File integrity: both knowledge files must exist, parse, and be non-empty.
    for f, label in [(ENTITIES_FILE, "entities"), (RELATIONSHIPS_FILE, "relationships")]:
        if not f.exists():
            issues.append(f"{label} file missing")
        else:
            try:
                data = json.loads(f.read_text())
                if not data:
                    issues.append(f"{label} file is empty")
            except json.JSONDecodeError:
                issues.append(f"{label} file is corrupt JSON")

    # Plenty of events but zero entities suggests the extractor is broken.
    events_processed = state.events_processed_last
    extracted = extract_result.get("extracted", 0)
    if events_processed > 10 and extracted == 0:
        issues.append(f"0 entities from {events_processed} events — extraction may be broken")

    # NATS check — use _nats_cmd() so NATS_URL/NATS_BIN are honored,
    # consistent with every other NATS invocation in this module (the
    # previous bare ["nats", ...] ignored the configured server URL).
    try:
        r = subprocess.run(_nats_cmd() + ["stream", "ls", "--json"], capture_output=True, text=True, timeout=10)
        if r.returncode != 0:
            issues.append("NATS unreachable")
    except Exception as e:
        issues.append(f"NATS check failed: {e}")

    # New monitoring checks (these also update state.quality_* / ollama_status).
    issues.extend(_check_quality(state, extract_result))
    issues.extend(_check_ollama(state))
    issues.extend(_check_performance(state))

    verdict = "PASS" if not issues else "FAIL"
    log.info(f"VERIFY: {verdict} — {len(issues)} issues")
    for issue in issues:
        log.warning(f" ⚠ {issue}")

    return {"verdict": verdict, "issues": issues}
|
|
|
|
|
|
def step_report(state: LoopState, verify_result: dict):
    """Step 5: Alert if degraded/emergency.

    No-op while RUNNING. Otherwise sends at most one alert per
    ALERT_COOLDOWN window via vera-alert.py (best effort) and drops a
    flag file for external monitors.
    """
    if state.status == "RUNNING":
        return

    if not state.can_alert():
        log.info("REPORT: Alert cooldown active, skipping")
        return

    severity = "🔴 EMERGENCY" if state.status == "EMERGENCY" else "🟡 DEGRADED"
    msg = (
        f"Darkplex Loop {severity}\n"
        f"Consecutive failures: {state.consecutive_failures}\n"
        f"Error: {state.error}\n"
        f"Issues: {', '.join(verify_result.get('issues', []))}"
    )

    log.warning(f"REPORT: Sending alert — {state.status}")

    # Best-effort external alert. Use the same interpreter as this process
    # (consistent with step_bridge) rather than whatever "python3" resolves
    # to on PATH, which may be a different environment.
    try:
        subprocess.run(
            [sys.executable, str(SCRIPT_DIR / "vera-alert.py"), msg],
            capture_output=True, text=True, timeout=15,
        )
    except Exception:
        pass  # alerting must never take the loop down

    # Flag file lets external watchdogs see the alert without parsing logs.
    # Ensure LOG_DIR exists: this can run before main() has created it.
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    flag = LOG_DIR / "darkplex-loop-alert.flag"
    flag.write_text(f"{datetime.now().isoformat()} {state.status}: {state.error}")
    state.mark_alerted()
|
|
|
|
|
|
# ── Helpers ──────────────────────────────────────────────────────────────────
|
|
|
|
def _importance(text: str) -> float:
|
|
"""Importance scoring for event text."""
|
|
if not text:
|
|
return 0.0
|
|
score = 0.3
|
|
if len(text) > 200: score += 0.1
|
|
if len(text) > 500: score += 0.1
|
|
caps = len(re.findall(r"\b[A-Z][a-z]+\b", text))
|
|
if caps > 3: score += 0.1
|
|
if caps > 8: score += 0.1
|
|
for p in ["HEARTBEAT_OK", "heartbeat", "cron:", "health check", "no critical"]:
|
|
if p.lower() in text.lower():
|
|
score -= 0.3
|
|
for w in ["meeting", "project", "company", "contract", "decision", "strategy",
|
|
"budget", "deadline", "milestone", "partnership", "investment", "revenue",
|
|
"client", "proposal", "agreement"]:
|
|
if w in text.lower():
|
|
score += 0.05
|
|
return max(0.0, min(1.0, score))
|
|
|
|
|
|
def print_status():
    """Print current loop state."""
    state = LoopState()

    ent_count = 0
    rel_count = 0
    try:
        ent_count = len(json.loads(ENTITIES_FILE.read_text()))
    except Exception:
        pass
    try:
        rel_count = len(json.loads(RELATIONSHIPS_FILE.read_text()))
    except Exception:
        pass

    status_icons = {"RUNNING": "🟢", "DEGRADED": "🟡", "EMERGENCY": "🔴"}
    icon = status_icons.get(state.status, "⚪")
    print(f"{icon} Status: {state.status}")
    print(f"Cycles: {state.cycle_count}")
    print(f"Last cycle: {state.last_cycle or 'never'}")
    print(f"Last success: {state.last_success or 'never'}")
    print(f"Last failure: {state.last_failure or 'never'}")
    print(f"Failures: {state.consecutive_failures}")
    print(f"Entities: {ent_count} total (last cycle: {state.entities_extracted_last}, {state.entities_new_last} new)")
    print(f"Relationships:{rel_count} total")
    if state.quality_metrics:
        qm = state.quality_metrics
        print(f"Quality: unknown_rate={qm.get('unknown_rate', '?')}% llm_success={qm.get('llm_success_rate', '?')}% avg_ent/event={qm.get('avg_entities_per_event', '?')}")
    print(f"Ollama: {state.ollama_status}")
    if state.perf:
        print(f"Last perf: {state.perf}")
    if state.error:
        print(f"Error: {state.error}")
|
|
|
|
|
|
# ── Main Loop ────────────────────────────────────────────────────────────────
|
|
|
|
def _ms_since(t0: float) -> int:
|
|
return int((time.monotonic() - t0) * 1000)
|
|
|
|
|
|
def run_cycle(state: LoopState) -> bool:
    """Run one complete pipeline cycle. Returns True on success.

    Executes INGEST → EXTRACT → BRIDGE → VERIFY while timing each step,
    records perf/state, and routes failures through step_report(). A
    cycle with no new events short-circuits cheaply and still counts as
    success.
    """
    log.info(f"═══ CYCLE {state.cycle_count + 1} START ═══")
    step_results = {}
    perf = {}
    t_cycle = time.monotonic()

    try:
        t0 = time.monotonic()
        ingest = step_ingest(state)
        perf["ingest_ms"] = _ms_since(t0)
        step_results["ingest"] = {"events": len(ingest["events"]), "scanned": ingest["total_scanned"]}

        # Early skip if no new events
        if ingest.get("skip_reason") == "no_new_events":
            perf["total_ms"] = _ms_since(t_cycle)
            state.record_perf(perf)
            state.save()
            log.info(f"═══ CYCLE SKIPPED (no new events) — {perf['total_ms']}ms ═══")
            return True

        t0 = time.monotonic()
        extract = step_extract(state, ingest["events"])
        perf["extract_ms"] = _ms_since(t0)
        step_results["extract"] = extract

        t0 = time.monotonic()
        bridge = step_bridge(state)
        perf["bridge_ms"] = _ms_since(t0)
        step_results["bridge"] = bridge

        t0 = time.monotonic()
        verify = step_verify(state, extract)
        perf["verify_ms"] = _ms_since(t0)
        step_results["verify"] = verify

        perf["total_ms"] = _ms_since(t_cycle)
        state.record_perf(perf)

        # Only hard failures (broken/missing/corrupt) fail the cycle; soft
        # quality warnings leave the loop in RUNNING.
        if verify["verdict"] == "FAIL" and any("broken" in i or "missing" in i or "corrupt" in i for i in verify["issues"]):
            state.record_failure("verify", "; ".join(verify["issues"]))
            step_report(state, verify)
            return False

        state.record_success(step_results)
        avgs = state.perf_averages()
        log.info(f"═══ CYCLE {state.cycle_count} DONE — {state.status} — {perf['total_ms']}ms (avg {avgs.get('total_ms', '?')}ms) ═══")
        log.info(f" Perf: ingest={perf.get('ingest_ms')}ms extract={perf.get('extract_ms')}ms bridge={perf.get('bridge_ms')}ms verify={perf.get('verify_ms')}ms")

        # Clear any stale alert flag now that a cycle succeeded.
        flag = LOG_DIR / "darkplex-loop-alert.flag"
        if flag.exists():
            flag.unlink()

        return True

    except Exception as e:
        # Still record the (partial) timing for the failed cycle.
        perf["total_ms"] = _ms_since(t_cycle)
        state.record_perf(perf)
        # Whichever step is missing from step_results is the one that raised.
        step_name = "unknown"
        for name in ["ingest", "extract", "bridge", "verify"]:
            if name not in step_results:
                step_name = name
                break
        log.error(f"CYCLE FAILED at {step_name}: {e}")
        log.error(traceback.format_exc())
        state.record_failure(step_name, str(e)[:300])
        step_report(state, {"issues": [str(e)]})
        return False
|
|
|
|
|
|
def main():
    """CLI entry point for `darkplex loop`.

    Flags: --status (print state), --check (exit 0=new/1=none/2=error),
    --once (single cycle), --cycle N (interval seconds).
    """
    # Create the log directory BEFORE configuring logging: FileHandler opens
    # its file immediately, so the previous order (mkdir after basicConfig)
    # raised FileNotFoundError on the very first run.
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.FileHandler(LOG_DIR / "darkplex-loop.log"),
            logging.StreamHandler(),
        ],
    )

    args = sys.argv[1:]

    # --status: print a state snapshot and exit.
    if "--status" in args:
        print_status()
        return

    # --check: exit code tells the caller whether new events are pending.
    if "--check" in args:
        pending = check_new_events()
        if pending > 0:
            print(f"NEW: {pending} events pending")
            sys.exit(0)
        elif pending == 0:
            print("NONE: No new events")
            sys.exit(1)
        else:
            print("ERROR: Could not check")
            sys.exit(2)

    once = "--once" in args
    cycle_seconds = DEFAULT_CYCLE_SECONDS

    for i, arg in enumerate(args):
        if arg == "--cycle" and i + 1 < len(args):
            cycle_seconds = int(args[i + 1])

    state = LoopState()
    log.info(f"Darkplex Loop starting — cycle every {cycle_seconds}s, once={once}")

    running = True

    def handle_signal(sig, frame):
        # Flip the flag; the sliced sleep below notices within one second.
        nonlocal running
        log.info("Shutdown signal received")
        running = False

    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)

    while running:
        run_cycle(state)

        if once:
            break

        log.info(f"Sleeping {cycle_seconds}s until next cycle...")
        # Sleep in 1s slices so SIGTERM/SIGINT can interrupt promptly.
        for _ in range(cycle_seconds):
            if not running:
                break
            time.sleep(1)

    log.info("Darkplex Loop stopped")
|