#!/usr/bin/env python3 """ LLM-Powered Entity Extractor — Uses Ollama for Named Entity Recognition. Standalone module. No pip dependencies beyond stdlib. Calls Ollama HTTP API with structured NER prompts. Configuration via environment variables: DARKPLEX_OLLAMA_URL — Ollama base URL (default: http://localhost:11434) DARKPLEX_OLLAMA_MODEL — Model name (default: mistral:7b) DARKPLEX_OLLAMA_TIMEOUT — Timeout in seconds (default: 10) DARKPLEX_EXTRACTOR — llm|regex|auto (default: auto) """ import json import logging import os import urllib.request import urllib.error log = logging.getLogger("llm-extractor") OLLAMA_URL = os.environ.get("DARKPLEX_OLLAMA_URL", "http://localhost:11434") OLLAMA_MODEL = os.environ.get("DARKPLEX_OLLAMA_MODEL", "mistral:7b") OLLAMA_TIMEOUT = int(os.environ.get("DARKPLEX_OLLAMA_TIMEOUT", "30")) VALID_TYPES = {"person", "organization", "company", "project", "technology", "location", "event", "concept", "product"} NER_PROMPT = """Extract all named entities from the text below. Return ONLY a JSON object. Each key is the entity name (lowercase), each value has "type" and "context". Valid types: person, organization, company, project, technology, location, event, concept, product Rules: - Skip common/generic words (the, system, message, etc.) - Entity names should be lowercase, use hyphens for multi-word - "context" is a 2-5 word description of the entity's role in the text - If no entities found, return empty JSON object - Return ONLY valid JSON, no explanation Text: {text} JSON:""" BATCH_PROMPT = """Extract all named entities from these texts. Return ONLY a JSON object. Each key is the entity name (lowercase, hyphens for spaces), each value has "type" and "context". Valid types: person, organization, company, project, technology, location, event, concept, product Rules: - Skip common/generic words - "context" is a 2-5 word description - If no entities found, return empty JSON object - Return ONLY valid JSON, no markdown, no explanation Texts: {texts} JSON:""" def _call_ollama(prompt: str) -> str | None: """Call Ollama generate API. Returns response text or None on failure.""" payload = json.dumps({ "model": OLLAMA_MODEL, "prompt": prompt, "stream": False, "options": {"temperature": 0.1, "num_predict": 1024}, }).encode() req = urllib.request.Request( f"{OLLAMA_URL}/api/generate", data=payload, headers={"Content-Type": "application/json"}, method="POST", ) try: with urllib.request.urlopen(req, timeout=OLLAMA_TIMEOUT) as resp: data = json.loads(resp.read().decode()) return data.get("response", "") except (urllib.error.URLError, TimeoutError, OSError) as e: log.warning(f"Ollama call failed: {e}") return None except Exception as e: log.warning(f"Ollama unexpected error: {e}") return None def _parse_json_response(text: str) -> dict: """Extract JSON dict from LLM response, handling markdown fences etc.""" if not text: return {} # Strip markdown code fences text = text.strip() if text.startswith("```"): lines = text.split("\n") lines = [l for l in lines if not l.strip().startswith("```")] text = "\n".join(lines) # Find the JSON object start = text.find("{") if start == -1: return {} # Find matching closing brace depth = 0 for i in range(start, len(text)): if text[i] == "{": depth += 1 elif text[i] == "}": depth -= 1 if depth == 0: try: return json.loads(text[start:i + 1]) except json.JSONDecodeError: return {} return {} def _normalize_entities(raw: dict) -> dict: """Normalize and validate extracted entities.""" result = {} for name, info in raw.items(): if not isinstance(info, dict): continue name = name.strip().lower().replace("_", "-").replace(" ", "-") if len(name) < 2 or len(name) > 80: continue etype = info.get("type", "unknown").lower().strip() if etype not in VALID_TYPES: # Map common aliases aliases = {"org": "organization", "tech": "technology", "loc": "location", "place": "location", "tool": "technology", "framework": "technology", "language": "technology", "app": "product", "software": "product", "service": "product", "group": "organization", "team": "organization"} etype = aliases.get(etype, "concept") context = info.get("context", "") if isinstance(context, str): context = context[:100] else: context = "" result[name] = {"type": etype, "context": context, "match": "llm"} return result def extract_entities_llm(text: str) -> dict[str, dict] | None: """ Extract entities from text using Ollama LLM. Returns dict of {name: {type, context, match}} or None if LLM unavailable. None signals caller to fall back to regex. """ if not text or len(text) < 10: return {} # Truncate very long texts if len(text) > 2000: text = text[:2000] prompt = NER_PROMPT.format(text=text) response = _call_ollama(prompt) if response is None: return None # Signal fallback raw = _parse_json_response(response) return _normalize_entities(raw) def extract_entities_llm_batch(texts: list[str]) -> dict[str, dict] | None: """ Extract entities from multiple texts in one LLM call. Returns combined dict or None if LLM unavailable. """ if not texts: return {} # Filter and truncate clean = [] for t in texts: if t and len(t) >= 10: clean.append(t[:500] if len(t) > 500 else t) if not clean: return {} # Limit batch size to keep prompt reasonable if len(clean) > 10: clean = clean[:10] numbered = "\n".join(f"[{i+1}] {t}" for i, t in enumerate(clean)) prompt = BATCH_PROMPT.format(texts=numbered) response = _call_ollama(prompt) if response is None: return None raw = _parse_json_response(response) return _normalize_entities(raw) def is_available() -> bool: """Check if Ollama is reachable.""" try: req = urllib.request.Request(f"{OLLAMA_URL}/api/tags", method="GET") with urllib.request.urlopen(req, timeout=3) as resp: return resp.status == 200 except Exception: return False