# NOTE(review): the following lines are CI/web-UI residue accidentally pasted
# into this file; preserved as comments so the module remains importable.
# CI: Some checks failed — Tests / test (push) failing after 2s.
# Change summary: Merged all unique darkplex-core modules into cortex:
#   intelligence/ subfolder (anticipator, collective, shared_memory,
#   knowledge_cleanup, temporal, llm_extractor, loop); governance/ subfolder
#   (policy engine, risk scorer, evidence, enforcer, report generator);
#   entity_manager.py, knowledge_extractor.py. Fixed bare 'from intelligence.'
#   imports to 'from cortex.intelligence.'. Added 'darkplex' CLI alias
#   alongside 'cortex'. Package renamed to darkplex-core v0.2.0.
#   405 tests passing (was 234); 14 new test files covering merged modules.
# File metadata: 147 lines, 5.3 KiB, Python.
"""Tests for intelligence/llm_extractor.py — LLM-Powered Entity Extractor."""

import json
import sys
from pathlib import Path
from unittest.mock import patch, MagicMock

# Make the darkplex-core intelligence modules importable without installation.
sys.path.insert(0, str(Path.home() / "repos" / "darkplex-core" / "intelligence"))

from llm_extractor import (
    _parse_json_response,
    _normalize_entities,
    extract_entities_llm,
    extract_entities_llm_batch,
    is_available,
    VALID_TYPES,
)
|
|
|
|
|
|
class TestParseJsonResponse:
    """Behaviour of the internal JSON-response parser."""

    def test_empty(self):
        # Both empty string and None degrade to an empty dict.
        for blank in ("", None):
            assert _parse_json_response(blank) == {}

    def test_plain_json(self):
        payload = '{"albert": {"type": "person", "context": "CEO"}}'
        parsed = _parse_json_response(payload)
        assert "albert" in parsed

    def test_markdown_fenced(self):
        # JSON wrapped in a ```json fence must still be recovered.
        fenced = '```json\n{"albert": {"type": "person", "context": "CEO"}}\n```'
        parsed = _parse_json_response(fenced)
        assert "albert" in parsed

    def test_no_json(self):
        assert _parse_json_response("no json here") == {}

    def test_nested_braces(self):
        parsed = _parse_json_response('{"a": {"type": "person", "context": "test"}}')
        assert "a" in parsed
|
|
|
|
|
|
class TestNormalizeEntities:
    """Normalization of raw LLM output into canonical entity records."""

    def test_valid_entity(self):
        raw = {"Albert": {"type": "person", "context": "CEO of company"}}
        normalized = _normalize_entities(raw)
        # Names are lowercased and tagged as LLM-sourced matches.
        assert "albert" in normalized
        assert normalized["albert"]["type"] == "person"
        assert normalized["albert"]["match"] == "llm"

    def test_type_alias(self):
        # "language" is an alias mapped onto the canonical "technology" type.
        normalized = _normalize_entities(
            {"python": {"type": "language", "context": "programming"}}
        )
        assert normalized["python"]["type"] == "technology"

    def test_unknown_type_becomes_concept(self):
        normalized = _normalize_entities(
            {"thing": {"type": "xyzzy", "context": "unknown"}}
        )
        assert normalized["thing"]["type"] == "concept"

    def test_filters_short_names(self):
        # Single-character names are dropped.
        normalized = _normalize_entities({"x": {"type": "person", "context": "test"}})
        assert len(normalized) == 0

    def test_filters_long_names(self):
        # Names over 80 characters are dropped.
        too_long = "a" * 81
        normalized = _normalize_entities(
            {too_long: {"type": "person", "context": "test"}}
        )
        assert len(normalized) == 0

    def test_non_dict_info_skipped(self):
        # An entity whose info payload is not a dict is silently skipped.
        normalized = _normalize_entities({"test": "not a dict"})
        assert len(normalized) == 0

    def test_context_truncated(self):
        normalized = _normalize_entities(
            {"test": {"type": "person", "context": "x" * 200}}
        )
        # Context strings are capped at 100 characters.
        assert len(normalized["test"]["context"]) <= 100

    def test_underscores_to_hyphens(self):
        normalized = _normalize_entities(
            {"mondo_gate": {"type": "company", "context": "test"}}
        )
        assert "mondo-gate" in normalized
|
|
|
|
|
|
class TestExtractEntitiesLlm:
    """Single-text extraction entry point, with the Ollama call mocked out."""

    @patch("llm_extractor._call_ollama")
    def test_empty_text(self, ollama):
        # Texts below the minimum length never reach the LLM at all.
        assert extract_entities_llm("") == {}
        assert extract_entities_llm("short") == {}
        ollama.assert_not_called()

    @patch("llm_extractor._call_ollama")
    def test_ollama_unavailable(self, ollama):
        ollama.return_value = None
        outcome = extract_entities_llm("This is a test about Albert and Mondo Gate AG")
        # None (not {}) tells the caller to fall back to another extractor.
        assert outcome is None  # signals fallback

    @patch("llm_extractor._call_ollama")
    def test_successful_extraction(self, ollama):
        ollama.return_value = '{"albert": {"type": "person", "context": "mentioned"}}'
        outcome = extract_entities_llm(
            "Albert discussed the project with the team members today"
        )
        assert "albert" in outcome
        assert outcome["albert"]["type"] == "person"

    @patch("llm_extractor._call_ollama")
    def test_truncates_long_text(self, ollama):
        ollama.return_value = "{}"
        extract_entities_llm("x" * 3000)
        prompt = ollama.call_args[0][0]
        # The text embedded in the prompt should have been truncated.
        assert len(prompt) < 3000 + 500  # prompt overhead
|
|
|
|
|
|
class TestExtractEntitiesLlmBatch:
    """Batch extraction over multiple texts at once."""

    @patch("llm_extractor._call_ollama")
    def test_empty_list(self, ollama):
        assert extract_entities_llm_batch([]) == {}
        ollama.assert_not_called()

    @patch("llm_extractor._call_ollama")
    def test_filters_short_texts(self, ollama):
        ollama.return_value = "{}"
        # Every candidate is below the minimum length, so nothing is sent.
        outcome = extract_entities_llm_batch(["hi", "yo", ""])
        assert outcome == {}
        ollama.assert_not_called()

    @patch("llm_extractor._call_ollama")
    def test_batch_extraction(self, ollama):
        ollama.return_value = '{"python": {"type": "technology", "context": "language"}}'
        outcome = extract_entities_llm_batch(
            ["Python is a great programming language for data science"]
        )
        assert "python" in outcome
|
|
|
|
|
|
class TestIsAvailable:
    """Reachability probe for the local Ollama HTTP endpoint."""

    @patch("llm_extractor.urllib.request.urlopen")
    def test_available(self, urlopen):
        # Build a fake context manager yielding an HTTP 200 response;
        # __enter__ must return the response itself so .status is visible.
        response = MagicMock()
        response.status = 200
        response.__enter__ = MagicMock(return_value=response)
        response.__exit__ = MagicMock(return_value=False)
        urlopen.return_value = response
        assert is_available() is True

    @patch("llm_extractor.urllib.request.urlopen")
    def test_unavailable(self, urlopen):
        # Any network error must map to a clean False, not an exception.
        urlopen.side_effect = Exception("connection refused")
        assert is_available() is False
|