From 2e40f00922a560139fa2260a382a8c51bb48fb60 Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Fri, 26 Jun 2026 12:58:34 -0800 Subject: [PATCH] chore: reverts judge message stripping --- .../ldai_langchain/langchain_model_runner.py | 36 ++- .../langchain_runner_factory.py | 3 +- .../tests/test_langchain_provider.py | 29 ++- .../src/ldai_openai/openai_model_runner.py | 35 ++- .../src/ldai_openai/openai_runner_factory.py | 5 +- .../tests/test_openai_provider.py | 32 +++ packages/sdk/server-ai/src/ldai/client.py | 16 +- .../sdk/server-ai/src/ldai/judge/__init__.py | 79 +++--- .../sdk/server-ai/src/ldai/managed_model.py | 31 ++- .../server-ai/src/ldai/providers/runner.py | 7 +- packages/sdk/server-ai/tests/test_judge.py | 226 +++++++----------- 11 files changed, 254 insertions(+), 245 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py index bc90f425..c7106f87 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py @@ -2,7 +2,7 @@ from langchain_core.chat_history import InMemoryChatMessageHistory from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import BaseMessage, HumanMessage +from langchain_core.messages import BaseMessage from ldai import LDMessage, log from ldai.providers.runner import Runner from ldai.providers.types import LDAIMetrics, RunnerResult @@ -28,13 +28,10 @@ class LangChainModelRunner(Runner): def __init__( self, llm: BaseChatModel, - config_messages: Optional[List[LDMessage]] = None, multi_turn: bool = True, ): self._llm = llm - self._chat_history = InMemoryChatMessageHistory( - messages=cast(List[BaseMessage], convert_messages_to_langchain(config_messages or [])) - ) + self._chat_history = InMemoryChatMessageHistory() self._multi_turn = multi_turn def get_llm(self) -> BaseChatModel: @@ -47,32 +44,53 @@ def get_llm(self) -> BaseChatModel: async def run( self, - input: str, + input: Any, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ Run the LangChain model with the given input. - :param input: A string prompt + :param input: A string prompt or a list of :class:`LDMessage` objects. + When a list is provided it is used as the complete message set. + When a string is provided it is appended to the conversation history. :param output_type: Optional JSON schema dict requesting structured output. When provided, ``parsed`` on the returned :class:`RunnerResult` is populated with the parsed JSON document. :return: :class:`RunnerResult` containing ``content``, ``metrics``, ``raw`` and (when ``output_type`` is set) ``parsed``. """ - langchain_messages = self._chat_history.messages + [HumanMessage(content=input)] + coerced = self._coerce_input(input) + + if isinstance(input, list): + langchain_messages = cast(List[BaseMessage], convert_messages_to_langchain(coerced)) + else: + langchain_messages = self._chat_history.messages + cast( + List[BaseMessage], convert_messages_to_langchain(coerced) + ) if output_type is not None: result = await self._run_structured(langchain_messages, output_type) else: result = await self._run_completion(langchain_messages) - if result.metrics.success and result.content and self._multi_turn: + if result.metrics.success and result.content and self._multi_turn and isinstance(input, str): self._chat_history.add_user_message(input) self._chat_history.add_ai_message(result.content) return result + # convert_messages_to_langchain only accepts List[LDMessage]; _coerce_input + # normalizes a bare string to [LDMessage(role='user', ...)] before that step. + @staticmethod + def _coerce_input(input: Any) -> List[LDMessage]: + if isinstance(input, str): + return [LDMessage(role='user', content=input)] + if isinstance(input, list): + return input + raise TypeError( + f"Unsupported input type for LangChainModelRunner.run: {type(input).__name__}" + ) + async def _run_completion(self, messages: List[BaseMessage]) -> RunnerResult: try: response: BaseMessage = await self._llm.ainvoke(messages) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py index 0a7005a2..7328f669 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py @@ -72,5 +72,4 @@ def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> LangCha :return: LangChainModelRunner ready to invoke the model """ llm = create_langchain_model(config) - config_messages = list(getattr(config, 'messages', None) or []) - return LangChainModelRunner(llm, config_messages, multi_turn=multi_turn) + return LangChainModelRunner(llm, multi_turn=multi_turn) diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py index 0b8c0f14..f024c7e5 100644 --- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py @@ -332,23 +332,22 @@ async def test_does_not_accumulate_history_on_failed_call(self, mock_llm): assert second_call_messages[0].content == 'Try again' @pytest.mark.asyncio - async def test_prepends_config_messages_before_history(self, mock_llm): - """Should send config messages before history on every call.""" - mock_llm.ainvoke = AsyncMock(side_effect=[ - AIMessage(content='Answer 1'), - AIMessage(content='Answer 2'), - ]) - config_messages = [LDMessage(role='system', content='You are helpful.')] - provider = LangChainModelRunner(mock_llm, config_messages=config_messages) + async def test_passes_list_input_directly_without_prepending_history(self, mock_llm): + """When a list[LDMessage] is passed, it is used as-is without prepending chat history.""" + mock_llm.ainvoke = AsyncMock(return_value=AIMessage(content='Answer')) + provider = LangChainModelRunner(mock_llm) - await provider.run('Q1') - await provider.run('Q2') + messages = [ + LDMessage(role='system', content='You are helpful.'), + LDMessage(role='user', content='Q1'), + ] + await provider.run(messages) - second_call_messages = mock_llm.ainvoke.call_args_list[1][0][0] - assert second_call_messages[0].content == 'You are helpful.' - assert second_call_messages[1].content == 'Q1' - assert second_call_messages[2].content == 'Answer 1' - assert second_call_messages[3].content == 'Q2' + first_call_messages = mock_llm.ainvoke.call_args_list[0][0][0] + assert len(first_call_messages) == 2 + assert first_call_messages[0].content == 'You are helpful.' + assert first_call_messages[1].content == 'Q1' + assert len(provider._chat_history.messages) == 0 class TestRunStructured: diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py index 358f34b3..dcd5c70c 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py @@ -28,44 +28,63 @@ def __init__( client: AsyncOpenAI, model_name: str, parameters: Dict[str, Any], - config_messages: Optional[List[LDMessage]] = None, multi_turn: bool = True, ): self._client = client self._model_name = model_name self._parameters = parameters - self._history: List[LDMessage] = list(config_messages or []) + self._history: List[LDMessage] = [] self._multi_turn = multi_turn async def run( self, - input: str, + input: Any, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ Run the OpenAI model with the given input. - :param input: A string prompt + :param input: A string prompt or a list of :class:`LDMessage` objects. + When a list is provided it is used as the complete message set. + When a string is provided it is appended to the conversation history. :param output_type: Optional JSON schema dict requesting structured output. When provided, ``parsed`` on the returned :class:`RunnerResult` is populated with the parsed JSON document. :return: :class:`RunnerResult` containing ``content``, ``metrics``, ``raw`` and (when ``output_type`` is set) ``parsed``. """ - user_message = LDMessage(role='user', content=input) - messages = self._history + [user_message] + try: + coerced = self._coerce_input(input) + except TypeError as error: + log.warning(f'OpenAI model runner received unsupported input type: {error}') + return RunnerResult(content='', metrics=LDAIMetrics(success=False, usage=None)) + + if isinstance(input, list): + messages = coerced + else: + messages = self._history + coerced if output_type is not None: result = await self._run_structured(messages, output_type) else: result = await self._run_completion(messages) - if result.metrics.success and result.content and self._multi_turn: - self._history.append(user_message) + if result.metrics.success and result.content and self._multi_turn and isinstance(input, str): + self._history.append(LDMessage(role='user', content=input)) self._history.append(LDMessage(role='assistant', content=result.content)) return result + @staticmethod + def _coerce_input(input: Any) -> List[LDMessage]: + if isinstance(input, str): + return [LDMessage(role='user', content=input)] + if isinstance(input, list): + return input + raise TypeError( + f"Unsupported input type for OpenAIModelRunner.run: {type(input).__name__}" + ) + async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult: try: response = await self._client.chat.completions.create( diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py index 275c6ecc..8dc76808 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py @@ -103,10 +103,7 @@ def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> OpenAIM tool_defs = parameters.pop('tools', None) or [] if tool_defs: parameters['tools'] = normalize_tool_types(tool_defs) - config_messages = list(getattr(config, 'messages', None) or []) - return OpenAIModelRunner( - self._client, model_name, parameters, config_messages, multi_turn=multi_turn - ) + return OpenAIModelRunner(self._client, model_name, parameters, multi_turn=multi_turn) def get_client(self) -> AsyncOpenAI: """ diff --git a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py index 195149b0..88c5adea 100644 --- a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py +++ b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py @@ -293,6 +293,38 @@ def make_response(text: str): assert len(provider._history) == baseline_len + 4 + @pytest.mark.asyncio + async def test_passes_list_input_directly_without_prepending_history(self, mock_client): + """When a list[LDMessage] is passed, it is used as-is without history prepend.""" + from ldai import LDMessage as _LDMsg + + def make_response(text: str): + r = MagicMock() + r.context_wrapper = None + r.choices = [MagicMock()] + r.choices[0].message = MagicMock() + r.choices[0].message.content = text + r.usage = None + return r + + mock_client.chat = MagicMock() + mock_client.chat.completions = MagicMock() + mock_client.chat.completions.create = AsyncMock(return_value=make_response('Answer')) + + provider = OpenAIModelRunner(mock_client, 'gpt-4o', {}) + messages = [ + _LDMsg(role='system', content='You are helpful.'), + _LDMsg(role='user', content='Q1'), + ] + await provider.run(messages) + + call_messages = mock_client.chat.completions.create.call_args.kwargs['messages'] + assert call_messages == [ + {'role': 'system', 'content': 'You are helpful.'}, + {'role': 'user', 'content': 'Q1'}, + ] + assert provider._history == [] + @pytest.mark.asyncio async def test_does_not_accumulate_history_on_failed_call(self, mock_client): """Should not add to history when the call fails.""" diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index 623ff825..a11992f6 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -8,7 +8,7 @@ from ldai import log from ldai.agent_graph import AgentGraphDefinition from ldai.evaluator import Evaluator -from ldai.judge import Judge, _strip_legacy_judge_messages +from ldai.judge import Judge from ldai.managed_agent import ManagedAgent from ldai.managed_agent_graph import ManagedAgentGraph from ldai.managed_model import ManagedModel @@ -203,17 +203,9 @@ def _judge_config( "The variable 'response_to_evaluate' is reserved by the judge and will be ignored." ) - # Re-inject the reserved variables as their literal placeholders so they - # survive Mustache interpolation in ``__evaluate``. Without this, legacy - # templates like ``{{message_history}}`` get rendered to empty strings and - # ``_strip_legacy_judge_messages`` below cannot detect them. - extended_variables = dict(variables) if variables else {} - extended_variables['message_history'] = '{{message_history}}' - extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}' - (model, provider, messages, instructions, tracker_factory, enabled, judge_configuration, variation) = self.__evaluate( - key, context, default.to_dict(), extended_variables + key, context, default.to_dict(), variables ) def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]: @@ -233,10 +225,6 @@ def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]: evaluation_metric_key = _extract_evaluation_metric_key(variation) - # strip legacy judge template messages before creating config - if messages: - messages = _strip_legacy_judge_messages(messages) - config = AIJudgeConfig( key=key, enabled=bool(enabled), diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index 6ae7d90d..8ab654b6 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -3,6 +3,8 @@ import random from typing import Any, Dict, List, Optional, Tuple +import chevron + from ldai import log from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import AIJudgeConfig, LDMessage @@ -10,30 +12,6 @@ from ldai.providers.types import JudgeResult, RunnerResult -def _strip_legacy_judge_messages(messages: List[LDMessage]) -> List[LDMessage]: - """ - Remove legacy judge template messages from a message list. - - Strips any non-system message whose content contains ``{{message_history}}`` - or ``{{response_to_evaluate}}``. These were used by older judge configs to - indicate where the SDK should interpolate the evaluated conversation; new - configs omit them entirely and rely on the string input built by - :meth:`Judge._build_evaluation_input`. - - :param messages: The raw message list from the judge AI config. - :return: A new list with legacy template messages removed. - """ - result = [] - for msg in messages: - if msg.role != 'system' and ( - '{{message_history}}' in msg.content - or '{{response_to_evaluate}}' in msg.content - ): - continue - result.append(msg) - return result - - class Judge: """ Judge implementation that handles evaluation functionality and conversation management. @@ -87,6 +65,11 @@ async def evaluate( judge_result.error_message = 'Judge configuration is missing required evaluationMetricKey' return judge_result + if not self._ai_config.messages: + log.warning('Judge configuration must include messages') + judge_result.error_message = 'Judge configuration must include messages' + return judge_result + if random.random() > effective_rate: log.debug(f'Judge evaluation skipped due to sampling rate: {effective_rate}') return judge_result @@ -94,12 +77,12 @@ async def evaluate( judge_result.sampled = True tracker = self._ai_config.create_tracker() - evaluation_input = self._build_evaluation_input(input_text, output_text) + messages = self._construct_evaluation_messages(input_text, output_text) assert self._evaluation_response_structure is not None response = await tracker.track_metrics_of_async( lambda result: result.metrics, - lambda: self._model_runner.run(evaluation_input, output_type=self._evaluation_response_structure), + lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure), ) if response.parsed is None: @@ -132,19 +115,13 @@ async def evaluate_messages( """ Evaluates an AI response from chat messages and response. - The conversation is rendered for the judge by joining each message as - ``"{role}: {content}"`` on newlines, preserving who said what so the - judge can distinguish user turns from assistant turns. - :param messages: Array of messages representing the conversation history :param response: The runner result to be evaluated :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed. When ``None`` (the default), falls back to ``self.sample_rate``. :return: The result of the judge evaluation. """ - input_text = ( - '\n'.join(f'{msg.role}: {msg.content}' for msg in messages) if messages else '' - ) + input_text = '\r\n'.join([msg.content for msg in messages]) if messages else '' output_text = response.content return await self.evaluate(input_text, output_text, sampling_ratio) @@ -165,8 +142,40 @@ def get_model_runner(self) -> Runner: """ return self._model_runner - def _build_evaluation_input(self, input_text: str, output_text: str) -> str: - return f"MESSAGE HISTORY:\n{input_text}\n\nRESPONSE TO EVALUATE:\n{output_text}" + def _construct_evaluation_messages(self, input_text: str, output_text: str) -> List[LDMessage]: + """ + Constructs evaluation messages by combining judge's config messages with input/output. + + Iterates all messages from the judge's AI config and interpolates + ``{{message_history}}`` with ``input_text`` and ``{{response_to_evaluate}}`` + with ``output_text`` using Mustache templating. + + :param input_text: The input text (conversation history) + :param output_text: The output text to evaluate + :return: List of messages for evaluation + """ + if not self._ai_config.messages: + return [] + + messages: List[LDMessage] = [] + for msg in self._ai_config.messages: + content = self._interpolate_message(msg.content, { + 'message_history': input_text, + 'response_to_evaluate': output_text, + }) + messages.append(LDMessage(role=msg.role, content=content)) + + return messages + + def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str: + """ + Interpolates message content with variables using Mustache templating. + + :param content: The message content template + :param variables: Variables to interpolate + :return: Interpolated message content + """ + return chevron.render(content, variables) def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]: """ diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py index 586b8c16..3eab7929 100644 --- a/packages/sdk/server-ai/src/ldai/managed_model.py +++ b/packages/sdk/server-ai/src/ldai/managed_model.py @@ -2,7 +2,7 @@ from typing import List from ldai import log -from ldai.models import AICompletionConfig +from ldai.models import AICompletionConfig, LDMessage from ldai.providers.runner import Runner from ldai.providers.types import JudgeResult, ManagedResult, RunnerResult from ldai.tracker import LDAIConfigTracker @@ -12,10 +12,9 @@ class ManagedModel: """ LaunchDarkly managed wrapper for AI model invocations. - Holds a Runner. Handles judge evaluation dispatch and tracking - automatically via ``create_tracker()``. Conversation history is - managed by the runner. Obtain an instance via - ``LDAIClient.create_model()``. + Holds a Runner. Handles conversation management, judge evaluation + dispatch, and tracking automatically via ``create_tracker()``. + Obtain an instance via ``LDAIClient.create_model()``. """ def __init__( @@ -25,13 +24,15 @@ def __init__( ): self._ai_config = ai_config self._model_runner = model_runner + self._messages: List[LDMessage] = [] async def run(self, prompt: str) -> ManagedResult: """ Run the model with a prompt string. - Delegates to the runner, then dispatches judge evaluations and - records tracking metrics. + Appends the prompt to the conversation history, prepends any + system messages from the config, delegates to the runner, and + appends the response to the history. :param prompt: The user prompt to send to the model :return: ManagedResult containing the model's response, metric summary, @@ -39,12 +40,24 @@ async def run(self, prompt: str) -> ManagedResult: """ tracker = self._ai_config.create_tracker() + user_message = LDMessage(role='user', content=prompt) + self._messages.append(user_message) + + config_messages = self._ai_config.messages or [] + all_messages = config_messages + self._messages + result: RunnerResult = await tracker.track_metrics_of_async( lambda r: r.metrics, - lambda: self._model_runner.run(prompt), + lambda: self._model_runner.run(all_messages), ) - evaluations_task = await self._track_judge_results(tracker, prompt, result.content) + assistant_message = LDMessage(role='assistant', content=result.content) + + input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else '' + + evaluations_task = await self._track_judge_results(tracker, input_text, result.content) + + self._messages.append(assistant_message) return ManagedResult( content=result.content, diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py index 36f47915..247ee2ef 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner.py @@ -16,13 +16,14 @@ class Runner(Protocol): async def run( self, - input: str, + input: Any, output_type: Optional[Dict[str, Any]] = None, ) -> RunnerResult: """ - Execute the runner with the given input string. + Execute the runner with the given input. - :param input: The string input to the runner. + :param input: The input to the runner. May be a string prompt or a + list of :class:`~ldai.models.LDMessage` objects. :param output_type: Optional JSON schema for structured output. :return: RunnerResult containing content, metrics, raw, and parsed fields. """ diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index b1f88fa2..8c0f60e5 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -8,7 +8,7 @@ from ldclient.integrations.test_data import TestData from ldai import LDAIClient -from ldai.judge import Judge, _strip_legacy_judge_messages +from ldai.judge import Judge from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder from ldai.models import ( AIJudgeConfig, @@ -113,69 +113,6 @@ def judge_config_without_messages(tracker) -> AIJudgeConfig: return _make_judge_config(messages=None, tracker=tracker) -class TestStripLegacyJudgeMessages: - """Tests for the _strip_legacy_judge_messages helper.""" - - def test_strips_assistant_message_with_message_history(self): - """Non-system messages containing {{message_history}} should be removed.""" - messages = [ - LDMessage(role='system', content='You are a judge.'), - LDMessage(role='assistant', content='Here is the history: {{message_history}}'), - ] - result = _strip_legacy_judge_messages(messages) - assert len(result) == 1 - assert result[0].role == 'system' - - def test_strips_user_message_with_response_to_evaluate(self): - """Non-system messages containing {{response_to_evaluate}} should be removed.""" - messages = [ - LDMessage(role='system', content='You are a judge.'), - LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'), - ] - result = _strip_legacy_judge_messages(messages) - assert len(result) == 1 - assert result[0].role == 'system' - - def test_strips_all_legacy_messages(self): - """All non-system template messages should be stripped from a typical legacy config.""" - messages = [ - LDMessage(role='system', content='You are a judge.'), - LDMessage(role='assistant', content='{{message_history}}'), - LDMessage(role='user', content='{{response_to_evaluate}}'), - ] - result = _strip_legacy_judge_messages(messages) - assert len(result) == 1 - assert result[0].role == 'system' - - def test_does_not_strip_system_message_containing_template_vars(self): - """System messages are never stripped, even if they contain template variable names.""" - messages = [ - LDMessage(role='system', content='Judge using {{message_history}} and {{response_to_evaluate}}.'), - ] - result = _strip_legacy_judge_messages(messages) - assert len(result) == 1 - assert result[0].role == 'system' - - def test_does_not_strip_non_template_messages(self): - """Non-system messages without template variables are left untouched.""" - messages = [ - LDMessage(role='system', content='You are a judge.'), - LDMessage(role='user', content='This is a regular message.'), - ] - result = _strip_legacy_judge_messages(messages) - assert len(result) == 2 - - def test_returns_empty_list_for_empty_input(self): - """An empty input list should return an empty list.""" - assert _strip_legacy_judge_messages([]) == [] - - def test_new_style_config_system_only_unchanged(self): - """A new-style config with only a system message passes through unchanged.""" - messages = [LDMessage(role='system', content='You are a judge.')] - result = _strip_legacy_judge_messages(messages) - assert result == messages - - class TestJudgeInitialization: """Tests for Judge initialization.""" @@ -225,40 +162,36 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing( mock_runner.run.assert_not_called() @pytest.mark.asyncio - async def test_evaluate_succeeds_when_messages_is_none( - self, judge_config_without_messages: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner + async def test_evaluate_returns_failure_when_messages_is_none( + self, judge_config_without_messages: AIJudgeConfig, mock_runner ): - """Evaluate should proceed (not error early) when messages is None.""" - mock_response = RunnerResult( - content='', - metrics=LDAIMetrics(success=True), - parsed={'score': 0.7, 'reasoning': 'Acceptable response.'}, - ) - mock_runner.run.return_value = mock_response - tracker.track_metrics_of_async = AsyncMock(return_value=mock_response) - - config = _make_judge_config(messages=None, tracker=tracker) - judge = Judge(config, mock_runner) + """Evaluate should return an error result when messages is None.""" + judge = Judge(judge_config_without_messages, mock_runner) result = await judge.evaluate("input text", "output text") assert isinstance(result, JudgeResult) - assert result.sampled is True + assert result.success is False + assert result.sampled is False + assert result.error_message is not None + mock_runner.run.assert_not_called() @pytest.mark.asyncio - async def test_evaluate_passes_string_input_to_runner( + async def test_evaluate_passes_list_of_messages_to_runner( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner ): - """runner.run() should receive the formatted string, NOT a message list.""" + """runner.run() should receive a List[LDMessage], not a plain string.""" mock_response = RunnerResult( content='', metrics=LDAIMetrics(success=True), parsed={'score': 0.85, 'reasoning': 'Good answer.'}, ) mock_runner.run.return_value = mock_response - tracker.track_metrics_of_async = AsyncMock( - side_effect=lambda _metric_fn, fn: fn() - ) + + async def _await_fn(_metric_fn, fn): + return await fn() + + tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn) judge = Judge(judge_config_with_key, mock_runner) await judge.evaluate("What is AI?", "AI is artificial intelligence.") @@ -266,67 +199,82 @@ async def test_evaluate_passes_string_input_to_runner( mock_runner.run.assert_called_once() call_args = mock_runner.run.call_args input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input') - assert isinstance(input_arg, str) - assert "MESSAGE HISTORY:\nWhat is AI?" in input_arg - assert "RESPONSE TO EVALUATE:\nAI is artificial intelligence." in input_arg + assert isinstance(input_arg, list) + assert all(isinstance(m, LDMessage) for m in input_arg) @pytest.mark.asyncio - async def test_evaluate_string_input_format( - self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner + async def test_evaluate_interpolates_message_history_and_response( + self, tracker: LDAIConfigTracker, mock_runner ): - """runner.run() should receive the exact expected string format.""" + """Config messages with {{message_history}} and {{response_to_evaluate}} are interpolated.""" + config = _make_judge_config( + messages=[ + LDMessage(role='system', content='You are a judge.'), + LDMessage(role='assistant', content='{{message_history}}'), + LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'), + ], + tracker=tracker, + ) + mock_response = RunnerResult( content='', metrics=LDAIMetrics(success=True), - parsed={'score': 0.9, 'reasoning': 'Correct.'}, + parsed={'score': 0.75, 'reasoning': 'Mostly relevant.'}, ) mock_runner.run.return_value = mock_response - tracker.track_metrics_of_async = AsyncMock( - side_effect=lambda _metric_fn, fn: fn() - ) - judge = Judge(judge_config_with_key, mock_runner) - await judge.evaluate("hello", "world") + async def _await_fn(_metric_fn, fn): + return await fn() + + tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn) + + judge = Judge(config, mock_runner) + await judge.evaluate("user asked this", "AI said that") call_args = mock_runner.run.call_args - input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input') - expected = "MESSAGE HISTORY:\nhello\n\nRESPONSE TO EVALUATE:\nworld" - assert input_arg == expected + messages_arg: List[LDMessage] = call_args[0][0] if call_args[0] else call_args[1].get('input') + + assert len(messages_arg) == 3 + assert messages_arg[0].role == 'system' + assert messages_arg[0].content == 'You are a judge.' + assert messages_arg[1].role == 'assistant' + assert messages_arg[1].content == 'user asked this' + assert messages_arg[2].role == 'user' + assert messages_arg[2].content == 'Evaluate: AI said that' @pytest.mark.asyncio - async def test_evaluate_legacy_config_passes_string_input_to_runner( + async def test_evaluate_sends_all_config_messages_to_runner( self, tracker: LDAIConfigTracker, mock_runner ): - """ - Judge built directly with legacy messages (bypassing the client) still passes - a formatted string to the runner. Legacy message stripping is the client's - responsibility; the Judge itself does not strip. - """ - legacy_messages = [ - LDMessage(role='system', content='You are a strict judge.'), - LDMessage(role='assistant', content='{{message_history}}'), - LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'), - ] - config = _make_judge_config(messages=legacy_messages, tracker=tracker) + """All config messages (including non-system) must be passed to the runner.""" + config = _make_judge_config( + messages=[ + LDMessage(role='system', content='You are a judge.'), + LDMessage(role='assistant', content='{{message_history}}'), + LDMessage(role='user', content='{{response_to_evaluate}}'), + ], + tracker=tracker, + ) mock_response = RunnerResult( content='', metrics=LDAIMetrics(success=True), - parsed={'score': 0.75, 'reasoning': 'Mostly relevant.'}, + parsed={'score': 0.8, 'reasoning': 'Good.'}, ) mock_runner.run.return_value = mock_response - tracker.track_metrics_of_async = AsyncMock( - side_effect=lambda _metric_fn, fn: fn() - ) + + async def _await_fn(_metric_fn, fn): + return await fn() + + tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn) judge = Judge(config, mock_runner) await judge.evaluate("input", "output") call_args = mock_runner.run.call_args - input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input') - assert isinstance(input_arg, str) - assert "MESSAGE HISTORY:\ninput" in input_arg - assert "RESPONSE TO EVALUATE:\noutput" in input_arg + messages_arg: List[LDMessage] = call_args[0][0] if call_args[0] else call_args[1].get('input') + + assert len(messages_arg) == 3 @pytest.mark.asyncio async def test_evaluate_success_with_valid_response( @@ -545,10 +493,10 @@ async def test_evaluate_messages_calls_evaluate( assert tracker.track_metrics_of_async.called @pytest.mark.asyncio - async def test_evaluate_messages_preserves_roles_in_input( + async def test_evaluate_messages_joins_content_without_role( self, judge_config_with_key: AIJudgeConfig, mock_runner ): - """evaluate_messages must forward role-prefixed lines to evaluate().""" + """evaluate_messages must forward message content (joined by CRLF) to evaluate().""" messages = [ LDMessage(role='user', content='hi'), LDMessage(role='assistant', content='hello'), @@ -561,7 +509,7 @@ async def test_evaluate_messages_preserves_roles_in_input( mock_evaluate.assert_called_once() args, _ = mock_evaluate.call_args - assert args[0] == 'user: hi\nassistant: hello' + assert args[0] == 'hi\r\nhello' assert args[1] == 'reply' @@ -580,9 +528,6 @@ async def test_two_evaluations_do_not_contaminate_history( self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker ): """A judge bound to a non-multi-turn runner must run the same baseline twice.""" - # Stand in a fake runner that records the history it would expose to the - # LLM at the moment run() is called. With multi_turn=False the recorded - # baseline should be identical across calls. seen_baselines: List[List[LDMessage]] = [] class _FakeRunner: @@ -591,7 +536,6 @@ def __init__(self): self._multi_turn = False async def run(self, input, output_type=None): # type: ignore[no-untyped-def] - # Snapshot history as seen at call time. seen_baselines.append(list(self._history)) return RunnerResult( content='ok', @@ -611,22 +555,16 @@ async def _await_fn(_metric_fn, fn): await judge.evaluate('second input', 'second output') assert len(seen_baselines) == 2 - # Both runs see the same baseline (empty in this fake; the point is - # they're equal — no contamination from the prior turn). assert seen_baselines[0] == seen_baselines[1] - # And the runner's history never grew because multi_turn is False. assert runner._history == [] -class TestJudgeConfigStripsLegacyMessages: - """Tests for ``LDAIClient.judge_config()`` legacy-message stripping. +class TestJudgeConfigMessages: + """Tests for LDAIClient.judge_config() preserving all config messages. - Both ``LDAIClient.judge_config()`` (the direct public API) and - ``LDAIClient.create_judge()`` (the wrapper API) reach the same internal - ``_judge_config()`` method. ``_judge_config()`` is responsible for - injecting ``message_history``/``response_to_evaluate`` markers so they - survive Mustache rendering, then stripping any legacy template messages - before returning the config. + judge_config() must NOT strip legacy template messages — the judge itself + is responsible for interpolating {{message_history}} and + {{response_to_evaluate}} when it constructs the evaluation message list. """ @pytest.fixture @@ -637,13 +575,8 @@ def _make_client(self, td: TestData) -> LDAIClient: config = Config('sdk-key', update_processor_class=td, send_events=False) return LDAIClient(LDClient(config=config)) - def test_judge_config_strips_legacy_messages_from_returned_config(self, context): - """Calling ``judge_config()`` directly (no variables) still strips legacy messages. - - This is the regression for the bug where legacy messages leaked through - the public ``judge_config()`` entry point because reserved-variable - markers were only injected in ``_create_judge_instance``. - """ + def test_judge_config_preserves_legacy_messages(self, context): + """Calling judge_config() must return all config messages, including legacy template ones.""" td = TestData.data_source() td.update( td.flag('legacy-judge') @@ -666,9 +599,10 @@ def test_judge_config_strips_legacy_messages_from_returned_config(self, context) assert result.enabled is True assert result.messages is not None - assert len(result.messages) == 1 + assert len(result.messages) == 3 assert result.messages[0].role == 'system' - assert result.messages[0].content == 'You are a judge.' + assert result.messages[1].role == 'assistant' + assert result.messages[2].role == 'user' def test_judge_config_passes_user_variables_to_template(self, context): """User variables are still interpolated into the system message.""" @@ -696,7 +630,7 @@ def test_judge_config_passes_user_variables_to_template(self, context): assert result.messages[0].content == 'You are a strict judge.' def test_judge_config_warns_on_reserved_variables(self, context): - """``_judge_config`` warns when callers pass reserved variable names.""" + """_judge_config warns when callers pass reserved variable names.""" td = TestData.data_source() td.update( td.flag('judge-config')