From 2e40f00922a560139fa2260a382a8c51bb48fb60 Mon Sep 17 00:00:00 2001
From: Andrew Klatzke <aklatzke@launchdarkly.com>
Date: Fri, 26 Jun 2026 12:58:34 -0800
Subject: [PATCH] chore: reverts judge message stripping

---
 .../ldai_langchain/langchain_model_runner.py  |  36 ++-
 .../langchain_runner_factory.py               |   3 +-
 .../tests/test_langchain_provider.py          |  29 ++-
 .../src/ldai_openai/openai_model_runner.py    |  35 ++-
 .../src/ldai_openai/openai_runner_factory.py  |   5 +-
 .../tests/test_openai_provider.py             |  32 +++
 packages/sdk/server-ai/src/ldai/client.py     |  16 +-
 .../sdk/server-ai/src/ldai/judge/__init__.py  |  79 +++---
 .../sdk/server-ai/src/ldai/managed_model.py   |  31 ++-
 .../server-ai/src/ldai/providers/runner.py    |   7 +-
 packages/sdk/server-ai/tests/test_judge.py    | 226 +++++++-----------
 11 files changed, 254 insertions(+), 245 deletions(-)

diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py
index bc90f425..c7106f87 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py
@@ -2,7 +2,7 @@
 
 from langchain_core.chat_history import InMemoryChatMessageHistory
 from langchain_core.language_models.chat_models import BaseChatModel
-from langchain_core.messages import BaseMessage, HumanMessage
+from langchain_core.messages import BaseMessage
 from ldai import LDMessage, log
 from ldai.providers.runner import Runner
 from ldai.providers.types import LDAIMetrics, RunnerResult
@@ -28,13 +28,10 @@ class LangChainModelRunner(Runner):
     def __init__(
         self,
         llm: BaseChatModel,
-        config_messages: Optional[List[LDMessage]] = None,
         multi_turn: bool = True,
     ):
         self._llm = llm
-        self._chat_history = InMemoryChatMessageHistory(
-            messages=cast(List[BaseMessage], convert_messages_to_langchain(config_messages or []))
-        )
+        self._chat_history = InMemoryChatMessageHistory()
         self._multi_turn = multi_turn
 
     def get_llm(self) -> BaseChatModel:
@@ -47,32 +44,53 @@ def get_llm(self) -> BaseChatModel:
 
     async def run(
         self,
-        input: str,
+        input: Any,
         output_type: Optional[Dict[str, Any]] = None,
     ) -> RunnerResult:
         """
         Run the LangChain model with the given input.
 
-        :param input: A string prompt
+        :param input: A string prompt or a list of :class:`LDMessage` objects.
+            When a list is provided it is used as the complete message set.
+            When a string is provided it is appended to the conversation history.
         :param output_type: Optional JSON schema dict requesting structured output.
             When provided, ``parsed`` on the returned :class:`RunnerResult` is
             populated with the parsed JSON document.
         :return: :class:`RunnerResult` containing ``content``, ``metrics``,
             ``raw`` and (when ``output_type`` is set) ``parsed``.
         """
-        langchain_messages = self._chat_history.messages + [HumanMessage(content=input)]
+        coerced = self._coerce_input(input)
+
+        if isinstance(input, list):
+            langchain_messages = cast(List[BaseMessage], convert_messages_to_langchain(coerced))
+        else:
+            langchain_messages = self._chat_history.messages + cast(
+                List[BaseMessage], convert_messages_to_langchain(coerced)
+            )
 
         if output_type is not None:
             result = await self._run_structured(langchain_messages, output_type)
         else:
             result = await self._run_completion(langchain_messages)
 
-        if result.metrics.success and result.content and self._multi_turn:
+        if result.metrics.success and result.content and self._multi_turn and isinstance(input, str):
             self._chat_history.add_user_message(input)
             self._chat_history.add_ai_message(result.content)
 
         return result
 
+    # convert_messages_to_langchain only accepts List[LDMessage]; _coerce_input
+    # normalizes a bare string to [LDMessage(role='user', ...)] before that step.
+    @staticmethod
+    def _coerce_input(input: Any) -> List[LDMessage]:
+        if isinstance(input, str):
+            return [LDMessage(role='user', content=input)]
+        if isinstance(input, list):
+            return input
+        raise TypeError(
+            f"Unsupported input type for LangChainModelRunner.run: {type(input).__name__}"
+        )
+
     async def _run_completion(self, messages: List[BaseMessage]) -> RunnerResult:
         try:
             response: BaseMessage = await self._llm.ainvoke(messages)
diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py
index 0a7005a2..7328f669 100644
--- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py
+++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py
@@ -72,5 +72,4 @@ def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> LangCha
         :return: LangChainModelRunner ready to invoke the model
         """
         llm = create_langchain_model(config)
-        config_messages = list(getattr(config, 'messages', None) or [])
-        return LangChainModelRunner(llm, config_messages, multi_turn=multi_turn)
+        return LangChainModelRunner(llm, multi_turn=multi_turn)
diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
index 0b8c0f14..f024c7e5 100644
--- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
+++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py
@@ -332,23 +332,22 @@ async def test_does_not_accumulate_history_on_failed_call(self, mock_llm):
         assert second_call_messages[0].content == 'Try again'
 
     @pytest.mark.asyncio
-    async def test_prepends_config_messages_before_history(self, mock_llm):
-        """Should send config messages before history on every call."""
-        mock_llm.ainvoke = AsyncMock(side_effect=[
-            AIMessage(content='Answer 1'),
-            AIMessage(content='Answer 2'),
-        ])
-        config_messages = [LDMessage(role='system', content='You are helpful.')]
-        provider = LangChainModelRunner(mock_llm, config_messages=config_messages)
+    async def test_passes_list_input_directly_without_prepending_history(self, mock_llm):
+        """When a list[LDMessage] is passed, it is used as-is without prepending chat history."""
+        mock_llm.ainvoke = AsyncMock(return_value=AIMessage(content='Answer'))
+        provider = LangChainModelRunner(mock_llm)
 
-        await provider.run('Q1')
-        await provider.run('Q2')
+        messages = [
+            LDMessage(role='system', content='You are helpful.'),
+            LDMessage(role='user', content='Q1'),
+        ]
+        await provider.run(messages)
 
-        second_call_messages = mock_llm.ainvoke.call_args_list[1][0][0]
-        assert second_call_messages[0].content == 'You are helpful.'
-        assert second_call_messages[1].content == 'Q1'
-        assert second_call_messages[2].content == 'Answer 1'
-        assert second_call_messages[3].content == 'Q2'
+        first_call_messages = mock_llm.ainvoke.call_args_list[0][0][0]
+        assert len(first_call_messages) == 2
+        assert first_call_messages[0].content == 'You are helpful.'
+        assert first_call_messages[1].content == 'Q1'
+        assert len(provider._chat_history.messages) == 0
 
 
 class TestRunStructured:
diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
index 358f34b3..dcd5c70c 100644
--- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
+++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py
@@ -28,44 +28,63 @@ def __init__(
         client: AsyncOpenAI,
         model_name: str,
         parameters: Dict[str, Any],
-        config_messages: Optional[List[LDMessage]] = None,
         multi_turn: bool = True,
     ):
         self._client = client
         self._model_name = model_name
         self._parameters = parameters
-        self._history: List[LDMessage] = list(config_messages or [])
+        self._history: List[LDMessage] = []
         self._multi_turn = multi_turn
 
     async def run(
         self,
-        input: str,
+        input: Any,
         output_type: Optional[Dict[str, Any]] = None,
     ) -> RunnerResult:
         """
         Run the OpenAI model with the given input.
 
-        :param input: A string prompt
+        :param input: A string prompt or a list of :class:`LDMessage` objects.
+            When a list is provided it is used as the complete message set.
+            When a string is provided it is appended to the conversation history.
         :param output_type: Optional JSON schema dict requesting structured output.
             When provided, ``parsed`` on the returned :class:`RunnerResult` is
             populated with the parsed JSON document.
         :return: :class:`RunnerResult` containing ``content``, ``metrics``,
             ``raw`` and (when ``output_type`` is set) ``parsed``.
         """
-        user_message = LDMessage(role='user', content=input)
-        messages = self._history + [user_message]
+        try:
+            coerced = self._coerce_input(input)
+        except TypeError as error:
+            log.warning(f'OpenAI model runner received unsupported input type: {error}')
+            return RunnerResult(content='', metrics=LDAIMetrics(success=False, usage=None))
+
+        if isinstance(input, list):
+            messages = coerced
+        else:
+            messages = self._history + coerced
 
         if output_type is not None:
             result = await self._run_structured(messages, output_type)
         else:
             result = await self._run_completion(messages)
 
-        if result.metrics.success and result.content and self._multi_turn:
-            self._history.append(user_message)
+        if result.metrics.success and result.content and self._multi_turn and isinstance(input, str):
+            self._history.append(LDMessage(role='user', content=input))
             self._history.append(LDMessage(role='assistant', content=result.content))
 
         return result
 
+    @staticmethod
+    def _coerce_input(input: Any) -> List[LDMessage]:
+        if isinstance(input, str):
+            return [LDMessage(role='user', content=input)]
+        if isinstance(input, list):
+            return input
+        raise TypeError(
+            f"Unsupported input type for OpenAIModelRunner.run: {type(input).__name__}"
+        )
+
     async def _run_completion(self, messages: List[LDMessage]) -> RunnerResult:
         try:
             response = await self._client.chat.completions.create(
diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py
index 275c6ecc..8dc76808 100644
--- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py
+++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py
@@ -103,10 +103,7 @@ def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> OpenAIM
         tool_defs = parameters.pop('tools', None) or []
         if tool_defs:
             parameters['tools'] = normalize_tool_types(tool_defs)
-        config_messages = list(getattr(config, 'messages', None) or [])
-        return OpenAIModelRunner(
-            self._client, model_name, parameters, config_messages, multi_turn=multi_turn
-        )
+        return OpenAIModelRunner(self._client, model_name, parameters, multi_turn=multi_turn)
 
     def get_client(self) -> AsyncOpenAI:
         """
diff --git a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
index 195149b0..88c5adea 100644
--- a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
+++ b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py
@@ -293,6 +293,38 @@ def make_response(text: str):
 
         assert len(provider._history) == baseline_len + 4
 
+    @pytest.mark.asyncio
+    async def test_passes_list_input_directly_without_prepending_history(self, mock_client):
+        """When a list[LDMessage] is passed, it is used as-is without history prepend."""
+        from ldai import LDMessage as _LDMsg
+
+        def make_response(text: str):
+            r = MagicMock()
+            r.context_wrapper = None
+            r.choices = [MagicMock()]
+            r.choices[0].message = MagicMock()
+            r.choices[0].message.content = text
+            r.usage = None
+            return r
+
+        mock_client.chat = MagicMock()
+        mock_client.chat.completions = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=make_response('Answer'))
+
+        provider = OpenAIModelRunner(mock_client, 'gpt-4o', {})
+        messages = [
+            _LDMsg(role='system', content='You are helpful.'),
+            _LDMsg(role='user', content='Q1'),
+        ]
+        await provider.run(messages)
+
+        call_messages = mock_client.chat.completions.create.call_args.kwargs['messages']
+        assert call_messages == [
+            {'role': 'system', 'content': 'You are helpful.'},
+            {'role': 'user', 'content': 'Q1'},
+        ]
+        assert provider._history == []
+
     @pytest.mark.asyncio
     async def test_does_not_accumulate_history_on_failed_call(self, mock_client):
         """Should not add to history when the call fails."""
diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py
index 623ff825..a11992f6 100644
--- a/packages/sdk/server-ai/src/ldai/client.py
+++ b/packages/sdk/server-ai/src/ldai/client.py
@@ -8,7 +8,7 @@
 from ldai import log
 from ldai.agent_graph import AgentGraphDefinition
 from ldai.evaluator import Evaluator
-from ldai.judge import Judge, _strip_legacy_judge_messages
+from ldai.judge import Judge
 from ldai.managed_agent import ManagedAgent
 from ldai.managed_agent_graph import ManagedAgentGraph
 from ldai.managed_model import ManagedModel
@@ -203,17 +203,9 @@ def _judge_config(
                     "The variable 'response_to_evaluate' is reserved by the judge and will be ignored."
                 )
 
-        # Re-inject the reserved variables as their literal placeholders so they
-        # survive Mustache interpolation in ``__evaluate``.  Without this, legacy
-        # templates like ``{{message_history}}`` get rendered to empty strings and
-        # ``_strip_legacy_judge_messages`` below cannot detect them.
-        extended_variables = dict(variables) if variables else {}
-        extended_variables['message_history'] = '{{message_history}}'
-        extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
-
         (model, provider, messages, instructions,
          tracker_factory, enabled, judge_configuration, variation) = self.__evaluate(
-            key, context, default.to_dict(), extended_variables
+            key, context, default.to_dict(), variables
         )
 
         def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]:
@@ -233,10 +225,6 @@ def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]:
 
         evaluation_metric_key = _extract_evaluation_metric_key(variation)
 
-        # strip legacy judge template messages before creating config
-        if messages:
-            messages = _strip_legacy_judge_messages(messages)
-
         config = AIJudgeConfig(
             key=key,
             enabled=bool(enabled),
diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py
index 6ae7d90d..8ab654b6 100644
--- a/packages/sdk/server-ai/src/ldai/judge/__init__.py
+++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -3,6 +3,8 @@
 import random
 from typing import Any, Dict, List, Optional, Tuple
 
+import chevron
+
 from ldai import log
 from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
 from ldai.models import AIJudgeConfig, LDMessage
@@ -10,30 +12,6 @@
 from ldai.providers.types import JudgeResult, RunnerResult
 
 
-def _strip_legacy_judge_messages(messages: List[LDMessage]) -> List[LDMessage]:
-    """
-    Remove legacy judge template messages from a message list.
-
-    Strips any non-system message whose content contains ``{{message_history}}``
-    or ``{{response_to_evaluate}}``.  These were used by older judge configs to
-    indicate where the SDK should interpolate the evaluated conversation; new
-    configs omit them entirely and rely on the string input built by
-    :meth:`Judge._build_evaluation_input`.
-
-    :param messages: The raw message list from the judge AI config.
-    :return: A new list with legacy template messages removed.
-    """
-    result = []
-    for msg in messages:
-        if msg.role != 'system' and (
-            '{{message_history}}' in msg.content
-            or '{{response_to_evaluate}}' in msg.content
-        ):
-            continue
-        result.append(msg)
-    return result
-
-
 class Judge:
     """
     Judge implementation that handles evaluation functionality and conversation management.
@@ -87,6 +65,11 @@ async def evaluate(
                 judge_result.error_message = 'Judge configuration is missing required evaluationMetricKey'
                 return judge_result
 
+            if not self._ai_config.messages:
+                log.warning('Judge configuration must include messages')
+                judge_result.error_message = 'Judge configuration must include messages'
+                return judge_result
+
             if random.random() > effective_rate:
                 log.debug(f'Judge evaluation skipped due to sampling rate: {effective_rate}')
                 return judge_result
@@ -94,12 +77,12 @@ async def evaluate(
             judge_result.sampled = True
 
             tracker = self._ai_config.create_tracker()
-            evaluation_input = self._build_evaluation_input(input_text, output_text)
+            messages = self._construct_evaluation_messages(input_text, output_text)
             assert self._evaluation_response_structure is not None
 
             response = await tracker.track_metrics_of_async(
                 lambda result: result.metrics,
-                lambda: self._model_runner.run(evaluation_input, output_type=self._evaluation_response_structure),
+                lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure),
             )
 
             if response.parsed is None:
@@ -132,19 +115,13 @@ async def evaluate_messages(
         """
         Evaluates an AI response from chat messages and response.
 
-        The conversation is rendered for the judge by joining each message as
-        ``"{role}: {content}"`` on newlines, preserving who said what so the
-        judge can distinguish user turns from assistant turns.
-
         :param messages: Array of messages representing the conversation history
         :param response: The runner result to be evaluated
         :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed.
             When ``None`` (the default), falls back to ``self.sample_rate``.
         :return: The result of the judge evaluation.
         """
-        input_text = (
-            '\n'.join(f'{msg.role}: {msg.content}' for msg in messages) if messages else ''
-        )
+        input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
         output_text = response.content
 
         return await self.evaluate(input_text, output_text, sampling_ratio)
@@ -165,8 +142,40 @@ def get_model_runner(self) -> Runner:
         """
         return self._model_runner
 
-    def _build_evaluation_input(self, input_text: str, output_text: str) -> str:
-        return f"MESSAGE HISTORY:\n{input_text}\n\nRESPONSE TO EVALUATE:\n{output_text}"
+    def _construct_evaluation_messages(self, input_text: str, output_text: str) -> List[LDMessage]:
+        """
+        Constructs evaluation messages by combining judge's config messages with input/output.
+
+        Iterates all messages from the judge's AI config and interpolates
+        ``{{message_history}}`` with ``input_text`` and ``{{response_to_evaluate}}``
+        with ``output_text`` using Mustache templating.
+
+        :param input_text: The input text (conversation history)
+        :param output_text: The output text to evaluate
+        :return: List of messages for evaluation
+        """
+        if not self._ai_config.messages:
+            return []
+
+        messages: List[LDMessage] = []
+        for msg in self._ai_config.messages:
+            content = self._interpolate_message(msg.content, {
+                'message_history': input_text,
+                'response_to_evaluate': output_text,
+            })
+            messages.append(LDMessage(role=msg.role, content=content))
+
+        return messages
+
+    def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
+        """
+        Interpolates message content with variables using Mustache templating.
+
+        :param content: The message content template
+        :param variables: Variables to interpolate
+        :return: Interpolated message content
+        """
+        return chevron.render(content, variables)
 
     def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]:
         """
diff --git a/packages/sdk/server-ai/src/ldai/managed_model.py b/packages/sdk/server-ai/src/ldai/managed_model.py
index 586b8c16..3eab7929 100644
--- a/packages/sdk/server-ai/src/ldai/managed_model.py
+++ b/packages/sdk/server-ai/src/ldai/managed_model.py
@@ -2,7 +2,7 @@
 from typing import List
 
 from ldai import log
-from ldai.models import AICompletionConfig
+from ldai.models import AICompletionConfig, LDMessage
 from ldai.providers.runner import Runner
 from ldai.providers.types import JudgeResult, ManagedResult, RunnerResult
 from ldai.tracker import LDAIConfigTracker
@@ -12,10 +12,9 @@ class ManagedModel:
     """
     LaunchDarkly managed wrapper for AI model invocations.
 
-    Holds a Runner. Handles judge evaluation dispatch and tracking
-    automatically via ``create_tracker()``. Conversation history is
-    managed by the runner. Obtain an instance via
-    ``LDAIClient.create_model()``.
+    Holds a Runner. Handles conversation management, judge evaluation
+    dispatch, and tracking automatically via ``create_tracker()``.
+    Obtain an instance via ``LDAIClient.create_model()``.
     """
 
     def __init__(
@@ -25,13 +24,15 @@ def __init__(
     ):
         self._ai_config = ai_config
         self._model_runner = model_runner
+        self._messages: List[LDMessage] = []
 
     async def run(self, prompt: str) -> ManagedResult:
         """
         Run the model with a prompt string.
 
-        Delegates to the runner, then dispatches judge evaluations and
-        records tracking metrics.
+        Appends the prompt to the conversation history, prepends any
+        system messages from the config, delegates to the runner, and
+        appends the response to the history.
 
         :param prompt: The user prompt to send to the model
         :return: ManagedResult containing the model's response, metric summary,
@@ -39,12 +40,24 @@ async def run(self, prompt: str) -> ManagedResult:
         """
         tracker = self._ai_config.create_tracker()
 
+        user_message = LDMessage(role='user', content=prompt)
+        self._messages.append(user_message)
+
+        config_messages = self._ai_config.messages or []
+        all_messages = config_messages + self._messages
+
         result: RunnerResult = await tracker.track_metrics_of_async(
             lambda r: r.metrics,
-            lambda: self._model_runner.run(prompt),
+            lambda: self._model_runner.run(all_messages),
         )
 
-        evaluations_task = await self._track_judge_results(tracker, prompt, result.content)
+        assistant_message = LDMessage(role='assistant', content=result.content)
+
+        input_text = '\r\n'.join(m.content for m in self._messages) if self._messages else ''
+
+        evaluations_task = await self._track_judge_results(tracker, input_text, result.content)
+
+        self._messages.append(assistant_message)
 
         return ManagedResult(
             content=result.content,
diff --git a/packages/sdk/server-ai/src/ldai/providers/runner.py b/packages/sdk/server-ai/src/ldai/providers/runner.py
index 36f47915..247ee2ef 100644
--- a/packages/sdk/server-ai/src/ldai/providers/runner.py
+++ b/packages/sdk/server-ai/src/ldai/providers/runner.py
@@ -16,13 +16,14 @@ class Runner(Protocol):
 
     async def run(
         self,
-        input: str,
+        input: Any,
         output_type: Optional[Dict[str, Any]] = None,
     ) -> RunnerResult:
         """
-        Execute the runner with the given input string.
+        Execute the runner with the given input.
 
-        :param input: The string input to the runner.
+        :param input: The input to the runner. May be a string prompt or a
+            list of :class:`~ldai.models.LDMessage` objects.
         :param output_type: Optional JSON schema for structured output.
         :return: RunnerResult containing content, metrics, raw, and parsed fields.
         """
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py
index b1f88fa2..8c0f60e5 100644
--- a/packages/sdk/server-ai/tests/test_judge.py
+++ b/packages/sdk/server-ai/tests/test_judge.py
@@ -8,7 +8,7 @@
 from ldclient.integrations.test_data import TestData
 
 from ldai import LDAIClient
-from ldai.judge import Judge, _strip_legacy_judge_messages
+from ldai.judge import Judge
 from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
 from ldai.models import (
     AIJudgeConfig,
@@ -113,69 +113,6 @@ def judge_config_without_messages(tracker) -> AIJudgeConfig:
     return _make_judge_config(messages=None, tracker=tracker)
 
 
-class TestStripLegacyJudgeMessages:
-    """Tests for the _strip_legacy_judge_messages helper."""
-
-    def test_strips_assistant_message_with_message_history(self):
-        """Non-system messages containing {{message_history}} should be removed."""
-        messages = [
-            LDMessage(role='system', content='You are a judge.'),
-            LDMessage(role='assistant', content='Here is the history: {{message_history}}'),
-        ]
-        result = _strip_legacy_judge_messages(messages)
-        assert len(result) == 1
-        assert result[0].role == 'system'
-
-    def test_strips_user_message_with_response_to_evaluate(self):
-        """Non-system messages containing {{response_to_evaluate}} should be removed."""
-        messages = [
-            LDMessage(role='system', content='You are a judge.'),
-            LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'),
-        ]
-        result = _strip_legacy_judge_messages(messages)
-        assert len(result) == 1
-        assert result[0].role == 'system'
-
-    def test_strips_all_legacy_messages(self):
-        """All non-system template messages should be stripped from a typical legacy config."""
-        messages = [
-            LDMessage(role='system', content='You are a judge.'),
-            LDMessage(role='assistant', content='{{message_history}}'),
-            LDMessage(role='user', content='{{response_to_evaluate}}'),
-        ]
-        result = _strip_legacy_judge_messages(messages)
-        assert len(result) == 1
-        assert result[0].role == 'system'
-
-    def test_does_not_strip_system_message_containing_template_vars(self):
-        """System messages are never stripped, even if they contain template variable names."""
-        messages = [
-            LDMessage(role='system', content='Judge using {{message_history}} and {{response_to_evaluate}}.'),
-        ]
-        result = _strip_legacy_judge_messages(messages)
-        assert len(result) == 1
-        assert result[0].role == 'system'
-
-    def test_does_not_strip_non_template_messages(self):
-        """Non-system messages without template variables are left untouched."""
-        messages = [
-            LDMessage(role='system', content='You are a judge.'),
-            LDMessage(role='user', content='This is a regular message.'),
-        ]
-        result = _strip_legacy_judge_messages(messages)
-        assert len(result) == 2
-
-    def test_returns_empty_list_for_empty_input(self):
-        """An empty input list should return an empty list."""
-        assert _strip_legacy_judge_messages([]) == []
-
-    def test_new_style_config_system_only_unchanged(self):
-        """A new-style config with only a system message passes through unchanged."""
-        messages = [LDMessage(role='system', content='You are a judge.')]
-        result = _strip_legacy_judge_messages(messages)
-        assert result == messages
-
-
 class TestJudgeInitialization:
     """Tests for Judge initialization."""
 
@@ -225,40 +162,36 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing(
         mock_runner.run.assert_not_called()
 
     @pytest.mark.asyncio
-    async def test_evaluate_succeeds_when_messages_is_none(
-        self, judge_config_without_messages: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
+    async def test_evaluate_returns_failure_when_messages_is_none(
+        self, judge_config_without_messages: AIJudgeConfig, mock_runner
     ):
-        """Evaluate should proceed (not error early) when messages is None."""
-        mock_response = RunnerResult(
-            content='',
-            metrics=LDAIMetrics(success=True),
-            parsed={'score': 0.7, 'reasoning': 'Acceptable response.'},
-        )
-        mock_runner.run.return_value = mock_response
-        tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
-
-        config = _make_judge_config(messages=None, tracker=tracker)
-        judge = Judge(config, mock_runner)
+        """Evaluate should return an error result when messages is None."""
+        judge = Judge(judge_config_without_messages, mock_runner)
 
         result = await judge.evaluate("input text", "output text")
 
         assert isinstance(result, JudgeResult)
-        assert result.sampled is True
+        assert result.success is False
+        assert result.sampled is False
+        assert result.error_message is not None
+        mock_runner.run.assert_not_called()
 
     @pytest.mark.asyncio
-    async def test_evaluate_passes_string_input_to_runner(
+    async def test_evaluate_passes_list_of_messages_to_runner(
         self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
     ):
-        """runner.run() should receive the formatted string, NOT a message list."""
+        """runner.run() should receive a List[LDMessage], not a plain string."""
         mock_response = RunnerResult(
             content='',
             metrics=LDAIMetrics(success=True),
             parsed={'score': 0.85, 'reasoning': 'Good answer.'},
         )
         mock_runner.run.return_value = mock_response
-        tracker.track_metrics_of_async = AsyncMock(
-            side_effect=lambda _metric_fn, fn: fn()
-        )
+
+        async def _await_fn(_metric_fn, fn):
+            return await fn()
+
+        tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn)
 
         judge = Judge(judge_config_with_key, mock_runner)
         await judge.evaluate("What is AI?", "AI is artificial intelligence.")
@@ -266,67 +199,82 @@ async def test_evaluate_passes_string_input_to_runner(
         mock_runner.run.assert_called_once()
         call_args = mock_runner.run.call_args
         input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input')
-        assert isinstance(input_arg, str)
-        assert "MESSAGE HISTORY:\nWhat is AI?" in input_arg
-        assert "RESPONSE TO EVALUATE:\nAI is artificial intelligence." in input_arg
+        assert isinstance(input_arg, list)
+        assert all(isinstance(m, LDMessage) for m in input_arg)
 
     @pytest.mark.asyncio
-    async def test_evaluate_string_input_format(
-        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
+    async def test_evaluate_interpolates_message_history_and_response(
+        self, tracker: LDAIConfigTracker, mock_runner
     ):
-        """runner.run() should receive the exact expected string format."""
+        """Config messages with {{message_history}} and {{response_to_evaluate}} are interpolated."""
+        config = _make_judge_config(
+            messages=[
+                LDMessage(role='system', content='You are a judge.'),
+                LDMessage(role='assistant', content='{{message_history}}'),
+                LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'),
+            ],
+            tracker=tracker,
+        )
+
         mock_response = RunnerResult(
             content='',
             metrics=LDAIMetrics(success=True),
-            parsed={'score': 0.9, 'reasoning': 'Correct.'},
+            parsed={'score': 0.75, 'reasoning': 'Mostly relevant.'},
         )
         mock_runner.run.return_value = mock_response
-        tracker.track_metrics_of_async = AsyncMock(
-            side_effect=lambda _metric_fn, fn: fn()
-        )
 
-        judge = Judge(judge_config_with_key, mock_runner)
-        await judge.evaluate("hello", "world")
+        async def _await_fn(_metric_fn, fn):
+            return await fn()
+
+        tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn)
+
+        judge = Judge(config, mock_runner)
+        await judge.evaluate("user asked this", "AI said that")
 
         call_args = mock_runner.run.call_args
-        input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input')
-        expected = "MESSAGE HISTORY:\nhello\n\nRESPONSE TO EVALUATE:\nworld"
-        assert input_arg == expected
+        messages_arg: List[LDMessage] = call_args[0][0] if call_args[0] else call_args[1].get('input')
+
+        assert len(messages_arg) == 3
+        assert messages_arg[0].role == 'system'
+        assert messages_arg[0].content == 'You are a judge.'
+        assert messages_arg[1].role == 'assistant'
+        assert messages_arg[1].content == 'user asked this'
+        assert messages_arg[2].role == 'user'
+        assert messages_arg[2].content == 'Evaluate: AI said that'
 
     @pytest.mark.asyncio
-    async def test_evaluate_legacy_config_passes_string_input_to_runner(
+    async def test_evaluate_sends_all_config_messages_to_runner(
         self, tracker: LDAIConfigTracker, mock_runner
     ):
-        """
-        Judge built directly with legacy messages (bypassing the client) still passes
-        a formatted string to the runner.  Legacy message stripping is the client's
-        responsibility; the Judge itself does not strip.
-        """
-        legacy_messages = [
-            LDMessage(role='system', content='You are a strict judge.'),
-            LDMessage(role='assistant', content='{{message_history}}'),
-            LDMessage(role='user', content='Evaluate: {{response_to_evaluate}}'),
-        ]
-        config = _make_judge_config(messages=legacy_messages, tracker=tracker)
+        """All config messages (including non-system) must be passed to the runner."""
+        config = _make_judge_config(
+            messages=[
+                LDMessage(role='system', content='You are a judge.'),
+                LDMessage(role='assistant', content='{{message_history}}'),
+                LDMessage(role='user', content='{{response_to_evaluate}}'),
+            ],
+            tracker=tracker,
+        )
 
         mock_response = RunnerResult(
             content='',
             metrics=LDAIMetrics(success=True),
-            parsed={'score': 0.75, 'reasoning': 'Mostly relevant.'},
+            parsed={'score': 0.8, 'reasoning': 'Good.'},
         )
         mock_runner.run.return_value = mock_response
-        tracker.track_metrics_of_async = AsyncMock(
-            side_effect=lambda _metric_fn, fn: fn()
-        )
+
+        async def _await_fn(_metric_fn, fn):
+            return await fn()
+
+        tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn)
 
         judge = Judge(config, mock_runner)
         await judge.evaluate("input", "output")
 
         call_args = mock_runner.run.call_args
-        input_arg = call_args[0][0] if call_args[0] else call_args[1].get('input')
-        assert isinstance(input_arg, str)
-        assert "MESSAGE HISTORY:\ninput" in input_arg
-        assert "RESPONSE TO EVALUATE:\noutput" in input_arg
+        messages_arg: List[LDMessage] = call_args[0][0] if call_args[0] else call_args[1].get('input')
+
+        assert len(messages_arg) == 3
 
     @pytest.mark.asyncio
     async def test_evaluate_success_with_valid_response(
@@ -545,10 +493,10 @@ async def test_evaluate_messages_calls_evaluate(
         assert tracker.track_metrics_of_async.called
 
     @pytest.mark.asyncio
-    async def test_evaluate_messages_preserves_roles_in_input(
+    async def test_evaluate_messages_joins_content_without_role(
         self, judge_config_with_key: AIJudgeConfig, mock_runner
     ):
-        """evaluate_messages must forward role-prefixed lines to evaluate()."""
+        """evaluate_messages must forward message content (joined by CRLF) to evaluate()."""
         messages = [
             LDMessage(role='user', content='hi'),
             LDMessage(role='assistant', content='hello'),
@@ -561,7 +509,7 @@ async def test_evaluate_messages_preserves_roles_in_input(
 
         mock_evaluate.assert_called_once()
         args, _ = mock_evaluate.call_args
-        assert args[0] == 'user: hi\nassistant: hello'
+        assert args[0] == 'hi\r\nhello'
         assert args[1] == 'reply'
 
 
@@ -580,9 +528,6 @@ async def test_two_evaluations_do_not_contaminate_history(
         self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker
     ):
         """A judge bound to a non-multi-turn runner must run the same baseline twice."""
-        # Stand in a fake runner that records the history it would expose to the
-        # LLM at the moment run() is called. With multi_turn=False the recorded
-        # baseline should be identical across calls.
         seen_baselines: List[List[LDMessage]] = []
 
         class _FakeRunner:
@@ -591,7 +536,6 @@ def __init__(self):
                 self._multi_turn = False
 
             async def run(self, input, output_type=None):  # type: ignore[no-untyped-def]
-                # Snapshot history as seen at call time.
                 seen_baselines.append(list(self._history))
                 return RunnerResult(
                     content='ok',
@@ -611,22 +555,16 @@ async def _await_fn(_metric_fn, fn):
         await judge.evaluate('second input', 'second output')
 
         assert len(seen_baselines) == 2
-        # Both runs see the same baseline (empty in this fake; the point is
-        # they're equal — no contamination from the prior turn).
         assert seen_baselines[0] == seen_baselines[1]
-        # And the runner's history never grew because multi_turn is False.
         assert runner._history == []
 
 
-class TestJudgeConfigStripsLegacyMessages:
-    """Tests for ``LDAIClient.judge_config()`` legacy-message stripping.
+class TestJudgeConfigMessages:
+    """Tests for LDAIClient.judge_config() preserving all config messages.
 
-    Both ``LDAIClient.judge_config()`` (the direct public API) and
-    ``LDAIClient.create_judge()`` (the wrapper API) reach the same internal
-    ``_judge_config()`` method.  ``_judge_config()`` is responsible for
-    injecting ``message_history``/``response_to_evaluate`` markers so they
-    survive Mustache rendering, then stripping any legacy template messages
-    before returning the config.
+    judge_config() must NOT strip legacy template messages — the judge itself
+    is responsible for interpolating {{message_history}} and
+    {{response_to_evaluate}} when it constructs the evaluation message list.
     """
 
     @pytest.fixture
@@ -637,13 +575,8 @@ def _make_client(self, td: TestData) -> LDAIClient:
         config = Config('sdk-key', update_processor_class=td, send_events=False)
         return LDAIClient(LDClient(config=config))
 
-    def test_judge_config_strips_legacy_messages_from_returned_config(self, context):
-        """Calling ``judge_config()`` directly (no variables) still strips legacy messages.
-
-        This is the regression for the bug where legacy messages leaked through
-        the public ``judge_config()`` entry point because reserved-variable
-        markers were only injected in ``_create_judge_instance``.
-        """
+    def test_judge_config_preserves_legacy_messages(self, context):
+        """Calling judge_config() must return all config messages, including legacy template ones."""
         td = TestData.data_source()
         td.update(
             td.flag('legacy-judge')
@@ -666,9 +599,10 @@ def test_judge_config_strips_legacy_messages_from_returned_config(self, context)
 
         assert result.enabled is True
         assert result.messages is not None
-        assert len(result.messages) == 1
+        assert len(result.messages) == 3
         assert result.messages[0].role == 'system'
-        assert result.messages[0].content == 'You are a judge.'
+        assert result.messages[1].role == 'assistant'
+        assert result.messages[2].role == 'user'
 
     def test_judge_config_passes_user_variables_to_template(self, context):
         """User variables are still interpolated into the system message."""
@@ -696,7 +630,7 @@ def test_judge_config_passes_user_variables_to_template(self, context):
         assert result.messages[0].content == 'You are a strict judge.'
 
     def test_judge_config_warns_on_reserved_variables(self, context):
-        """``_judge_config`` warns when callers pass reserved variable names."""
+        """_judge_config warns when callers pass reserved variable names."""
         td = TestData.data_source()
         td.update(
             td.flag('judge-config')