From c311b306a2fabd8b8aeb0312df8ba1426f04521b Mon Sep 17 00:00:00 2001
From: Jeff Larson <thejefflarson@gmail.com>
Date: Mon, 29 Jun 2026 15:43:41 -0700
Subject: [PATCH] fix(adjudicator): refute exploitable verdicts with no
 evidence anchor + clarify runtime/secret evidence in the prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An internet-facing watcher-server Pod came back `exploitable` ("connects to
exposed secrets which are mounted into the pod…") — a false breach. Its evidence:
CVEs (none), no exposed secret baked into the image, runtime = three benign
NetworkConnections to its own DB/metrics. The 1B judge fabricated evidence by
treating benign connections as a live signal and conflating reaching a secret/…
objective with an exposed secret in the image. Correct verdict: refuted.

Add guard_unsupported_exploitable, the symmetric counterpart to guard_fabricated_cve:
it downgrades an Exploitable verdict to Refuted ONLY when ALL THREE exploitation
anchors are absent — empty CVE list, no exposed-secret finding, and no
corroborating runtime behavior (Behavior::is_alert() or exec_class::notable_exec,
the engine's existing definition; benign Network/File/Library/SecretRead are NOT
corroborating). Any anchor present leaves the model's call untouched. Wired after
guard_fabricated_cve in model_call; exposed-secret presence read from the same
entry_findings source the prompt uses.

Also two surgical prompt clarifications: a workload's own activity (network
connections, file reads, library loads, reading its own mounted secrets) is NOT a
live signal — only an ALERT or hands-on-keyboard action is; and reaching a
secret/… objective is NOT an exposed secret baked into the image. This shifts the
verdict fingerprint, so entries re-judge once.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VtjoJttCvBY4dzCoE4f9vP
---
 engine/src/engine/reason/adjudicate/guards.rs |  53 ++++++-
 .../engine/reason/adjudicate/model_call.rs    |  23 ++-
 engine/src/engine/reason/adjudicate/prompt.rs |   4 +-
 .../engine/reason/adjudicate/tests/group_1.rs | 142 +++++++++++++++++-
 .../engine/reason/adjudicate/tests/group_3.rs |  45 +++++-
 5 files changed, 257 insertions(+), 10 deletions(-)

diff --git a/engine/src/engine/reason/adjudicate/guards.rs b/engine/src/engine/reason/adjudicate/guards.rs
index 881000f..bcdf234 100644
--- a/engine/src/engine/reason/adjudicate/guards.rs
+++ b/engine/src/engine/reason/adjudicate/guards.rs
@@ -6,7 +6,7 @@
 //! is what the cross-pass verdict cache keys on.
 
 use crate::engine::graph::attack::AttackRef;
-use crate::engine::graph::{NodeKey, Relation, SecurityGraph};
+use crate::engine::graph::{Behavior, NodeKey, Relation, SecurityGraph};
 
 use super::Verdict;
 use super::evidence::entry_evidence;
@@ -107,6 +107,57 @@ pub(crate) fn guard_fabricated_cve(
     })
 }
 
+/// Whether a runtime behavior CORROBORATES an exploit — the engine's existing definition,
+/// reused verbatim, NOT a new one: a critical Falco alert ([`Behavior::is_alert`]) OR a
+/// notable shell/package-manager exec ([`crate::engine::observe::exec_class::notable_exec`],
+/// JEF-117). Benign `NetworkConnection`/`FileRead`/`LibraryLoaded`/`SecretRead` — a
+/// workload's own observed activity — are NOT corroborating and so must never anchor an
+/// `exploitable` (the watcher-server false breach: three benign connections to its own
+/// DB/metrics were read as a live signal).
+fn corroborating_behavior(behavior: &Behavior) -> bool {
+    behavior.is_alert() || crate::engine::observe::exec_class::notable_exec(behavior).is_some()
+}
+
+/// Zero-anchor safety net (the symmetric backstop to [`guard_fabricated_cve`]): a 1B judge
+/// fabricated an `Exploitable` verdict for the internet-facing `watcher-server` with NO
+/// exploitation evidence at all — no CVE was shown, no exposed secret was baked in, and the
+/// only runtime behavior was three benign `NetworkConnection`s to its own DB/metrics. It got
+/// there by (a) treating benign network connections as a live signal and (b) conflating
+/// reaching a `secret/…` objective with an exposed secret in the image. The correct verdict
+/// is `refuted`: reachability is not a breach.
+///
+/// This guard DOWNGRADES an `Exploitable` verdict to `Refuted` ONLY when ALL THREE
+/// exploitation anchors are absent:
+/// - the CVE evidence list is empty (no CVE was shown to the model), AND
+/// - there is no exposed-secret finding for the entry (`has_exposed_secret == false`), AND
+/// - no observed behavior is [`corroborating_behavior`] (no alert, no notable exec).
+///
+/// Be conservative: if ANY anchor is present — a CVE in the list (even
+/// reachability:not-observed), an exposed secret, or a corroborating behavior — the model's
+/// (debatable) call stands untouched. Those are the model's calls to make, not this guard's
+/// to override; this is purely the zero-anchor net. Like the fabrication guard it only ever
+/// acts on `Exploitable`, leaving every other verdict alone, and the entry is re-judged next
+/// pass.
+pub(crate) fn guard_unsupported_exploitable(
+    verdict: Verdict,
+    cves: &[String],
+    behaviors: &[Behavior],
+    has_exposed_secret: bool,
+) -> Verdict {
+    guard_exploitable(verdict, |_reason| {
+        let has_cve = !cves.is_empty();
+        let has_corroborating = behaviors.iter().any(corroborating_behavior);
+        let any_anchor = has_cve || has_exposed_secret || has_corroborating;
+        (!any_anchor).then(|| {
+            Verdict::Refuted(
+                "no exploitation evidence present (no CVE, no exposed secret, no runtime alert) \
+                 — reachability is not a breach"
+                    .to_string(),
+            )
+        })
+    })
+}
+
 /// A stable fingerprint of the evidence a verdict depends on — the entry's
 /// exposure, its exploited/critical CVEs, and its runtime behavior. The cross-pass
 /// verdict cache keys on this so an entry is re-judged only when the facts that
diff --git a/engine/src/engine/reason/adjudicate/model_call.rs b/engine/src/engine/reason/adjudicate/model_call.rs
index 4027f87..8b0954c 100644
--- a/engine/src/engine/reason/adjudicate/model_call.rs
+++ b/engine/src/engine/reason/adjudicate/model_call.rs
@@ -7,8 +7,8 @@
 use crate::engine::graph::attack::AttackRef;
 use crate::engine::graph::{NodeKey, SecurityGraph};
 
-use super::evidence::{cve_ids_of, entry_evidence};
-use super::guards::guard_fabricated_cve;
+use super::evidence::{cve_ids_of, entry_evidence, entry_findings};
+use super::guards::{guard_fabricated_cve, guard_unsupported_exploitable};
 use super::prompt::{build_judgment_prompt_with, parse_verdict};
 use super::{Adjudicator, Verdict};
 
@@ -86,6 +86,12 @@ impl Adjudicator for ModelAdjudicator {
         // model's, not the engine's. The ONE remaining backstop is anti-fabrication
         // (guard_fabricated_cve), not a decision gate.
         let (cves, behaviors) = entry_evidence(graph, entry);
+        // Exposed-secret presence for the zero-anchor backstop, read from the SAME source the
+        // prompt uses (`entry_findings` → `(secret_lines, posture_lines)`): a non-empty
+        // `secret_lines` means a usable credential is baked into the image. Posture (misconfig
+        // / RBAC) is NOT an exploitation anchor, so it is ignored here.
+        let (secret_lines, _posture_lines) = entry_findings(graph, entry);
+        let has_exposed_secret = !secret_lines.is_empty();
 
         let prompt = build_judgment_prompt_with(entry, objectives, graph, &cves, &behaviors);
         let (reply, verdict) =
@@ -98,7 +104,20 @@ impl Adjudicator for ModelAdjudicator {
                 // from the real evidence. A genuine `Exploitable` (a real CVE, or a non-CVE
                 // step that cites no CVE) passes through untouched.
                 Some(reply) => {
+                    // Two deterministic backstops, chained, both only ever acting on an
+                    // `Exploitable` verdict: anti-fabrication first (a cited CVE absent from the
+                    // evidence → skeptic), then the symmetric zero-anchor net (an `Exploitable`
+                    // with NO CVE, NO exposed secret, and NO corroborating runtime behavior →
+                    // `Refuted`, since reachability is not a breach — the watcher-server false
+                    // breach). They can overlap (a fabricated CVE cited with zero anchors); the
+                    // fabrication guard runs first, so that case ends as skeptic, not `Refuted`.
                     let verdict = guard_fabricated_cve(parse_verdict(&reply), &cve_ids_of(&cves));
+                    let verdict = guard_unsupported_exploitable(
+                        verdict,
+                        &cves,
+                        &behaviors,
+                        has_exposed_secret,
+                    );
                     (Some(reply), verdict)
                 }
                 // Model unavailable → skeptic: do not let an auto-action proceed.
diff --git a/engine/src/engine/reason/adjudicate/prompt.rs b/engine/src/engine/reason/adjudicate/prompt.rs
index 12c0e16..89067fd 100644
--- a/engine/src/engine/reason/adjudicate/prompt.rs
+++ b/engine/src/engine/reason/adjudicate/prompt.rs
@@ -133,8 +133,8 @@ A deterministic analysis already PROVED this workload can reach every objective
 
 A breach is a reached objective that carries EXPLOITATION EVIDENCE — and only that:
   - a critical / known-exploited CVE from the CVE list that is actually running here (vulnerable code on the path), OR
-  - an ALERT or hands-on-keyboard signal in the observed runtime behavior (something happening now), OR
-  - an EXPOSED SECRET baked into this image (a usable credential — an API key, token, or private key — committed into the image): a real, immediately-usable breach primitive on the path.
+  - an ALERT or hands-on-keyboard signal in the observed runtime behavior (something happening now) — a workload's OWN normal activity (outbound network connections, file reads, library loads, reading its own mounted secrets) is NOT a live signal, only an ALERT or hands-on-keyboard action counts, OR
+  - an EXPOSED SECRET baked into this image (a usable credential — an API key, token, or private key — committed into the image): a real, immediately-usable breach primitive on the path. Reaching a `secret/…` objective (a Credential-Access OUTCOME in the reachable-objectives list) is NOT an exposed secret — only a credential listed in the "Exposed secrets baked into this image" field below is exploitation evidence.
 Judge whether the evidence genuinely makes a reached objective exploitable. With NO such CVE, NO live signal, and NO exposed secret, it is NOT a breach — refute it, no matter how broad, cross-tenant, high-impact, or cross-namespace the reach. A cross-namespace network path or a delete/escalate capability is loose topology / broad authorization (how severe a fix is), not an attack in progress.
 
 Each objective is tagged with HOW it is reached — CONTEXT for how severe a finding would be, NOT a breach signal on its own:
diff --git a/engine/src/engine/reason/adjudicate/tests/group_1.rs b/engine/src/engine/reason/adjudicate/tests/group_1.rs
index c93de40..ae84d58 100644
--- a/engine/src/engine/reason/adjudicate/tests/group_1.rs
+++ b/engine/src/engine/reason/adjudicate/tests/group_1.rs
@@ -6,14 +6,15 @@
 
 use super::super::evidence::{cve_evidence, cve_ids_of, entry_evidence};
 use super::super::guards::{
-    extract_cve_ids, fence, fence_list, guard_fabricated_cve, ns_marker, objective_reach,
+    extract_cve_ids, fence, fence_list, guard_fabricated_cve, guard_unsupported_exploitable,
+    ns_marker, objective_reach,
 };
 use super::super::*;
 use super::{critical_cve, entry_reaching_db, graph_with_vuln, graph_with_vulns, objectives_of};
 use crate::engine::graph::attack::{AttackRef, EXPLOIT_PUBLIC_FACING};
 use crate::engine::graph::{
-    Edge, Exposure, Grade, Image, Node, NodeKey, Provenance, Relation, SecurityGraph, Severity,
-    Trust, Vulnerability, Workload,
+    Behavior, Edge, Exposure, Grade, Image, Node, NodeKey, Provenance, Relation, SecurityGraph,
+    Severity, Trust, Vulnerability, Workload,
 };
 use crate::engine::observe::adapter::{build_graph, default_adapters};
 use crate::engine::observe::{Attribution, ImageVulnerabilities, RuntimeObservation, Snapshot};
@@ -326,6 +327,141 @@ fn hallucination_guard_normalizes_cosmetic_cve_spellings() {
     ));
 }
 
+/// The zero-anchor backstop (symmetric to the fabrication guard): an `Exploitable` with
+/// NO CVE, NO exposed secret, and NO corroborating runtime behavior is downgraded to
+/// `Refuted` — reachability is not a breach. Models the watcher-server false breach: the
+/// internet-facing entry had CVEs `(none)`, no baked-in secret, and only benign
+/// `NetworkConnection`s to its own DB/metrics.
+#[test]
+fn unsupported_exploitable_guard_downgrades_when_no_anchor_present() {
+    let benign = vec![
+        Behavior::NetworkConnection {
+            peer: "10.42.0.1:8086".into(),
+            internet: false,
+        },
+        Behavior::NetworkConnection {
+            peer: "10.42.0.2:8090".into(),
+            internet: false,
+        },
+        Behavior::NetworkConnection {
+            peer: "10.42.0.3:4318".into(),
+            internet: false,
+        },
+    ];
+    // The watcher case: Exploitable + no CVE + no exposed secret + only benign connections.
+    let v = guard_unsupported_exploitable(
+        Verdict::Exploitable("connects to exposed secrets which are mounted into the pod".into()),
+        &[],
+        &benign,
+        false,
+    );
+    assert!(
+        matches!(v, Verdict::Refuted(_)) && !v.promotes(),
+        "zero-anchor exploitable must downgrade to refuted, got {v:?}"
+    );
+
+    // Other benign behaviors (file read, library load, secret read) are likewise no anchor.
+    let benign_misc = vec![
+        Behavior::FileRead {
+            path: "/etc/config".into(),
+        },
+        Behavior::LibraryLoaded {
+            name: "libc.so.6".into(),
+        },
+        Behavior::SecretRead {
+            secret: "app/own-creds".into(),
+        },
+    ];
+    assert!(matches!(
+        guard_unsupported_exploitable(
+            Verdict::Exploitable("reaches its own mounted secret".into()),
+            &[],
+            &benign_misc,
+            false,
+        ),
+        Verdict::Refuted(_)
+    ));
+}
+
+/// The guard is conservative: ANY single anchor — a CVE in the list (even
+/// reachability:not-observed), an exposed-secret finding, or a corroborating runtime
+/// behavior (a critical Falco alert, or a notable shell/package-manager exec, JEF-117) —
+/// leaves the model's `Exploitable` call untouched. Those are the model's (debatable)
+/// calls, not this guard's to override.
+#[test]
+fn unsupported_exploitable_guard_preserves_each_anchored_case() {
+    let no_behaviors: Vec<Behavior> = vec![];
+
+    // Anchor 1 — a CVE is present in the evidence list (the rendered line, any reachability).
+    assert!(matches!(
+        guard_unsupported_exploitable(
+            Verdict::Exploitable("CVE running on the path".into()),
+            &["CVE-2021-44228 [severity: critical] [reachability: not-observed]".to_string()],
+            &no_behaviors,
+            false,
+        ),
+        Verdict::Exploitable(_)
+    ));
+
+    // Anchor 2 — an exposed-secret finding is present for the entry.
+    assert!(matches!(
+        guard_unsupported_exploitable(
+            Verdict::Exploitable("usable credential baked into the image".into()),
+            &[],
+            &no_behaviors,
+            true,
+        ),
+        Verdict::Exploitable(_)
+    ));
+
+    // Anchor 3a — a corroborating runtime behavior: a critical Falco alert (is_alert()).
+    let alert = vec![Behavior::Alert {
+        rule: "Terminal shell in container".into(),
+    }];
+    assert!(matches!(
+        guard_unsupported_exploitable(
+            Verdict::Exploitable("alert fired on the path".into()),
+            &[],
+            &alert,
+            false,
+        ),
+        Verdict::Exploitable(_)
+    ));
+
+    // Anchor 3b — a corroborating runtime behavior: a notable exec (notable_exec(), JEF-117).
+    let notable = vec![Behavior::ProcessExec {
+        path: "/bin/bash".into(),
+    }];
+    assert!(matches!(
+        guard_unsupported_exploitable(
+            Verdict::Exploitable("interactive shell spawned".into()),
+            &[],
+            &notable,
+            false,
+        ),
+        Verdict::Exploitable(_)
+    ));
+}
+
+/// The guard only ever acts on `Exploitable` (it wraps `guard_exploitable`): every other
+/// verdict passes through unchanged even with zero anchors present.
+#[test]
+fn unsupported_exploitable_guard_leaves_non_exploitable_verdicts_untouched() {
+    let none: Vec<Behavior> = vec![];
+    assert!(matches!(
+        guard_unsupported_exploitable(Verdict::Refuted("benign".into()), &[], &none, false),
+        Verdict::Refuted(_)
+    ));
+    assert!(matches!(
+        guard_unsupported_exploitable(Verdict::Confirmed, &[], &none, false),
+        Verdict::Confirmed
+    ));
+    assert!(matches!(
+        guard_unsupported_exploitable(Verdict::Uncertain("unclear".into()), &[], &none, false),
+        Verdict::Uncertain(_)
+    ));
+}
+
 #[test]
 fn prompt_includes_the_chain_evidence() {
     // A foothold chain: exposed + KEV CVE + runtime signal → meets the bar.
diff --git a/engine/src/engine/reason/adjudicate/tests/group_3.rs b/engine/src/engine/reason/adjudicate/tests/group_3.rs
index 5d5e641..ad97963 100644
--- a/engine/src/engine/reason/adjudicate/tests/group_3.rs
+++ b/engine/src/engine/reason/adjudicate/tests/group_3.rs
@@ -49,9 +49,12 @@ fn oversized_fence_laden_title_stays_bounded_and_fence_intact() {
     let (g, e) = graph_with_vuln(v);
     let prompt = build_judgment_prompt(&e, &[], &g);
 
-    // The whole prompt is small despite the megabyte input — the cap bounds it hard.
+    // The whole prompt is small despite the megabyte input — the cap bounds it hard. The
+    // bound is on the UNTRUSTED payload, not the static template (the floor here is the
+    // ~4.3 KB static prompt + the per-field-capped title); a megabyte of title would blow
+    // past this by orders of magnitude if the cap failed, so the assertion still proves it.
     assert!(
-        prompt.len() < 4_000,
+        prompt.len() < 5_000,
         "prompt must stay bounded; was {} bytes",
         prompt.len()
     );
@@ -289,3 +292,41 @@ fn prompt_keeps_the_notable_exec_annotation_after_the_classifier_move() {
         "bare exec was wrongly annotated:\n{prompt}"
     );
 }
+
+/// The prompt clarifies (at the source of the watcher-server false breach) that a
+/// workload's OWN observed activity — outbound network connections, file reads, library
+/// loads, reading its own mounted secrets — is normal behavior and NOT a live signal;
+/// only an ALERT or hands-on-keyboard action counts as the runtime exploitation signal.
+#[test]
+fn prompt_clarifies_benign_runtime_activity_is_not_a_live_signal() {
+    let (g, e) = graph_with_vuln(critical_cve("CVE-2021-44228"));
+    let prompt = build_judgment_prompt(&e, &[], &g);
+    assert!(
+        prompt.contains("network connections") && prompt.contains("NOT a live signal"),
+        "prompt must say a workload's own network connections are NOT a live signal:\n{prompt}"
+    );
+    assert!(
+        prompt.contains("only an ALERT or hands-on-keyboard action counts"),
+        "prompt must restrict the runtime signal to alert/hands-on-keyboard:\n{prompt}"
+    );
+}
+
+/// The prompt clarifies that reaching a `secret/…` objective (a Credential-Access OUTCOME
+/// in the reachable-objectives list) is NOT the same as an exposed secret baked into the
+/// image — only a credential in the "Exposed secrets baked into this image" field is
+/// exploitation evidence. (The watcher judge conflated the two.)
+#[test]
+fn prompt_clarifies_reaching_a_secret_objective_is_not_an_exposed_secret() {
+    let (g, e) = graph_with_vuln(critical_cve("CVE-2021-44228"));
+    let prompt = build_judgment_prompt(&e, &[], &g);
+    assert!(
+        prompt.contains("Reaching a `secret/…` objective")
+            && prompt.contains("is NOT an exposed secret"),
+        "prompt must distinguish reaching a secret objective from an exposed secret:\n{prompt}"
+    );
+    assert!(
+        prompt
+            .contains("only a credential listed in the \"Exposed secrets baked into this image\""),
+        "prompt must point to the exposed-secrets field as the sole secret evidence:\n{prompt}"
+    );
+}