From c311b306a2fabd8b8aeb0312df8ba1426f04521b Mon Sep 17 00:00:00 2001 From: Jeff Larson Date: Mon, 29 Jun 2026 15:43:41 -0700 Subject: [PATCH] fix(adjudicator): refute exploitable verdicts with no evidence anchor + clarify runtime/secret evidence in the prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An internet-facing watcher-server Pod came back `exploitable` ("connects to exposed secrets which are mounted into the pod…") — a false breach. Its evidence: CVEs (none), no exposed secret baked into the image, runtime = three benign NetworkConnections to its own DB/metrics. The 1B judge fabricated evidence by treating benign connections as a live signal and conflating reaching a secret/… objective with an exposed secret in the image. Correct verdict: refuted. Add the symmetric backstop to guard_fabricated_cve: guard_unsupported_exploitable downgrades an Exploitable verdict to Refuted ONLY when ALL THREE exploitation anchors are absent — empty CVE list, no exposed-secret finding, and no corroborating runtime behavior (Behavior::is_alert() or exec_class::notable_exec, the engine's existing definition; benign Network/File/Library/SecretRead are NOT corroborating). Any anchor present leaves the model's call untouched. Wired after guard_fabricated_cve in model_call; exposed-secret presence read from the same entry_findings source the prompt uses. Also two surgical prompt clarifications: a workload's own activity (network connections, file reads, library loads, reading its own mounted secrets) is NOT a live signal — only an ALERT or hands-on-keyboard action is; and reaching a secret/… objective is NOT an exposed secret baked into the image. This shifts the verdict fingerprint, so entries re-judge once. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01VtjoJttCvBY4dzCoE4f9vP --- engine/src/engine/reason/adjudicate/guards.rs | 53 ++++++- .../engine/reason/adjudicate/model_call.rs | 23 ++- engine/src/engine/reason/adjudicate/prompt.rs | 4 +- .../engine/reason/adjudicate/tests/group_1.rs | 142 +++++++++++++++++- .../engine/reason/adjudicate/tests/group_3.rs | 45 +++++- 5 files changed, 257 insertions(+), 10 deletions(-) diff --git a/engine/src/engine/reason/adjudicate/guards.rs b/engine/src/engine/reason/adjudicate/guards.rs index 881000f..bcdf234 100644 --- a/engine/src/engine/reason/adjudicate/guards.rs +++ b/engine/src/engine/reason/adjudicate/guards.rs @@ -6,7 +6,7 @@ //! is what the cross-pass verdict cache keys on. use crate::engine::graph::attack::AttackRef; -use crate::engine::graph::{NodeKey, Relation, SecurityGraph}; +use crate::engine::graph::{Behavior, NodeKey, Relation, SecurityGraph}; use super::Verdict; use super::evidence::entry_evidence; @@ -107,6 +107,57 @@ pub(crate) fn guard_fabricated_cve( }) } +/// Whether a runtime behavior CORROBORATES an exploit — the engine's existing definition, +/// reused verbatim, NOT a new one: a critical Falco alert ([`Behavior::is_alert`]) OR a +/// notable shell/package-manager exec ([`crate::engine::observe::exec_class::notable_exec`], +/// JEF-117). Benign `NetworkConnection`/`FileRead`/`LibraryLoaded`/`SecretRead` — a +/// workload's own observed activity — are NOT corroborating and so must never anchor an +/// `exploitable` (the watcher-server false breach: three benign connections to its own +/// DB/metrics were read as a live signal). 
+fn corroborating_behavior(behavior: &Behavior) -> bool { + behavior.is_alert() || crate::engine::observe::exec_class::notable_exec(behavior).is_some() +} + +/// Zero-anchor safety net (the symmetric backstop to [`guard_fabricated_cve`]): a 1B judge +/// fabricated an `Exploitable` verdict for the internet-facing `watcher-server` with NO +/// exploitation evidence at all — no CVE was shown, no exposed secret was baked in, and the +/// only runtime behavior was three benign `NetworkConnection`s to its own DB/metrics. It got +/// there by (a) treating benign network connections as a live signal and (b) conflating +/// reaching a `secret/…` objective with an exposed secret in the image. The correct verdict +/// is `refuted`: reachability is not a breach. +/// +/// This guard DOWNGRADES an `Exploitable` verdict to `Refuted` ONLY when ALL THREE +/// exploitation anchors are absent: +/// - the CVE evidence list is empty (no CVE was shown to the model), AND +/// - there is no exposed-secret finding for the entry (`has_exposed_secret == false`), AND +/// - no observed behavior is [`corroborating_behavior`] (no alert, no notable exec). +/// +/// Be conservative: if ANY anchor is present — a CVE in the list (even +/// reachability:not-observed), an exposed secret, or a corroborating behavior — the model's +/// (debatable) call stands untouched. Those are the model's calls to make, not this guard's +/// to override; this is purely the zero-anchor net. Like the fabrication guard it only ever +/// acts on `Exploitable`, leaving every other verdict alone, and the entry is re-judged next +/// pass. 
+pub(crate) fn guard_unsupported_exploitable( + verdict: Verdict, + cves: &[String], + behaviors: &[Behavior], + has_exposed_secret: bool, +) -> Verdict { + guard_exploitable(verdict, |_reason| { + let has_cve = !cves.is_empty(); + let has_corroborating = behaviors.iter().any(corroborating_behavior); + let any_anchor = has_cve || has_exposed_secret || has_corroborating; + (!any_anchor).then(|| { + Verdict::Refuted( + "no exploitation evidence present (no CVE, no exposed secret, no runtime alert) \ + — reachability is not a breach" + .to_string(), + ) + }) + }) +} + /// A stable fingerprint of the evidence a verdict depends on — the entry's /// exposure, its exploited/critical CVEs, and its runtime behavior. The cross-pass /// verdict cache keys on this so an entry is re-judged only when the facts that diff --git a/engine/src/engine/reason/adjudicate/model_call.rs b/engine/src/engine/reason/adjudicate/model_call.rs index 4027f87..8b0954c 100644 --- a/engine/src/engine/reason/adjudicate/model_call.rs +++ b/engine/src/engine/reason/adjudicate/model_call.rs @@ -7,8 +7,8 @@ use crate::engine::graph::attack::AttackRef; use crate::engine::graph::{NodeKey, SecurityGraph}; -use super::evidence::{cve_ids_of, entry_evidence}; -use super::guards::guard_fabricated_cve; +use super::evidence::{cve_ids_of, entry_evidence, entry_findings}; +use super::guards::{guard_fabricated_cve, guard_unsupported_exploitable}; use super::prompt::{build_judgment_prompt_with, parse_verdict}; use super::{Adjudicator, Verdict}; @@ -86,6 +86,12 @@ impl Adjudicator for ModelAdjudicator { // model's, not the engine's. The ONE remaining backstop is anti-fabrication // (guard_fabricated_cve), not a decision gate. let (cves, behaviors) = entry_evidence(graph, entry); + // Exposed-secret presence for the zero-anchor backstop, read from the SAME source the + // prompt uses (`entry_findings` → `(secret_lines, posture_lines)`): a non-empty + // `secret_lines` means a usable credential is baked into the image. 
Posture (misconfig + // / RBAC) is NOT an exploitation anchor, so it is ignored here. + let (secret_lines, _posture_lines) = entry_findings(graph, entry); + let has_exposed_secret = !secret_lines.is_empty(); let prompt = build_judgment_prompt_with(entry, objectives, graph, &cves, &behaviors); let (reply, verdict) = @@ -98,7 +104,20 @@ impl Adjudicator for ModelAdjudicator { // from the real evidence. A genuine `Exploitable` (a real CVE, or a non-CVE // step that cites no CVE) passes through untouched. Some(reply) => { + // Two deterministic backstops, chained, both only ever acting on an + // `Exploitable` verdict: anti-fabrication first (a cited CVE absent from the + // evidence → skeptic), then the symmetric zero-anchor net (an `Exploitable` + // with NO CVE, NO exposed secret, and NO corroborating runtime behavior → + // `Refuted`, since reachability is not a breach — the watcher-server false + // breach). Order is harmless: the fabrication guard only fires when a CVE is + // cited, the unsupported guard only when no anchor exists. let verdict = guard_fabricated_cve(parse_verdict(&reply), &cve_ids_of(&cves)); + let verdict = guard_unsupported_exploitable( + verdict, + &cves, + &behaviors, + has_exposed_secret, + ); (Some(reply), verdict) } // Model unavailable → skeptic: do not let an auto-action proceed. 
diff --git a/engine/src/engine/reason/adjudicate/prompt.rs b/engine/src/engine/reason/adjudicate/prompt.rs index 12c0e16..89067fd 100644 --- a/engine/src/engine/reason/adjudicate/prompt.rs +++ b/engine/src/engine/reason/adjudicate/prompt.rs @@ -133,8 +133,8 @@ A deterministic analysis already PROVED this workload can reach every objective A breach is a reached objective that carries EXPLOITATION EVIDENCE — and only that: - a critical / known-exploited CVE from the CVE list that is actually running here (vulnerable code on the path), OR - - an ALERT or hands-on-keyboard signal in the observed runtime behavior (something happening now), OR - - an EXPOSED SECRET baked into this image (a usable credential — an API key, token, or private key — committed into the image): a real, immediately-usable breach primitive on the path. + - an ALERT or hands-on-keyboard signal in the observed runtime behavior (something happening now) — a workload's OWN normal activity (outbound network connections, file reads, library loads, reading its own mounted secrets) is NOT a live signal, only an ALERT or hands-on-keyboard action counts, OR + - an EXPOSED SECRET baked into this image (a usable credential — an API key, token, or private key — committed into the image): a real, immediately-usable breach primitive on the path. Reaching a `secret/…` objective (a Credential-Access OUTCOME in the reachable-objectives list) is NOT an exposed secret — only a credential listed in the "Exposed secrets baked into this image" field below is exploitation evidence. Judge whether the evidence genuinely makes a reached objective exploitable. With NO such CVE, NO live signal, and NO exposed secret, it is NOT a breach — refute it, no matter how broad, cross-tenant, high-impact, or cross-namespace the reach. A cross-namespace network path or a delete/escalate capability is loose topology / broad authorization (how severe a fix is), not an attack in progress. 
Each objective is tagged with HOW it is reached — CONTEXT for how severe a finding would be, NOT a breach signal on its own: diff --git a/engine/src/engine/reason/adjudicate/tests/group_1.rs b/engine/src/engine/reason/adjudicate/tests/group_1.rs index c93de40..ae84d58 100644 --- a/engine/src/engine/reason/adjudicate/tests/group_1.rs +++ b/engine/src/engine/reason/adjudicate/tests/group_1.rs @@ -6,14 +6,15 @@ use super::super::evidence::{cve_evidence, cve_ids_of, entry_evidence}; use super::super::guards::{ - extract_cve_ids, fence, fence_list, guard_fabricated_cve, ns_marker, objective_reach, + extract_cve_ids, fence, fence_list, guard_fabricated_cve, guard_unsupported_exploitable, + ns_marker, objective_reach, }; use super::super::*; use super::{critical_cve, entry_reaching_db, graph_with_vuln, graph_with_vulns, objectives_of}; use crate::engine::graph::attack::{AttackRef, EXPLOIT_PUBLIC_FACING}; use crate::engine::graph::{ - Edge, Exposure, Grade, Image, Node, NodeKey, Provenance, Relation, SecurityGraph, Severity, - Trust, Vulnerability, Workload, + Behavior, Edge, Exposure, Grade, Image, Node, NodeKey, Provenance, Relation, SecurityGraph, + Severity, Trust, Vulnerability, Workload, }; use crate::engine::observe::adapter::{build_graph, default_adapters}; use crate::engine::observe::{Attribution, ImageVulnerabilities, RuntimeObservation, Snapshot}; @@ -326,6 +327,141 @@ fn hallucination_guard_normalizes_cosmetic_cve_spellings() { )); } +/// The zero-anchor backstop (symmetric to the fabrication guard): an `Exploitable` with +/// NO CVE, NO exposed secret, and NO corroborating runtime behavior is downgraded to +/// `Refuted` — reachability is not a breach. Models the watcher-server false breach: the +/// internet-facing entry had CVEs `(none)`, no baked-in secret, and only benign +/// `NetworkConnection`s to its own DB/metrics. 
+#[test] +fn unsupported_exploitable_guard_downgrades_when_no_anchor_present() { + let benign = vec![ + Behavior::NetworkConnection { + peer: "10.42.0.1:8086".into(), + internet: false, + }, + Behavior::NetworkConnection { + peer: "10.42.0.2:8090".into(), + internet: false, + }, + Behavior::NetworkConnection { + peer: "10.42.0.3:4318".into(), + internet: false, + }, + ]; + // The watcher case: Exploitable + no CVE + no exposed secret + only benign connections. + let v = guard_unsupported_exploitable( + Verdict::Exploitable("connects to exposed secrets which are mounted into the pod".into()), + &[], + &benign, + false, + ); + assert!( + matches!(v, Verdict::Refuted(_)) && !v.promotes(), + "zero-anchor exploitable must downgrade to refuted, got {v:?}" + ); + + // Other benign behaviors (file read, library load, secret read) are likewise no anchor. + let benign_misc = vec![ + Behavior::FileRead { + path: "/etc/config".into(), + }, + Behavior::LibraryLoaded { + name: "libc.so.6".into(), + }, + Behavior::SecretRead { + secret: "app/own-creds".into(), + }, + ]; + assert!(matches!( + guard_unsupported_exploitable( + Verdict::Exploitable("reaches its own mounted secret".into()), + &[], + &benign_misc, + false, + ), + Verdict::Refuted(_) + )); +} + +/// The guard is conservative: ANY single anchor — a CVE in the list (even +/// reachability:not-observed), an exposed-secret finding, or a corroborating runtime +/// behavior (a critical Falco alert, or a notable shell/package-manager exec, JEF-117) — +/// leaves the model's `Exploitable` call untouched. Those are the model's (debatable) +/// calls, not this guard's to override. +#[test] +fn unsupported_exploitable_guard_preserves_each_anchored_case() { + let no_behaviors: Vec<Behavior> = vec![]; + + // Anchor 1 — a CVE is present in the evidence list (the rendered line, any reachability). 
+ assert!(matches!( + guard_unsupported_exploitable( + Verdict::Exploitable("CVE running on the path".into()), + &["CVE-2021-44228 [severity: critical] [reachability: not-observed]".to_string()], + &no_behaviors, + false, + ), + Verdict::Exploitable(_) + )); + + // Anchor 2 — an exposed-secret finding is present for the entry. + assert!(matches!( + guard_unsupported_exploitable( + Verdict::Exploitable("usable credential baked into the image".into()), + &[], + &no_behaviors, + true, + ), + Verdict::Exploitable(_) + )); + + // Anchor 3a — a corroborating runtime behavior: a critical Falco alert (is_alert()). + let alert = vec![Behavior::Alert { + rule: "Terminal shell in container".into(), + }]; + assert!(matches!( + guard_unsupported_exploitable( + Verdict::Exploitable("alert fired on the path".into()), + &[], + &alert, + false, + ), + Verdict::Exploitable(_) + )); + + // Anchor 3b — a corroborating runtime behavior: a notable exec (notable_exec(), JEF-117). + let notable = vec![Behavior::ProcessExec { + path: "/bin/bash".into(), + }]; + assert!(matches!( + guard_unsupported_exploitable( + Verdict::Exploitable("interactive shell spawned".into()), + &[], + &notable, + false, + ), + Verdict::Exploitable(_) + )); +} + +/// The guard only ever acts on `Exploitable` (mirrors `guard_exploitable`): every other +/// verdict passes through unchanged even with zero anchors present. 
+#[test] +fn unsupported_exploitable_guard_leaves_non_exploitable_verdicts_untouched() { + let none: Vec<Behavior> = vec![]; + assert!(matches!( + guard_unsupported_exploitable(Verdict::Refuted("benign".into()), &[], &none, false), + Verdict::Refuted(_) + )); + assert!(matches!( + guard_unsupported_exploitable(Verdict::Confirmed, &[], &none, false), + Verdict::Confirmed + )); + assert!(matches!( + guard_unsupported_exploitable(Verdict::Uncertain("unclear".into()), &[], &none, false), + Verdict::Uncertain(_) + )); +} + #[test] fn prompt_includes_the_chain_evidence() { // A foothold chain: exposed + KEV CVE + runtime signal → meets the bar. diff --git a/engine/src/engine/reason/adjudicate/tests/group_3.rs b/engine/src/engine/reason/adjudicate/tests/group_3.rs index 5d5e641..ad97963 100644 --- a/engine/src/engine/reason/adjudicate/tests/group_3.rs +++ b/engine/src/engine/reason/adjudicate/tests/group_3.rs @@ -49,9 +49,12 @@ fn oversized_fence_laden_title_stays_bounded_and_fence_intact() { let (g, e) = graph_with_vuln(v); let prompt = build_judgment_prompt(&e, &[], &g); - // The whole prompt is small despite the megabyte input — the cap bounds it hard. + // The whole prompt is small despite the megabyte input — the cap bounds it hard. The + // bound is on the UNTRUSTED payload, not the static template (the floor here is the + // ~4.3 KB static prompt + the per-field-capped title); a megabyte of title would blow + // past this by orders of magnitude if the cap failed, so the assertion still proves it. 
assert!( - prompt.len() < 4_000, + prompt.len() < 5_000, "prompt must stay bounded; was {} bytes", prompt.len() ); @@ -289,3 +292,41 @@ fn prompt_keeps_the_notable_exec_annotation_after_the_classifier_move() { "bare exec was wrongly annotated:\n{prompt}" ); } + +/// The prompt clarifies (at the source of the watcher-server false breach) that a +/// workload's OWN observed activity — outbound network connections, file reads, library +/// loads, reading its own mounted secrets — is normal behavior and NOT a live signal; +/// only an ALERT or hands-on-keyboard action counts as the runtime exploitation signal. +#[test] +fn prompt_clarifies_benign_runtime_activity_is_not_a_live_signal() { + let (g, e) = graph_with_vuln(critical_cve("CVE-2021-44228")); + let prompt = build_judgment_prompt(&e, &[], &g); + assert!( + prompt.contains("network connections") && prompt.contains("NOT a live signal"), + "prompt must say a workload's own network connections are NOT a live signal:\n{prompt}" + ); + assert!( + prompt.contains("only an ALERT or hands-on-keyboard action counts"), + "prompt must restrict the runtime signal to alert/hands-on-keyboard:\n{prompt}" + ); +} + +/// The prompt clarifies that reaching a `secret/…` objective (a Credential-Access OUTCOME +/// in the reachable-objectives list) is NOT the same as an exposed secret baked into the +/// image — only a credential in the "Exposed secrets baked into this image" field is +/// exploitation evidence. (The watcher judge conflated the two.) 
+#[test] +fn prompt_clarifies_reaching_a_secret_objective_is_not_an_exposed_secret() { + let (g, e) = graph_with_vuln(critical_cve("CVE-2021-44228")); + let prompt = build_judgment_prompt(&e, &[], &g); + assert!( + prompt.contains("Reaching a `secret/…` objective") + && prompt.contains("is NOT an exposed secret"), + "prompt must distinguish reaching a secret objective from an exposed secret:\n{prompt}" + ); + assert!( + prompt + .contains("only a credential listed in the \"Exposed secrets baked into this image\""), + "prompt must point to the exposed-secrets field as the sole secret evidence:\n{prompt}" + ); +}