From 221d866ef5a00045c6e5a6ad6da4037ac8ef0c75 Mon Sep 17 00:00:00 2001 From: Wayne Sun Date: Wed, 1 Jul 2026 11:24:46 -0400 Subject: [PATCH 1/2] fix(supervisor): tolerate non-empty bounding set when CAP_SETPCAP is unavailable When running inside rootless Podman on Ubuntu 24.04 with AppArmor's apparmor_restrict_unprivileged_userns=1, prctl(PR_CAPBSET_DROP) returns EPERM even though CAP_SETPCAP may be nominally granted. The capability bounding set remains non-empty, causing the supervisor to abort sandbox creation. Add a new match arm in validate_capability_bounding_set_clear() that tolerates EPERM when the bounding set is non-empty: log a warning and continue, relying on seccomp to block dangerous syscalls. The existing privileged-environment behavior (fail-closed on non-empty success) is unchanged. Emit a parent-side OCSF DetectionFinding alert so the degraded mode is visible to operators and SIEM. The readiness probe performs a non-destructive bounding::drop() on an already-absent capability to detect AppArmor restrictions even when CAP_SETPCAP is nominally present in the effective set. 
Closes #2069 Signed-off-by: Wayne Sun --- .../src/process.rs | 118 +++++++++++++----- 1 file changed, 88 insertions(+), 30 deletions(-) diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index c1b6b4532..3d54c8939 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -27,6 +27,8 @@ use std::process::Stdio; use std::sync::OnceLock; use tokio::process::{Child, Command}; use tracing::debug; +#[cfg(target_os = "linux")] +use tracing::warn; const SUPERVISOR_ONLY_ENV_VARS: &[&str] = &[ openshell_core::sandbox_env::SANDBOX_TOKEN, @@ -189,12 +191,84 @@ fn validate_capability_bounding_set_clear( "Failed to clear unknown child capability bounding set entries: {unknown_err}" )), }, + Err(err) if err.code() == libc::EPERM => { + warn!( + ?remaining, + "CAP_SETPCAP is unavailable and the child capability bounding set is non-empty; \ + the child process relies on seccomp for confinement" + ); + Ok(()) + } Err(err) => Err(miette::miette!( "Failed to clear child capability bounding set: {err}" )), } } +/// Probe capability bounding-set availability and emit an OCSF +/// `DetectionFinding` from the parent process when `bounding::clear()` +/// would fail and the bounding set is non-empty. Called once before +/// `pre_exec`/`fork()` so the event reaches the tracing subscriber. +/// +/// The probe tries a non-destructive `bounding::drop()` on a capability +/// that is already absent from the bounding set. This triggers the same +/// `prctl(PR_CAPBSET_DROP)` syscall that `bounding::clear()` uses, so +/// `AppArmor` restrictions that block the syscall are detected even when +/// `CAP_SETPCAP` is nominally present in the effective set. 
+#[cfg(target_os = "linux")]
+fn log_capability_bounding_set_readiness() {
+    use std::sync::Once;
+    const FINDING_UID: &str = "bounding-set-clear-blocked";
+    const FINDING_TITLE: &str = "Capability Bounding Set Clear Blocked";
+    const FINDING_DESC: &str = "The supervisor cannot clear the child capability bounding set \
+        because PR_CAPBSET_DROP returns EPERM. \
+        The child process will rely on seccomp for confinement. \
+        This is expected in rootless container runtimes with \
+        AppArmor user-namespace restrictions.";
+
+    // Only the first call in this process runs the probe; later calls return early.
+    static FIRST_CALL: Once = Once::new();
+    let mut is_first_call = false;
+    FIRST_CALL.call_once(|| is_first_call = true);
+    if !is_first_call {
+        return;
+    }
+
+    let bounding = capctl::caps::bounding::probe();
+    if bounding.is_empty() {
+        return;
+    }
+
+    // Dropping a capability that is already absent exercises the drop call without
+    // changing the bounding set; with no such candidate the probe is skipped.
+    let Some(absent_cap) = capctl::caps::Cap::iter().find(|cap| !bounding.has(*cap)) else {
+        return;
+    };
+    let drop_denied = matches!(
+        capctl::caps::bounding::drop(absent_cap),
+        Err(err) if err.code() == libc::EPERM
+    );
+    if !drop_denied {
+        return;
+    }
+
+    openshell_ocsf::ocsf_emit!(
+        openshell_ocsf::DetectionFindingBuilder::new(openshell_ocsf::ctx::ctx())
+            .activity(openshell_ocsf::ActivityId::Open)
+            .severity(openshell_ocsf::SeverityId::High)
+            .confidence(openshell_ocsf::ConfidenceId::High)
+            .is_alert(true)
+            .finding_info(
+                openshell_ocsf::FindingInfo::new(FINDING_UID, FINDING_TITLE)
+                    .with_desc(FINDING_DESC),
+            )
+            .message(format!(
+                "PR_CAPBSET_DROP blocked, capability bounding set non-empty: {bounding:?}"
+            ))
+            .build()
+    );
+}
+
 // Pins the pre-seccomp child mount namespace where supervisor identity sockets
 // are shadowed. Children enter it with setns before dropping privileges.
#[cfg(target_os = "linux")] @@ -548,11 +622,14 @@ impl ProcessHandle { } } - // Probe Landlock availability and emit OCSF logs from the parent - // process where the tracing subscriber is functional. The child's - // pre_exec context cannot reliably emit structured logs. + // Probe Landlock and capability bounding-set availability and emit + // OCSF logs from the parent process where the tracing subscriber is + // functional. The child's pre_exec context cannot reliably emit + // structured logs. #[cfg(target_os = "linux")] sandbox::linux::log_sandbox_readiness(policy, workdir); + #[cfg(target_os = "linux")] + log_capability_bounding_set_readiness(); // Phase 1 (as root): Prepare Landlock ruleset by opening PathFds. // This MUST happen before drop_privileges() so that root-only paths @@ -1150,22 +1227,17 @@ mod tests { #[test] #[cfg(target_os = "linux")] - fn capability_bounding_set_clear_rejects_nonempty_eperm() { + fn capability_bounding_set_clear_tolerates_nonempty_eperm() { let mut remaining = capctl::caps::CapSet::empty(); remaining.add(capctl::caps::Cap::CHOWN); - let result = validate_capability_bounding_set_clear( - Err(capctl::Error::from_code(libc::EPERM)), - remaining, - || panic!("unknown capabilities should not be checked when known caps remain"), - ); - - assert!(result.is_err()); assert!( - result - .unwrap_err() - .to_string() - .contains("Failed to clear child capability bounding set") + validate_capability_bounding_set_clear( + Err(capctl::Error::from_code(libc::EPERM)), + remaining, + || panic!("unknown capabilities should not be checked when known caps remain"), + ) + .is_ok() ); } @@ -1270,21 +1342,7 @@ mod tests { let result = drop_privileges(&policy); - #[cfg(target_os = "linux")] - { - if capability_bounding_set_clear_available() { - assert!(result.is_ok(), "drop_privileges failed: {result:?}"); - } else { - let msg = format!("{}", result.unwrap_err()); - assert!( - msg.contains("Failed to clear child capability bounding set"), - "unexpected 
failure: {msg}" - ); - } - } - - #[cfg(not(target_os = "linux"))] - assert!(result.is_ok()); + assert!(result.is_ok(), "drop_privileges failed: {result:?}"); } #[test] From 4d5f5ad53e92f08949d1dc7b0a66e28a94b6e03b Mon Sep 17 00:00:00 2001 From: Wayne Sun Date: Wed, 1 Jul 2026 11:24:52 -0400 Subject: [PATCH 2/2] test(ci): add rootless capability regression test on ubuntu-24.04 Add a rootless-caps job to branch-checks.yml that runs the supervisor capability bounding set and drop_privileges tests as an unprivileged user on ubuntu-24.04 where AppArmor restricts PR_CAPBSET_DROP. Update architecture/sandbox.md to describe the degraded rootless mode where seccomp provides confinement when the bounding set cannot be cleared. Signed-off-by: Wayne Sun --- .github/workflows/branch-checks.yml | 30 +++++++++++++++++++++++++++++ architecture/sandbox.md | 12 ++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/.github/workflows/branch-checks.yml b/.github/workflows/branch-checks.yml index 7713febb6..5d5f3fd5f 100644 --- a/.github/workflows/branch-checks.yml +++ b/.github/workflows/branch-checks.yml @@ -172,6 +172,36 @@ jobs: - name: Test run: mise run test:python + rootless-caps: + name: Rootless capability tests + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + + - uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + toolchain: "1.95.0" + cache: false + + - name: Run supervisor capability tests without CAP_SETPCAP + run: | + sudo useradd -m testuser + sudo chmod a+rx /home/runner /home/runner/work /home/runner/work/OpenShell + sudo chmod -R a+rX "$GITHUB_WORKSPACE" + sudo cp -r /home/runner/.rustup /home/testuser/.rustup + sudo chown -R testuser: /home/testuser/.rustup + sudo mkdir -p /home/testuser/.cargo + sudo cp /home/runner/.cargo/config.toml /home/testuser/.cargo/ 
2>/dev/null || true + sudo chown -R testuser: /home/testuser/.cargo + sudo -u testuser env \ + PATH="/home/testuser/.cargo/bin:/home/testuser/.rustup/toolchains/1.95.0-x86_64-unknown-linux-gnu/bin:$PATH" \ + CARGO_HOME="/home/testuser/.cargo" \ + RUSTUP_HOME="/home/testuser/.rustup" \ + CARGO_TARGET_DIR="/home/testuser/target" \ + bash -c "cd $GITHUB_WORKSPACE && cargo test -p openshell-supervisor-process --lib -- capability_bounding drop_privileges" + markdown: name: Markdown needs: pr_metadata diff --git a/architecture/sandbox.md b/architecture/sandbox.md index 580d8f96d..c88c1ffbb 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -16,10 +16,14 @@ Each sandbox workload has two trust levels: The supervisor keeps enough privilege to manage the sandbox, but the agent child loses that privilege before user code runs. On Linux, child setup clears the capability bounding set during privilege drop so later execs cannot regain -container-granted capabilities. This is fail-closed: the supervisor retains -`CAP_SETPCAP` solely to perform the clear, and spawning the workload or SSH shell -aborts unless the bounding set ends up empty. A `setpcap` `EPERM` is tolerated -only when the set is already empty; any other outcome fails the spawn. +container-granted capabilities. When `CAP_SETPCAP` is usable, this is +fail-closed: the supervisor clears the bounding set and aborts if capabilities +remain. When the clear fails with `EPERM` (rootless Podman with AppArmor +user-namespace restrictions, or similar environments, even if `CAP_SETPCAP` is +nominally granted), the supervisor logs a warning, emits an OCSF +`DetectionFinding` alert, and continues with the bounding set intact. In this +degraded mode the child process relies on seccomp to block dangerous syscalls; +Landlock filesystem restrictions are applied independently and may also be active. ## Startup Flow