diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 5bc04beb3..08e423013 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -268,6 +268,33 @@ kubectl -n openshell get configmap openshell-config -o jsonpath='{.data.gateway\ kubectl -n get sandbox -o jsonpath='{.spec.template.spec.serviceAccountName}{"\n"}' ``` +If `supervisor_topology = "sidecar"` is rendered, sandbox pods should have an +`openshell-network-init` init container running `--mode=network-init`, an +`agent` container running `openshell-sandbox --mode=process`, and an +`openshell-supervisor-network` container running `--mode=network`. The init +container owns nftables setup and should be the only sidecar topology container +with `NET_ADMIN`. It also needs `CHOWN`/`FOWNER` to hand shared emptyDir state +to `sidecar_proxy_uid`. The long-running network sidecar runs as +`sidecar_proxy_uid` with primary GID `0` so it can read the root-owned, +group-readable projected service-account token. In sidecar topology the +`openshell-sa-token` projected volume should render `defaultMode: 288` (`0440`); +if the proxy logs `failed to read K8s SA token`, verify this token mode and the +network sidecar security context. The process container should also publish the +workload entrypoint PID to `OPENSHELL_ENTRYPOINT_PID_FILE` +(`/run/openshell-sidecar/entrypoint.pid` by default), and the network sidecar +should read it for binary-scoped policy decisions; if allowed network rules are +all denied, inspect that file and the network sidecar logs. 
+Inspect all three when sandbox registration or egress enforcement fails: + +```bash +kubectl -n openshell get configmap openshell-config -o jsonpath='{.data.gateway\.toml}' | grep supervisor_topology +kubectl -n <namespace> get pod <pod-name> -o jsonpath='{range .spec.initContainers[*]}{.name}{" "}{.command}{"\n"}{end}' +kubectl -n <namespace> get pod <pod-name> -o jsonpath='{range .spec.containers[*]}{.name}{" "}{.command}{"\n"}{end}' +kubectl -n <namespace> logs <pod-name> -c openshell-network-init --tail=200 +kubectl -n <namespace> logs <pod-name> -c openshell-supervisor-network --tail=200 +kubectl -n <namespace> logs <pod-name> -c agent --tail=200 +``` + ### Step 6: Check VM-Backed Gateways Use the VM driver logs and host diagnostics available in the user's environment. Verify: diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index bffa4e2e8..7d6ad7cd5 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -60,9 +60,17 @@ mise run helm:skaffold:dev mise run helm:skaffold:run ``` +**Supervisor sidecar topology** (build once and leave running): +```bash +mise run helm:skaffold:run:sidecar +``` + Both commands build the `gateway` and `supervisor` images and deploy the OpenShell Helm -chart. The `pkiInitJob` hook (a pre-install Job that runs `openshell-gateway generate-certs`) -generates mTLS secrets on first install. Envoy Gateway opt-in; see the Optional Add-ons section below. +chart. The sidecar profile renders an `openshell-network-init` init container for +nftables setup and a non-root `openshell-supervisor-network` runtime sidecar for +proxying. The `pkiInitJob` hook (a pre-install Job that runs `openshell-gateway +generate-certs`) generates mTLS secrets on first install. Envoy Gateway is opt-in; +see the Optional Add-ons section below. The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or `kubectl port-forward`. 
@@ -126,6 +134,12 @@ openshell sandbox list --gateway-endpoint https://localhost:8090 mise run helm:skaffold:delete ``` +For a sidecar-profile deployment: + +```bash +mise run helm:skaffold:delete:sidecar +``` + ### Delete the cluster entirely ```bash @@ -250,6 +264,7 @@ for dependencies still declared in `Chart.yaml`. | `deploy/helm/openshell/ci/values-gateway.yaml` | Envoy Gateway GRPCRoute + Gateway overlay | | `deploy/helm/openshell/ci/values-high-availability.yaml` | HA test overlay (`replicaCount: 2` with external PostgreSQL Secret) | | `deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay | +| `deploy/helm/openshell/ci/values-sidecar.yaml` | Supervisor sidecar topology overlay for Kubernetes e2e/dev | | `deploy/helm/openshell/ci/values-spire.yaml` | SPIFFE/SPIRE provider token grant overlay | | `deploy/helm/openshell/ci/values-spire-stack.yaml` | SPIRE hardened chart values for local dev | | `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) | diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index ebe783406..859f22ef9 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -121,16 +121,25 @@ jobs: include: - agent_sandbox_api: v1beta1 agent_sandbox_version: v0.5.0 + topology: combined + extra_helm_values: "" - agent_sandbox_api: v1alpha1 agent_sandbox_version: v0.4.6 + topology: combined + extra_helm_values: "" + - agent_sandbox_api: v1beta1 + agent_sandbox_version: v0.5.0 + topology: sidecar + extra_helm_values: deploy/helm/openshell/ci/values-sidecar.yaml permissions: contents: read packages: read uses: ./.github/workflows/e2e-kubernetes-test.yml with: image-tag: ${{ github.sha }} - job-name: Kubernetes E2E (Rust smoke, Agent Sandbox ${{ matrix.agent_sandbox_api }}) + job-name: Kubernetes E2E (Rust smoke, ${{ matrix.topology }}, Agent Sandbox ${{ matrix.agent_sandbox_api }}) agent-sandbox-version: ${{ 
matrix.agent_sandbox_version }} + extra-helm-values: ${{ matrix.extra_helm_values }} kubernetes-ha-e2e: needs: [pr_metadata, build-gateway, build-supervisor] diff --git a/Cargo.lock b/Cargo.lock index 13b670f55..e94eb56f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3827,6 +3827,7 @@ dependencies = [ "clap", "futures", "miette", + "nix", "openshell-core", "openshell-ocsf", "openshell-policy", diff --git a/architecture/build.md b/architecture/build.md index a3cb2e25f..633efa72d 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -91,10 +91,11 @@ Runtime layout: as a release artifact. Linux GNU VM driver binaries must not reference `GLIBC_*` symbols newer than `GLIBC_2.28`; release workflows verify this before publishing artifacts. -- **Supervisor**: `scratch` base, static musl binary at `/openshell-sandbox`. - Static linkage is required because the image is mounted/extracted into - sandbox environments (Docker extraction, Podman image volumes, Kubernetes - init-container copy-self) and cannot rely on a dynamic loader. +- **Supervisor**: Alpine base with `nftables`, static musl binary at + `/openshell-sandbox`. Static linkage keeps the binary usable when the image + is mounted/extracted into sandbox environments (Docker extraction, Podman + image volumes, Kubernetes init-container copy-self), while `nftables` supports + Kubernetes supervisor sidecar egress enforcement. Gateway image builds bake the corresponding supervisor image tag into the gateway binary so Docker sandboxes do not depend on `:latest` by default. diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index f122fda5d..ac239bfb8 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -81,7 +81,7 @@ The supervisor must be available inside each sandbox workload: |---|---| | Docker | Bind-mounted local supervisor binary, or a binary extracted from the configured supervisor image. 
| | Podman | Read-only OCI image volume containing the supervisor binary. | -| Kubernetes | Sandbox pod image or pod template configuration. | +| Kubernetes | Supervisor image side-loaded into the sandbox pod by image volume or init container. | | VM | Embedded in the guest rootfs bundle. | | Extension | Defined by the out-of-tree driver. | @@ -89,6 +89,20 @@ Driver-controlled environment variables must override sandbox image or template values for sandbox ID, sandbox name, gateway endpoint, relay socket path, TLS paths, and command metadata. +Kubernetes can run the supervisor in the default combined topology or in a +sidecar topology. Combined mode keeps network and process supervision in the +agent container. Sidecar mode runs network enforcement, the proxy, and gateway +loopback forwarding in a dedicated sidecar, while the agent container runs only +the process-supervision leaf and launches the user workload after the sidecar +signals readiness. In sidecar mode, an init container performs the privileged +pod-network nftables setup with `NET_ADMIN` and hands shared state ownership to +the configured proxy UID; the long-running network sidecar runs as that UID and +does not keep `NET_ADMIN`. The agent container runs as the resolved sandbox +UID/GID with no added Linux capabilities. Sidecar mode preserves gateway session +and SSH behavior, but treats the process leaf as network-only: Landlock +filesystem policy, process privilege dropping, and process/binary identity +checks are not applied there. + ## Images The gateway image and Helm chart are built from this repository. 
Sandbox images diff --git a/crates/openshell-core/src/grpc_client.rs b/crates/openshell-core/src/grpc_client.rs index 96158a1d1..4f2477c25 100644 --- a/crates/openshell-core/src/grpc_client.rs +++ b/crates/openshell-core/src/grpc_client.rs @@ -167,9 +167,14 @@ async fn build_plain_channel(endpoint: &str) -> Result { .into_diagnostic() .wrap_err_with(|| format!("failed to read client key from {key_path}"))?; - let tls_config = ClientTlsConfig::new() + let mut tls_config = ClientTlsConfig::new() .ca_certificate(Certificate::from_pem(ca_pem)) .identity(Identity::from_pem(cert_pem, key_pem)); + if let Ok(server_name) = std::env::var(sandbox_env::GATEWAY_TLS_SERVER_NAME) + && !server_name.is_empty() + { + tls_config = tls_config.domain_name(server_name); + } ep = ep .tls_config(tls_config) diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index c56a1c889..ae3a21787 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -29,6 +29,47 @@ pub const SANDBOX_COMMAND: &str = "OPENSHELL_SANDBOX_COMMAND"; /// Deployment-controlled telemetry toggle propagated to the sandbox supervisor. pub const TELEMETRY_ENABLED: &str = "OPENSHELL_TELEMETRY_ENABLED"; +/// Supervisor pod/runtime topology. Kubernetes sidecar mode sets this to +/// `"sidecar"`; the default combined supervisor path omits it. +pub const SUPERVISOR_TOPOLOGY: &str = "OPENSHELL_SUPERVISOR_TOPOLOGY"; + +/// Network enforcement backend selected by the compute driver. +pub const NETWORK_ENFORCEMENT_MODE: &str = "OPENSHELL_NETWORK_ENFORCEMENT_MODE"; + +/// Process enforcement mode selected by the compute driver. +/// +/// The default when unset is `"full"`, where the process supervisor enforces +/// filesystem/process policy before spawning workloads. 
Kubernetes sidecar +/// topology sets this to `"network-only"` so the process wrapper can run as +/// the sandbox UID without Linux capabilities while preserving SSH/session +/// behavior. +pub const PROCESS_ENFORCEMENT_MODE: &str = "OPENSHELL_PROCESS_ENFORCEMENT_MODE"; + +/// Whether network policy evaluation must bind requests to the peer binary. +/// +/// The default when unset is `"required"`. Kubernetes sidecar experiments may +/// set this to `"relaxed"` to enforce endpoint and L7 policy without per-binary +/// `/proc` identity binding. +pub const NETWORK_BINARY_IDENTITY: &str = "OPENSHELL_NETWORK_BINARY_IDENTITY"; + +/// File written by the network supervisor when sidecar networking is ready. +pub const SUPERVISOR_READY_FILE: &str = "OPENSHELL_SUPERVISOR_READY_FILE"; + +/// File written by the process supervisor with the workload entrypoint PID and +/// read by the network sidecar for process/binary-bound network policy checks. +pub const ENTRYPOINT_PID_FILE: &str = "OPENSHELL_ENTRYPOINT_PID_FILE"; + +/// Loopback address where the network sidecar forwards gateway gRPC traffic. +pub const GATEWAY_FORWARD_ADDR: &str = "OPENSHELL_GATEWAY_FORWARD_ADDR"; + +/// Optional TLS server name used when the process supervisor reaches the +/// gateway through a loopback TCP forward. +pub const GATEWAY_TLS_SERVER_NAME: &str = "OPENSHELL_GATEWAY_TLS_SERVER_NAME"; + +/// Directory where the network supervisor writes the proxy CA files consumed +/// by workload child processes. +pub const PROXY_TLS_DIR: &str = "OPENSHELL_PROXY_TLS_DIR"; + /// Path to the CA certificate for mTLS communication with the gateway. pub const TLS_CA: &str = "OPENSHELL_TLS_CA"; diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 831e4edf2..452b26484 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -53,9 +53,29 @@ pods do not need direct external ingress for SSH. 
## Container Security Context -The driver grants the sandbox agent container the Linux capabilities the -supervisor needs for namespace setup and policy enforcement. It can also request -a Kubernetes AppArmor profile through `app_armor_profile`. +The default `combined` supervisor topology grants the sandbox agent container +the Linux capabilities the supervisor needs for namespace setup and process, +filesystem, and network policy enforcement. + +The `sidecar` supervisor topology moves pod-level network setup into a root init +container and runs the long-lived network sidecar as a non-root UID with no +added Linux capabilities. The agent container also runs as the resolved sandbox +UID/GID with `allowPrivilegeEscalation: false` and `capabilities.drop: ["ALL"]`. +In this mode OpenShell preserves gateway session and SSH behavior, but the +process supervisor defaults to network-only mode and does not apply Landlock +filesystem policy, process privilege dropping, or process/binary identity +checks. Network endpoint and L7 policy remain enforced by the network sidecar. +Set `process_enforcement = "full"` only when you want combined-mode +process/filesystem guards and accept the added agent-container permissions. + +Sidecar mode uses the pod `fsGroup` to make the projected service-account token +and sandbox client TLS secret group-readable so the non-root process supervisor +can authenticate to the gateway. Treat the agent container as trusted with +respect to those in-pod gateway credentials until a narrower credential handoff +exists. + +The driver can request a Kubernetes AppArmor profile through +`app_armor_profile`. Supported values are `Unconfined`, `RuntimeDefault`, and `Localhost/`. 
An empty or unset value omits diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 292563c2e..a0d3920cd 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -15,6 +15,9 @@ pub const DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME: &str = "default"; /// Default storage size for the workspace PVC. pub const DEFAULT_WORKSPACE_STORAGE_SIZE: &str = "2Gi"; +/// Default UID for the long-running Kubernetes network supervisor sidecar. +pub const DEFAULT_PROXY_UID: u32 = 1337; + /// How the supervisor binary is delivered into sandbox pods. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] @@ -59,12 +62,16 @@ pub enum SupervisorTopology { /// Run networking and process supervision in the agent container. #[default] Combined, + /// Run network supervision in a dedicated non-root sidecar and process + /// supervision as a low-capability wrapper in the agent container. + Sidecar, } impl std::fmt::Display for SupervisorTopology { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Combined => f.write_str("combined"), + Self::Sidecar => f.write_str("sidecar"), } } } @@ -75,11 +82,49 @@ impl FromStr for SupervisorTopology { fn from_str(s: &str) -> Result<Self, Self::Err> { match s { "combined" => Ok(Self::Combined), + "sidecar" => Ok(Self::Sidecar), other => Err(format!("unknown supervisor topology '{other}'")), } } } +/// Process/filesystem controls applied by the process supervisor in split +/// Kubernetes topologies. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum ProcessEnforcementMode { + /// Preserve process launch and session relay behavior, but do not apply + /// filesystem/process guards in the agent container. 
+ #[default] + NetworkOnly, + /// Run the process supervisor with the same process/filesystem controls as + /// combined topology. + Full, +} + +impl std::fmt::Display for ProcessEnforcementMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NetworkOnly => f.write_str("network-only"), + Self::Full => f.write_str("full"), + } + } +} + +impl FromStr for ProcessEnforcementMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "network-only" => Ok(Self::NetworkOnly), + "full" => Ok(Self::Full), + other => Err(format!( + "unknown process enforcement mode '{other}'; expected 'network-only' or 'full'" + )), + } + } +} + /// Kubernetes `AppArmor` profile requested for the sandbox agent container. #[derive(Debug, Clone, PartialEq, Eq)] pub enum AppArmorProfile { @@ -206,6 +251,14 @@ pub struct KubernetesComputeConfig { pub supervisor_sideload_method: SupervisorSideloadMethod, /// How the supervisor is arranged for Kubernetes sandbox pods. pub supervisor_topology: SupervisorTopology, + /// Process/filesystem enforcement mode used by the agent container in + /// non-combined topologies. `network-only` keeps the low-permission agent + /// shape; `full` grants the agent supervisor combined-mode controls. + pub process_enforcement: ProcessEnforcementMode, + /// UID used by the long-running network sidecar in `sidecar` topology. + /// The network init container installs nftables rules that exempt this + /// UID, so it must not match the sandbox workload UID. 
+ pub proxy_uid: u32, pub grpc_endpoint: String, pub ssh_socket_path: String, pub client_tls_secret_name: String, @@ -292,6 +345,8 @@ impl Default for KubernetesComputeConfig { supervisor_image_pull_policy: String::new(), supervisor_sideload_method: SupervisorSideloadMethod::default(), supervisor_topology: SupervisorTopology::default(), + process_enforcement: ProcessEnforcementMode::default(), + proxy_uid: DEFAULT_PROXY_UID, grpc_endpoint: String::new(), ssh_socket_path: "/run/openshell/ssh.sock".to_string(), client_tls_secret_name: String::new(), @@ -336,6 +391,16 @@ impl KubernetesComputeConfig { ) } + pub fn validate_proxy_uid(&self) -> Result<(), String> { + if self.proxy_uid < openshell_policy::MIN_SANDBOX_UID { + return Err(format!( + "proxy_uid must be at least {}", + openshell_policy::MIN_SANDBOX_UID + )); + } + Ok(()) + } + /// Resolve the sandbox UID/GID pair. /// /// Resolution order: @@ -351,6 +416,7 @@ impl KubernetesComputeConfig { if let Some(uid) = self.sandbox_uid { return uid; } + // Try OpenShift SCC annotation. 
if let Some(anns) = namespace_annotations && let Some(range) = anns.get(ANNOTATION_SCC_UID_RANGE) && let Some(uid) = Self::from_open_shift_uid_range(range) @@ -462,19 +528,32 @@ mod tests { } #[test] - fn default_service_account_name_is_default() { + fn default_supervisor_topology_is_combined() { let cfg = KubernetesComputeConfig::default(); - assert_eq!( - cfg.service_account_name, - DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME - ); + assert_eq!(cfg.supervisor_topology, SupervisorTopology::Combined); + assert_eq!(cfg.supervisor_topology.to_string(), "combined"); } #[test] - fn default_supervisor_topology_is_combined() { + fn default_proxy_uid_is_dedicated_non_root_uid() { let cfg = KubernetesComputeConfig::default(); - assert_eq!(cfg.supervisor_topology, SupervisorTopology::Combined); - assert_eq!(cfg.supervisor_topology.to_string(), "combined"); + assert_eq!(cfg.proxy_uid, DEFAULT_PROXY_UID); + } + + #[test] + fn default_process_enforcement_is_network_only() { + let cfg = KubernetesComputeConfig::default(); + assert_eq!(cfg.process_enforcement, ProcessEnforcementMode::NetworkOnly); + assert_eq!(cfg.process_enforcement.to_string(), "network-only"); + } + + #[test] + fn serde_override_supervisor_topology_sidecar() { + let json = serde_json::json!({ + "supervisor_topology": "sidecar" + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.supervisor_topology, SupervisorTopology::Sidecar); } #[test] @@ -486,6 +565,35 @@ mod tests { assert_eq!(cfg.supervisor_topology, SupervisorTopology::Combined); } + #[test] + fn serde_override_process_enforcement_full() { + let json = serde_json::json!({ + "process_enforcement": "full" + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.process_enforcement, ProcessEnforcementMode::Full); + } + + #[test] + fn serde_override_proxy_uid() { + let json = serde_json::json!({ + "proxy_uid": 2000 + }); + let cfg: KubernetesComputeConfig = 
serde_json::from_value(json).unwrap(); + assert_eq!(cfg.proxy_uid, 2000); + cfg.validate_proxy_uid().unwrap(); + } + + #[test] + fn validate_proxy_uid_rejects_privileged_uid() { + let cfg = KubernetesComputeConfig { + proxy_uid: 999, + ..KubernetesComputeConfig::default() + }; + let err = cfg.validate_proxy_uid().unwrap_err(); + assert!(err.contains("proxy_uid")); + } + #[test] fn serde_rejects_invalid_supervisor_topology() { let json = serde_json::json!({ @@ -495,6 +603,24 @@ mod tests { assert!(err.to_string().contains("unknown variant")); } + #[test] + fn serde_rejects_invalid_process_enforcement() { + let json = serde_json::json!({ + "process_enforcement": "privileged" + }); + let err = serde_json::from_value::<KubernetesComputeConfig>(json).unwrap_err(); + assert!(err.to_string().contains("unknown variant")); + } + + #[test] + fn default_service_account_name_is_default() { + let cfg = KubernetesComputeConfig::default(); + assert_eq!( + cfg.service_account_name, + DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME + ); + } + #[test] fn serde_override_workspace_storage_size() { let json = serde_json::json!({ diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 166f18b1c..ae45b3f50 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -5,8 +5,9 @@ use super::AppArmorProfile; use crate::config::{ - DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_SANDBOX_UID, DEFAULT_WORKSPACE_STORAGE_SIZE, - KubernetesComputeConfig, SupervisorSideloadMethod, + DEFAULT_PROXY_UID, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_SANDBOX_UID, + DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, ProcessEnforcementMode, + SupervisorSideloadMethod, SupervisorTopology, }; use futures::{Stream, StreamExt, TryStreamExt}; use k8s_openapi::api::core::v1::{Event as KubeEventObj, Namespace, Node}; @@ -221,6 +222,9 @@ impl KubernetesComputeDriver { config .validate_sandbox_identity_config() 
.map_err(KubernetesDriverError::Precondition)?; + config + .validate_proxy_uid() + .map_err(KubernetesDriverError::Precondition)?; let base_config = match kube::Config::incluster() { Ok(c) => c, Err(_) => kube::Config::infer() @@ -549,7 +553,8 @@ impl KubernetesComputeDriver { .map_err(KubernetesDriverError::Message)?; // Resolve sandbox UID/GID from config or OpenShift SCC namespace annotations. - let (resolved_uid, resolved_gid, ns_annotations) = self.resolve_sandbox_identity().await; + let (resolved_user_id, resolved_group_id, ns_annotations) = + self.resolve_sandbox_identity().await; let params = SandboxPodParams { default_image: &self.config.default_image, @@ -558,6 +563,9 @@ impl KubernetesComputeDriver { supervisor_image: &self.config.supervisor_image, supervisor_image_pull_policy: &self.config.supervisor_image_pull_policy, supervisor_sideload_method: self.config.supervisor_sideload_method, + supervisor_topology: self.config.supervisor_topology, + process_enforcement: self.config.process_enforcement, + proxy_uid: self.config.proxy_uid, service_account_name: &self.config.service_account_name, sandbox_id: &sandbox.id, sandbox_name: &sandbox.name, @@ -574,9 +582,10 @@ impl KubernetesComputeDriver { provider_spiffe_workload_api_socket_path: &self .config .provider_spiffe_workload_api_socket_path, - sandbox_uid: resolved_uid, - sandbox_gid: resolved_gid, + sandbox_uid: resolved_user_id, + sandbox_gid: resolved_group_id, }; + validate_sidecar_proxy_identity(&params)?; let mut obj = DynamicObject::new(name, &agent_sandbox_api.resource); // Copy only the SCC-related annotations onto the Sandbox CR for @@ -1035,6 +1044,31 @@ const SUPERVISOR_VOLUME_NAME: &str = "openshell-supervisor-bin"; /// Name of the init container that installs the supervisor binary. const SUPERVISOR_INIT_CONTAINER_NAME: &str = "openshell-supervisor-install"; +/// Name of the init container that prepares pod-level sidecar networking. 
+const SUPERVISOR_NETWORK_INIT_CONTAINER_NAME: &str = "openshell-network-init"; + +/// Container name for the network-only supervisor sidecar. +const SUPERVISOR_NETWORK_SIDECAR_NAME: &str = "openshell-supervisor-network"; + +/// Shared volume used by the network sidecar to signal readiness to the +/// process-only supervisor in the agent container. +const SIDECAR_STATE_VOLUME_NAME: &str = "openshell-sidecar-state"; +const SIDECAR_STATE_MOUNT_PATH: &str = "/run/openshell-sidecar"; +const SIDECAR_READY_FILE: &str = "/run/openshell-sidecar/supervisor.ready"; +const SIDECAR_ENTRYPOINT_PID_FILE: &str = "/run/openshell-sidecar/entrypoint.pid"; +const SIDECAR_SSH_SOCKET_FILE: &str = "/run/openshell-sidecar/ssh.sock"; + +/// Shared TLS work directory. The network sidecar writes the proxy CA bundle +/// here, while the agent container consumes it after the readiness file exists. +const SIDECAR_TLS_VOLUME_NAME: &str = "openshell-supervisor-tls"; +const SIDECAR_TLS_MOUNT_PATH: &str = "/etc/openshell-tls/proxy"; +const SIDECAR_CLIENT_TLS_MOUNT_PATH: &str = "/etc/openshell-tls/proxy/client"; + +/// Loopback listener owned by the network sidecar. The process-only supervisor +/// connects here for gateway gRPC, and the sidecar forwards bytes to the real +/// gateway endpoint using its own network privileges. +const SIDECAR_GATEWAY_FORWARD_ADDR: &str = "127.0.0.1:18080"; + /// Build the emptyDir volume that holds the supervisor binary. /// /// The init container writes the binary here; the agent container reads it. @@ -1109,31 +1143,12 @@ fn supervisor_init_container( spec } -/// Apply supervisor side-load transforms to an already-built pod template JSON. -/// -/// Depending on the sideload method: -/// - **`ImageVolume`**: mounts the supervisor OCI image directly as a read-only -/// volume (no init container needed, requires K8s >= v1.33). 
-/// - **`InitContainer`**: injects an emptyDir volume and an init container that -/// copies the supervisor binary from the supervisor image into that volume. -/// -/// In both cases, the agent container gets a command override to run the -/// side-loaded binary and `runAsUser: 0` so it can create network namespaces, -/// set up the proxy, and configure Landlock/seccomp. -#[allow(clippy::similar_names)] -fn apply_supervisor_sideload( - pod_template: &mut serde_json::Value, +fn apply_supervisor_binary_source( + spec: &mut serde_json::Map, supervisor_image: &str, supervisor_image_pull_policy: &str, method: SupervisorSideloadMethod, - sandbox_uid: u32, - sandbox_gid: u32, ) { - let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { - return; - }; - - // 1. Add the volume (image source or emptyDir depending on method) let volumes = spec .entry("volumes") .or_insert_with(|| serde_json::json!([])) @@ -1152,7 +1167,6 @@ fn apply_supervisor_sideload( } } - // 2. Add the init container only for the init-container method if method == SupervisorSideloadMethod::InitContainer { let init_containers = spec .entry("initContainers") @@ -1165,8 +1179,35 @@ fn apply_supervisor_sideload( )); } } +} - // 3. Find the agent container and add volume mount + command override +/// Apply supervisor side-load transforms to an already-built pod template JSON. +/// +/// Depending on the sideload method: +/// - **`ImageVolume`**: mounts the supervisor OCI image directly as a read-only +/// volume (no init container needed, requires K8s >= v1.33). +/// - **`InitContainer`**: injects an emptyDir volume and an init container that +/// copies the supervisor binary from the supervisor image into that volume. +/// +/// In both cases, the agent container gets a command override to run the +/// side-loaded binary as root so it can create network namespaces, set up the +/// proxy, and configure Landlock/seccomp. 
+#[allow(clippy::similar_names)] +fn apply_supervisor_sideload( + pod_template: &mut serde_json::Value, + supervisor_image: &str, + supervisor_image_pull_policy: &str, + method: SupervisorSideloadMethod, + sandbox_uid: u32, + sandbox_gid: u32, +) { + let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { + return; + }; + + apply_supervisor_binary_source(spec, supervisor_image, supervisor_image_pull_policy, method); + + // Find the agent container and add volume mount + command override let Some(containers) = spec.get_mut("containers").and_then(|v| v.as_array_mut()) else { return; }; @@ -1227,6 +1268,420 @@ fn apply_supervisor_sideload( } } +fn sidecar_state_volume_mount() -> serde_json::Value { + serde_json::json!({ + "name": SIDECAR_STATE_VOLUME_NAME, + "mountPath": SIDECAR_STATE_MOUNT_PATH, + }) +} + +fn sidecar_tls_volume_mount() -> serde_json::Value { + serde_json::json!({ + "name": SIDECAR_TLS_VOLUME_NAME, + "mountPath": SIDECAR_TLS_MOUNT_PATH, + }) +} + +fn sidecar_process_gateway_endpoint(grpc_endpoint: &str) -> String { + if grpc_endpoint.is_empty() { + String::new() + } else if grpc_endpoint.starts_with("https://") { + format!("https://{SIDECAR_GATEWAY_FORWARD_ADDR}") + } else { + format!("http://{SIDECAR_GATEWAY_FORWARD_ADDR}") + } +} + +fn gateway_tls_server_name(grpc_endpoint: &str) -> Option { + let rest = grpc_endpoint.strip_prefix("https://")?; + let authority = rest.split('/').next().unwrap_or(rest); + if authority.is_empty() { + return None; + } + if let Some(bracketed) = authority.strip_prefix('[') { + return bracketed.split(']').next().map(str::to_string); + } + authority + .split(':') + .next() + .filter(|host| !host.is_empty()) + .map(str::to_string) +} + +fn copy_log_level_env( + env: &mut Vec, + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, +) { + if let Some(value) = spec_environment + .get(openshell_core::sandbox_env::LOG_LEVEL) + .or_else(|| 
template_environment.get(openshell_core::sandbox_env::LOG_LEVEL)) + { + upsert_env(env, openshell_core::sandbox_env::LOG_LEVEL, value); + } +} + +fn supervisor_sidecar_env( + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, +) -> Vec { + let mut env = Vec::new(); + apply_required_env( + &mut env, + params.sandbox_id, + params.sandbox_name, + params.grpc_endpoint, + "", + !params.client_tls_secret_name.is_empty(), + provider_spiffe_socket_path(params), + ); + if !params.client_tls_secret_name.is_empty() { + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_CA, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/ca.crt"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_CERT, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/tls.crt"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_KEY, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/tls.key"), + ); + } + copy_log_level_env(&mut env, template_environment, spec_environment); + upsert_env( + &mut env, + openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY, + "sidecar", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE, + "sidecar-nftables", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::NETWORK_BINARY_IDENTITY, + "relaxed", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::SUPERVISOR_READY_FILE, + SIDECAR_READY_FILE, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::ENTRYPOINT_PID_FILE, + SIDECAR_ENTRYPOINT_PID_FILE, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR, + SIDECAR_GATEWAY_FORWARD_ADDR, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::PROXY_TLS_DIR, + SIDECAR_TLS_MOUNT_PATH, + ); + env +} + +fn supervisor_sidecar_container( + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, +) -> serde_json::Value { + 
let mut container = serde_json::json!({ + "name": SUPERVISOR_NETWORK_SIDECAR_NAME, + "image": params.supervisor_image, + "command": [ + SUPERVISOR_IMAGE_BINARY_PATH, + "--mode=network", + ], + "env": supervisor_sidecar_env(template_environment, spec_environment, params), + "securityContext": { + "runAsUser": params.proxy_uid, + "runAsGroup": params.sandbox_gid, + "runAsNonRoot": true, + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + } + }, + "volumeMounts": [ + sidecar_state_volume_mount(), + sidecar_tls_volume_mount(), + { + "name": "openshell-sa-token", + "mountPath": "/var/run/secrets/openshell", + "readOnly": true + } + ] + }); + if !params.supervisor_image_pull_policy.is_empty() { + container["imagePullPolicy"] = serde_json::json!(params.supervisor_image_pull_policy); + } + if params.provider_spiffe_enabled { + container["volumeMounts"] + .as_array_mut() + .expect("volumeMounts is an array") + .push(serde_json::json!({ + "name": SPIFFE_WORKLOAD_API_VOLUME_NAME, + "mountPath": spiffe_socket_mount_path(params.provider_spiffe_workload_api_socket_path), + "readOnly": true, + })); + } + if let Some(profile) = params.app_armor_profile { + container["securityContext"]["appArmorProfile"] = app_armor_profile_to_k8s(profile); + } + container +} + +fn supervisor_network_init_container(params: &SandboxPodParams<'_>) -> serde_json::Value { + let mut container = serde_json::json!({ + "name": SUPERVISOR_NETWORK_INIT_CONTAINER_NAME, + "image": params.supervisor_image, + "command": [ + SUPERVISOR_IMAGE_BINARY_PATH, + "--mode=network-init", + "--proxy-uid", + params.proxy_uid.to_string(), + "--proxy-gid", + params.sandbox_gid.to_string(), + "--sidecar-state-dir", + SIDECAR_STATE_MOUNT_PATH, + "--sidecar-tls-dir", + SIDECAR_TLS_MOUNT_PATH, + ], + "securityContext": { + "runAsUser": 0, + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"], + "add": ["NET_ADMIN", "NET_RAW", "CHOWN", "FOWNER"] + } + }, + "volumeMounts": [ + 
sidecar_state_volume_mount(), + sidecar_tls_volume_mount(), + ] + }); + if !params.supervisor_image_pull_policy.is_empty() { + container["imagePullPolicy"] = serde_json::json!(params.supervisor_image_pull_policy); + } + if !params.client_tls_secret_name.is_empty() { + container["volumeMounts"] + .as_array_mut() + .expect("volumeMounts is an array") + .push(serde_json::json!({ + "name": "openshell-client-tls", + "mountPath": "/etc/openshell-tls/client", + "readOnly": true + })); + } + if let Some(profile) = params.app_armor_profile { + container["securityContext"]["appArmorProfile"] = app_armor_profile_to_k8s(profile); + } + container +} + +fn apply_supervisor_sidecar_topology( + pod_template: &mut serde_json::Value, + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, +) { + let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { + return; + }; + + let pod_security_context = spec + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = pod_security_context.as_object_mut() { + sc.insert("fsGroup".to_string(), serde_json::json!(params.sandbox_gid)); + } + + apply_supervisor_binary_source( + spec, + params.supervisor_image, + params.supervisor_image_pull_policy, + params.supervisor_sideload_method, + ); + + let volumes = spec + .entry("volumes") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volumes) = volumes { + volumes.push(serde_json::json!({ + "name": SIDECAR_STATE_VOLUME_NAME, + "emptyDir": {} + })); + volumes.push(serde_json::json!({ + "name": SIDECAR_TLS_VOLUME_NAME, + "emptyDir": {} + })); + } + + let init_containers = spec + .entry("initContainers") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(init_containers) = init_containers { + init_containers.push(supervisor_network_init_container(params)); + } + + let Some(containers) = 
spec.get_mut("containers").and_then(|v| v.as_array_mut()) else { + return; + }; + + let target_index = containers + .iter() + .position(|c| c.get("name").and_then(|v| v.as_str()) == Some("agent")) + .unwrap_or(0); + + if let Some(container) = containers + .get_mut(target_index) + .and_then(|v| v.as_object_mut()) + { + container.insert( + "command".to_string(), + serde_json::json!([ + format!("{}/openshell-sandbox", SUPERVISOR_MOUNT_PATH), + "--mode=process" + ]), + ); + + let security_context = container + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = security_context.as_object_mut() { + match params.process_enforcement { + ProcessEnforcementMode::NetworkOnly => { + sc.insert( + "runAsUser".to_string(), + serde_json::json!(params.sandbox_uid), + ); + sc.insert( + "runAsGroup".to_string(), + serde_json::json!(params.sandbox_gid), + ); + sc.insert("runAsNonRoot".to_string(), serde_json::json!(true)); + sc.insert( + "allowPrivilegeEscalation".to_string(), + serde_json::json!(false), + ); + sc.insert( + "capabilities".to_string(), + serde_json::json!({ + "drop": ["ALL"] + }), + ); + } + ProcessEnforcementMode::Full => { + sc.insert("runAsUser".to_string(), serde_json::json!(0)); + sc.remove("runAsGroup"); + sc.remove("runAsNonRoot"); + sc.remove("allowPrivilegeEscalation"); + sc.entry("capabilities".to_string()).or_insert_with(|| { + serde_json::json!({ + "add": ["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"] + }) + }); + } + } + } + + let volume_mounts = container + .entry("volumeMounts") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volume_mounts) = volume_mounts { + volume_mounts.push(supervisor_volume_mount()); + volume_mounts.push(sidecar_state_volume_mount()); + volume_mounts.push(sidecar_tls_volume_mount()); + } + + let env = container + .entry("env") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(env) = env { + let process_endpoint = 
sidecar_process_gateway_endpoint(params.grpc_endpoint); + upsert_env( + env, + openshell_core::sandbox_env::ENDPOINT, + &process_endpoint, + ); + if let Some(server_name) = gateway_tls_server_name(params.grpc_endpoint) { + upsert_env( + env, + openshell_core::sandbox_env::GATEWAY_TLS_SERVER_NAME, + &server_name, + ); + } + upsert_env( + env, + openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY, + "sidecar", + ); + upsert_env( + env, + openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE, + "sidecar-nftables", + ); + upsert_env( + env, + openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE, + ¶ms.process_enforcement.to_string(), + ); + upsert_env( + env, + openshell_core::sandbox_env::SSH_SOCKET_PATH, + SIDECAR_SSH_SOCKET_FILE, + ); + upsert_env( + env, + openshell_core::sandbox_env::SUPERVISOR_READY_FILE, + SIDECAR_READY_FILE, + ); + upsert_env( + env, + openshell_core::sandbox_env::ENTRYPOINT_PID_FILE, + SIDECAR_ENTRYPOINT_PID_FILE, + ); + upsert_env( + env, + openshell_core::sandbox_env::PROXY_TLS_DIR, + SIDECAR_TLS_MOUNT_PATH, + ); + upsert_env( + env, + openshell_core::sandbox_env::SANDBOX_UID, + ¶ms.sandbox_uid.to_string(), + ); + upsert_env( + env, + openshell_core::sandbox_env::SANDBOX_GID, + ¶ms.sandbox_gid.to_string(), + ); + } + } + + containers.push(supervisor_sidecar_container( + template_environment, + spec_environment, + params, + )); +} + /// Apply workspace persistence transforms to an already-built pod template. /// /// This injects: @@ -1242,6 +1697,7 @@ fn apply_supervisor_sideload( /// The init container mounts the PVC at a temporary path so it can still see /// the image's `/sandbox` directory. It checks for a sentinel file and skips /// the copy if the PVC was already initialised. 
+#[allow(clippy::similar_names)] fn apply_workspace_persistence( pod_template: &mut serde_json::Value, image: &str, @@ -1301,6 +1757,10 @@ fn apply_workspace_persistence( // self-referential symlinks under `/sandbox/.uv`, and GNU cp can // fail while seeding the PVC even though preserving the symlink as-is // is valid. `tar` copies the tree without dereferencing those links. + // Archive only the contents, not the `/sandbox` directory entry + // itself, so extraction never tries to chmod the PVC mount root. + // Extract without restoring owner, mode, or timestamps so the + // non-root init container can seed kubelet-owned PVCs. // // The inner `[ -d ... ]` guard handles custom images that don't have // a /sandbox directory — the copy is skipped but the sentinel is @@ -1308,7 +1768,12 @@ fn apply_workspace_persistence( let copy_cmd = format!( "if [ ! -f {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL} ]; then \ if [ -d {WORKSPACE_MOUNT_PATH} ]; then \ - tar -C {WORKSPACE_MOUNT_PATH} -cf - . | tar -C {WORKSPACE_INIT_MOUNT_PATH} -xpf -; \ + tmp=$(mktemp) && rm -f \"$tmp\" && \ + (cd {WORKSPACE_MOUNT_PATH} && find . 
-mindepth 1 -maxdepth 1 -exec tar -cf \"$tmp\" {{}} +) && \ + if [ -f \"$tmp\" ]; then \ + tar -C {WORKSPACE_INIT_MOUNT_PATH} --no-same-owner --no-same-permissions --touch -xf \"$tmp\" && \ + rm -f \"$tmp\"; \ + fi; \ fi && \ touch {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL}; \ fi" @@ -1366,6 +1831,9 @@ struct SandboxPodParams<'a> { supervisor_image: &'a str, supervisor_image_pull_policy: &'a str, supervisor_sideload_method: SupervisorSideloadMethod, + supervisor_topology: SupervisorTopology, + process_enforcement: ProcessEnforcementMode, + proxy_uid: u32, service_account_name: &'a str, sandbox_id: &'a str, sandbox_name: &'a str, @@ -1397,6 +1865,9 @@ impl Default for SandboxPodParams<'_> { supervisor_image: "", supervisor_image_pull_policy: "", supervisor_sideload_method: SupervisorSideloadMethod::default(), + supervisor_topology: SupervisorTopology::default(), + process_enforcement: ProcessEnforcementMode::default(), + proxy_uid: DEFAULT_PROXY_UID, service_account_name: DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, sandbox_id: "", sandbox_name: "", @@ -1417,6 +1888,20 @@ impl Default for SandboxPodParams<'_> { } } +fn validate_sidecar_proxy_identity( + params: &SandboxPodParams<'_>, +) -> Result<(), KubernetesDriverError> { + if params.supervisor_topology == SupervisorTopology::Sidecar + && params.proxy_uid == params.sandbox_uid + { + return Err(KubernetesDriverError::Precondition(format!( + "proxy_uid ({}) must not match sandbox_uid ({}) in sidecar topology", + params.proxy_uid, params.sandbox_uid + ))); + } + Ok(()) +} + fn spec_pod_env(spec: Option<&SandboxSpec>) -> std::collections::HashMap { let mut env = spec.map_or_else(Default::default, |s| s.environment.clone()); if let Some(s) = spec.filter(|s| !s.log_level.is_empty()) { @@ -1707,13 +2192,22 @@ fn sandbox_template_to_k8s_with_gpu_requirements( serde_json::Value::Array(vec![serde_json::Value::Object(container)]), ); - // Add TLS secret volume. 
Mode 0400 (owner-read) prevents the - // unprivileged sandbox user from reading the mTLS private key. + // Add TLS secret volume. Combined mode uses mode 0400 because the + // supervisor starts as root and drops privileges before running workload + // children. Sidecar mode keeps the process supervisor non-root, so it uses + // pod fsGroup + 0440 to preserve gateway session and SSH control behavior. let mut volumes: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { + let client_tls_default_mode = match params.supervisor_topology { + SupervisorTopology::Combined => 0o400, + SupervisorTopology::Sidecar => 0o440, + }; volumes.push(serde_json::json!({ "name": "openshell-client-tls", - "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } + "secret": { + "secretName": params.client_tls_secret_name, + "defaultMode": client_tls_default_mode + } })); } if params.provider_spiffe_enabled { @@ -1728,7 +2222,12 @@ fn sandbox_template_to_k8s_with_gpu_requirements( // Projected ServiceAccountToken volume — kubelet writes a short-lived // audience-bound JWT into /var/run/secrets/openshell/token and rotates // it automatically. The supervisor exchanges this for a gateway-minted - // JWT via `IssueSandboxToken` once at startup. + // JWT via `IssueSandboxToken` once at startup. In sidecar topology both + // supervisor containers run with the sandbox GID and need group-read access. 
+ let sa_token_default_mode = match params.supervisor_topology { + SupervisorTopology::Combined => 0o400, + SupervisorTopology::Sidecar => 0o440, + }; volumes.push(serde_json::json!({ "name": "openshell-sa-token", "projected": { @@ -1739,7 +2238,7 @@ fn sandbox_template_to_k8s_with_gpu_requirements( "path": "token" } }], - "defaultMode": 256 + "defaultMode": sa_token_default_mode } })); spec.insert("volumes".to_string(), serde_json::Value::Array(volumes)); @@ -1763,14 +2262,26 @@ fn sandbox_template_to_k8s_with_gpu_requirements( let mut result = serde_json::Value::Object(template_value); - apply_supervisor_sideload( - &mut result, - params.supervisor_image, - params.supervisor_image_pull_policy, - params.supervisor_sideload_method, - params.sandbox_uid, - params.sandbox_gid, - ); + match params.supervisor_topology { + SupervisorTopology::Combined => { + apply_supervisor_sideload( + &mut result, + params.supervisor_image, + params.supervisor_image_pull_policy, + params.supervisor_sideload_method, + params.sandbox_uid, + params.sandbox_gid, + ); + } + SupervisorTopology::Sidecar => { + apply_supervisor_sidecar_topology( + &mut result, + &template.environment, + spec_environment, + params, + ); + } + } // Inject workspace persistence (init container + PVC volume mount) so // that /sandbox data survives pod rescheduling. Skipped when the user @@ -2262,6 +2773,15 @@ mod tests { assert!(!should_try_next_sandbox_api_version(&err)); } + fn rendered_env<'a>(container: &'a serde_json::Value, name: &str) -> Option<&'a str> { + container["env"] + .as_array()? + .iter() + .find(|item| item.get("name").and_then(|value| value.as_str()) == Some(name))? + .get("value")? 
+ .as_str() + } + #[test] fn driver_config_rejects_invalid_shape() { let template = SandboxTemplate { @@ -2601,6 +3121,312 @@ mod tests { ); } + #[test] + fn sidecar_topology_renders_process_agent_and_network_sidecar() { + let params = SandboxPodParams { + supervisor_topology: SupervisorTopology::Sidecar, + supervisor_sideload_method: SupervisorSideloadMethod::InitContainer, + supervisor_image: "supervisor-image:latest", + supervisor_image_pull_policy: "IfNotPresent", + grpc_endpoint: "https://openshell-gateway.openshell.svc:8080", + client_tls_secret_name: "openshell-client-tls", + proxy_uid: 2200, + sandbox_uid: 1500, + sandbox_gid: 1500, + ..SandboxPodParams::default() + }; + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate { + image: "agent-image:latest".to_string(), + ..SandboxTemplate::default() + }, + false, + &std::collections::HashMap::new(), + false, + ¶ms, + ); + + assert!( + pod_template["spec"]["shareProcessNamespace"].is_null(), + "sidecar mode no longer needs a shared process namespace when binary identity is relaxed" + ); + assert_eq!(pod_template["spec"]["securityContext"]["fsGroup"], 1500); + let containers = pod_template["spec"]["containers"].as_array().unwrap(); + assert_eq!(containers.len(), 2); + + let agent = containers + .iter() + .find(|container| container["name"] == "agent") + .unwrap(); + assert_eq!( + agent["command"], + serde_json::json!([ + format!("{SUPERVISOR_MOUNT_PATH}/openshell-sandbox"), + "--mode=process" + ]) + ); + assert_eq!(agent["securityContext"]["runAsUser"], 1500); + assert_eq!(agent["securityContext"]["runAsGroup"], 1500); + assert_eq!(agent["securityContext"]["runAsNonRoot"], true); + assert_eq!( + agent["securityContext"]["capabilities"], + serde_json::json!({ + "drop": ["ALL"] + }) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::ENDPOINT), + Some("https://127.0.0.1:18080") + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::GATEWAY_TLS_SERVER_NAME), + 
Some("openshell-gateway.openshell.svc") + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE), + Some("network-only") + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::SSH_SOCKET_PATH), + Some(SIDECAR_SSH_SOCKET_FILE) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::SUPERVISOR_READY_FILE), + Some(SIDECAR_READY_FILE) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::ENTRYPOINT_PID_FILE), + Some(SIDECAR_ENTRYPOINT_PID_FILE) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::PROXY_TLS_DIR), + Some(SIDECAR_TLS_MOUNT_PATH) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::SANDBOX_UID), + Some("1500") + ); + + let sidecar = containers + .iter() + .find(|container| container["name"] == SUPERVISOR_NETWORK_SIDECAR_NAME) + .unwrap(); + assert_eq!(sidecar["image"], "supervisor-image:latest"); + assert_eq!(sidecar["imagePullPolicy"], "IfNotPresent"); + assert_eq!( + sidecar["command"], + serde_json::json!([SUPERVISOR_IMAGE_BINARY_PATH, "--mode=network"]) + ); + assert_eq!(sidecar["securityContext"]["runAsUser"], 2200); + assert_eq!(sidecar["securityContext"]["runAsGroup"], 1500); + assert_eq!(sidecar["securityContext"]["runAsNonRoot"], true); + assert_eq!( + sidecar["securityContext"]["capabilities"], + serde_json::json!({ + "drop": ["ALL"] + }) + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::ENDPOINT), + Some("https://openshell-gateway.openshell.svc:8080") + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR), + Some(SIDECAR_GATEWAY_FORWARD_ADDR) + ); + assert_eq!( + rendered_env( + sidecar, + openshell_core::sandbox_env::NETWORK_BINARY_IDENTITY + ), + Some("relaxed") + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::ENTRYPOINT_PID_FILE), + Some(SIDECAR_ENTRYPOINT_PID_FILE) + ); + assert_eq!( + rendered_env(sidecar, 
openshell_core::sandbox_env::PROXY_TLS_DIR), + Some(SIDECAR_TLS_MOUNT_PATH) + ); + assert_eq!( + rendered_env(sidecar, openshell_core::sandbox_env::TLS_CA), + Some("/etc/openshell-tls/proxy/client/ca.crt") + ); + let sidecar_mounts = sidecar["volumeMounts"].as_array().unwrap(); + assert!( + !sidecar_mounts + .iter() + .any(|mount| mount["name"] == "openshell-client-tls"), + "runtime sidecar should use the init-copied TLS files, not the root-owned Secret mount" + ); + let volumes = pod_template["spec"]["volumes"].as_array().unwrap(); + let sa_token = volumes + .iter() + .find(|volume| volume["name"] == "openshell-sa-token") + .unwrap(); + assert_eq!(sa_token["projected"]["defaultMode"], 0o440); + let client_tls = volumes + .iter() + .find(|volume| volume["name"] == "openshell-client-tls") + .unwrap(); + assert_eq!(client_tls["secret"]["defaultMode"], 0o440); + + let init_containers = pod_template["spec"]["initContainers"].as_array().unwrap(); + let network_init = init_containers + .iter() + .find(|container| container["name"] == SUPERVISOR_NETWORK_INIT_CONTAINER_NAME) + .unwrap(); + assert_eq!(network_init["image"], "supervisor-image:latest"); + assert_eq!(network_init["imagePullPolicy"], "IfNotPresent"); + assert_eq!( + network_init["command"], + serde_json::json!([ + SUPERVISOR_IMAGE_BINARY_PATH, + "--mode=network-init", + "--proxy-uid", + "2200", + "--proxy-gid", + "1500", + "--sidecar-state-dir", + SIDECAR_STATE_MOUNT_PATH, + "--sidecar-tls-dir", + SIDECAR_TLS_MOUNT_PATH + ]) + ); + assert_eq!( + network_init["securityContext"]["capabilities"], + serde_json::json!({ + "drop": ["ALL"], + "add": ["NET_ADMIN", "NET_RAW", "CHOWN", "FOWNER"] + }) + ); + let network_init_mounts = network_init["volumeMounts"].as_array().unwrap(); + assert!(network_init_mounts.iter().any(|mount| { + mount["name"] == "openshell-client-tls" + && mount["mountPath"] == "/etc/openshell-tls/client" + })); + } + + #[test] + fn sidecar_topology_adds_shared_state_and_tls_volumes() { + let params 
= SandboxPodParams { + supervisor_topology: SupervisorTopology::Sidecar, + supervisor_sideload_method: SupervisorSideloadMethod::ImageVolume, + supervisor_image: "supervisor-image:latest", + grpc_endpoint: "http://openshell-gateway.openshell.svc:8080", + ..SandboxPodParams::default() + }; + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate::default(), + false, + &std::collections::HashMap::new(), + false, + ¶ms, + ); + + let volumes = pod_template["spec"]["volumes"].as_array().unwrap(); + assert!( + volumes + .iter() + .any(|volume| volume["name"] == SIDECAR_STATE_VOLUME_NAME) + ); + assert!( + volumes + .iter() + .any(|volume| volume["name"] == SIDECAR_TLS_VOLUME_NAME) + ); + assert!(volumes.iter().any(|volume| { + volume["name"] == SUPERVISOR_VOLUME_NAME && volume["image"].is_object() + })); + + let containers = pod_template["spec"]["containers"].as_array().unwrap(); + for container_name in ["agent", SUPERVISOR_NETWORK_SIDECAR_NAME] { + let container = containers + .iter() + .find(|container| container["name"] == container_name) + .unwrap(); + let mounts = container["volumeMounts"].as_array().unwrap(); + assert!(mounts.iter().any(|mount| { + mount["name"] == SIDECAR_STATE_VOLUME_NAME + && mount["mountPath"] == SIDECAR_STATE_MOUNT_PATH + })); + assert!(mounts.iter().any(|mount| { + mount["name"] == SIDECAR_TLS_VOLUME_NAME + && mount["mountPath"] == SIDECAR_TLS_MOUNT_PATH + })); + } + } + + #[test] + fn sidecar_topology_full_process_enforcement_keeps_combined_agent_permissions() { + let params = SandboxPodParams { + supervisor_topology: SupervisorTopology::Sidecar, + process_enforcement: ProcessEnforcementMode::Full, + supervisor_sideload_method: SupervisorSideloadMethod::InitContainer, + supervisor_image: "supervisor-image:latest", + grpc_endpoint: "https://openshell-gateway.openshell.svc:8080", + sandbox_uid: 1500, + sandbox_gid: 1500, + proxy_uid: 2200, + ..SandboxPodParams::default() + }; + let pod_template = sandbox_template_to_k8s( + 
&SandboxTemplate::default(), + false, + &std::collections::HashMap::new(), + false, + ¶ms, + ); + + let containers = pod_template["spec"]["containers"].as_array().unwrap(); + let agent = containers + .iter() + .find(|container| container["name"] == "agent") + .unwrap(); + let sc = &agent["securityContext"]; + assert_eq!(sc["runAsUser"], 0); + assert!(sc.get("runAsGroup").is_none()); + assert!(sc.get("runAsNonRoot").is_none()); + assert!(sc.get("allowPrivilegeEscalation").is_none()); + assert_eq!( + sc["capabilities"], + serde_json::json!({ + "add": ["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"] + }) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE), + Some("full") + ); + + let sidecar = containers + .iter() + .find(|container| container["name"] == SUPERVISOR_NETWORK_SIDECAR_NAME) + .unwrap(); + assert_eq!(sidecar["securityContext"]["runAsUser"], 2200); + assert_eq!( + sidecar["securityContext"]["capabilities"], + serde_json::json!({ + "drop": ["ALL"] + }) + ); + } + + #[test] + fn sidecar_topology_rejects_proxy_uid_matching_sandbox_uid() { + let params = SandboxPodParams { + supervisor_topology: SupervisorTopology::Sidecar, + proxy_uid: 1500, + sandbox_uid: 1500, + ..SandboxPodParams::default() + }; + + let err = validate_sidecar_proxy_identity(¶ms).unwrap_err(); + assert!(matches!(err, KubernetesDriverError::Precondition(_))); + assert!(err.to_string().contains("proxy_uid")); + } + /// Regression test: TLS mount path must match env var paths. /// The volume is mounted at a specific path and the env vars must point to /// files within that same path, otherwise the sandbox will fail to start @@ -3179,6 +4005,16 @@ mod tests { script.contains("tar -C"), "init script must seed image contents with a tar stream" ); + assert!( + script.contains("find . 
-mindepth 1 -maxdepth 1"), + "init script must archive sandbox contents without the mount root entry" + ); + assert!( + script.contains("--no-same-owner") + && script.contains("--no-same-permissions") + && script.contains("--touch"), + "init script must avoid restoring metadata onto the PVC root" + ); } #[test] diff --git a/crates/openshell-driver-kubernetes/src/lib.rs b/crates/openshell-driver-kubernetes/src/lib.rs index 953ed4abd..114055bcc 100644 --- a/crates/openshell-driver-kubernetes/src/lib.rs +++ b/crates/openshell-driver-kubernetes/src/lib.rs @@ -6,8 +6,9 @@ pub mod driver; pub mod grpc; pub use config::{ - AppArmorProfile, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_WORKSPACE_STORAGE_SIZE, - KubernetesComputeConfig, SupervisorSideloadMethod, SupervisorTopology, + AppArmorProfile, DEFAULT_PROXY_UID, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, + DEFAULT_WORKSPACE_STORAGE_SIZE, KubernetesComputeConfig, ProcessEnforcementMode, + SupervisorSideloadMethod, SupervisorTopology, }; pub use driver::{KubernetesComputeDriver, KubernetesDriverError}; pub use grpc::ComputeDriverService; diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index 77f671dcb..34ec1b55d 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -10,8 +10,9 @@ use tracing_subscriber::EnvFilter; use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_kubernetes::{ - AppArmorProfile, ComputeDriverService, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, - KubernetesComputeConfig, KubernetesComputeDriver, SupervisorSideloadMethod, SupervisorTopology, + AppArmorProfile, ComputeDriverService, DEFAULT_PROXY_UID, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, + KubernetesComputeConfig, KubernetesComputeDriver, ProcessEnforcementMode, + SupervisorSideloadMethod, SupervisorTopology, }; #[derive(Parser, Debug)] @@ -87,6 +88,16 @@ struct 
Args { )] supervisor_topology: SupervisorTopology, + #[arg( + long, + env = "OPENSHELL_PROCESS_ENFORCEMENT", + default_value = "network-only" + )] + process_enforcement: ProcessEnforcementMode, + + #[arg(long, env = "OPENSHELL_PROXY_UID", default_value_t = DEFAULT_PROXY_UID)] + proxy_uid: u32, + #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] enable_user_namespaces: bool, @@ -131,6 +142,8 @@ async fn main() -> Result<()> { supervisor_image_pull_policy: args.supervisor_image_pull_policy.unwrap_or_default(), supervisor_sideload_method: args.supervisor_sideload_method, supervisor_topology: args.supervisor_topology, + process_enforcement: args.process_enforcement, + proxy_uid: args.proxy_uid, grpc_endpoint: args.grpc_endpoint.unwrap_or_default(), ssh_socket_path: args.sandbox_ssh_socket_path, client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(), diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index c0c84132b..8ff778de2 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -128,8 +128,8 @@ sequenceDiagram C->>C: entrypoint: /opt/openshell/bin/openshell-sandbox ``` -The supervisor image from `deploy/docker/Dockerfile.supervisor` copies the static -`openshell-sandbox` binary to `/openshell-sandbox`. +The supervisor image from `deploy/docker/Dockerfile.supervisor` provides the +static `openshell-sandbox` binary at `/openshell-sandbox`. Mounting that image at `/opt/openshell/bin` makes the binary available as `/opt/openshell/bin/openshell-sandbox`. diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 7c9ab269f..87f0abf78 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -857,9 +857,8 @@ pub fn build_container_spec_with_token_and_gpu_devices( // Side-load the supervisor binary from a standalone OCI image. 
// Podman resolves image_volumes at the libpod layer, mounting the // image's filesystem at the destination path without starting a - // container from it. The supervisor image is FROM scratch with just - // the binary at /openshell-sandbox, so it appears at - // /opt/openshell/bin/openshell-sandbox. + // container from it. The supervisor image exposes the binary at + // /openshell-sandbox, so it appears at /opt/openshell/bin/openshell-sandbox. image_volumes, hostname: format!("sandbox-{}", sandbox.name), // Override the image's ENTRYPOINT so the supervisor binary runs diff --git a/crates/openshell-sandbox/Cargo.toml b/crates/openshell-sandbox/Cargo.toml index 086dbe02c..a5d344910 100644 --- a/crates/openshell-sandbox/Cargo.toml +++ b/crates/openshell-sandbox/Cargo.toml @@ -33,6 +33,9 @@ clap = { workspace = true } # Error handling miette = { workspace = true } +# Unix ownership for Kubernetes sidecar init setup +nix = { workspace = true } + # TLS crypto provider install (main.rs) rustls = { workspace = true } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 53b1eba58..3ff260c7c 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -13,10 +13,10 @@ mod mechanistic_mapper; #[cfg_attr(not(target_os = "linux"), allow(dead_code))] mod metadata_server; -use miette::Result; +use miette::{IntoDiagnostic, Result}; use std::future::Future; use std::sync::Arc; -use std::sync::atomic::AtomicU32; +use std::sync::atomic::{AtomicU32, Ordering}; use std::time::Duration; use tracing::{debug, info, warn}; @@ -64,12 +64,22 @@ use openshell_core::denial::DenialEvent; use openshell_core::policy::{NetworkMode, NetworkPolicy, ProxyPolicy, SandboxPolicy}; use openshell_core::provider_credentials::ProviderCredentialState; use openshell_supervisor_network::opa::OpaEngine; +use openshell_supervisor_process::process::ProcessEnforcementMode; pub use openshell_supervisor_process::process::{ProcessHandle, 
ProcessStatus}; use openshell_supervisor_process::skills; +use tokio::io::copy_bidirectional; +use tokio::net::{TcpListener, TcpStream}; use tokio::sync::mpsc::UnboundedSender; #[cfg(target_os = "linux")] use tokio::time::timeout; +const SIDECAR_NETWORK_ENFORCEMENT_MODE: &str = "sidecar-nftables"; +const SIDECAR_TLS_DIR: &str = "/etc/openshell-tls/proxy"; +const SIDECAR_CA_CERT: &str = "openshell-ca.pem"; +const SIDECAR_CA_BUNDLE: &str = "ca-bundle.pem"; +const SIDECAR_PROCESS_PROXY_ADDR: &str = "127.0.0.1:3128"; +const SIDECAR_READY_TIMEOUT_SECS: u64 = 120; + /// Run a command in the sandbox. /// /// # Errors @@ -125,6 +135,16 @@ pub async fn run_sandbox( } } + let sidecar_network_enforcement = sidecar_network_enforcement_enabled(); + let process_enforcement_mode = process_enforcement_mode(); + let sidecar_ready_file = supervisor_ready_file(); + if process_enabled + && !network_enabled + && let Some(path) = sidecar_ready_file.as_deref() + { + wait_for_supervisor_ready(path).await?; + } + // Load policy and initialize OPA engine let openshell_endpoint_for_proxy = openshell_endpoint.clone(); let sandbox_name_for_agg = sandbox.clone(); @@ -251,6 +271,12 @@ pub async fn run_sandbox( // Shared PID: set after process spawn so the proxy can look up // the entrypoint process's /proc/net/tcp for identity binding. let entrypoint_pid = Arc::new(AtomicU32::new(0)); + if network_enabled + && !process_enabled + && let Some(path) = entrypoint_pid_file() + { + spawn_entrypoint_pid_file_watcher(path, entrypoint_pid.clone()); + } // Create the workload's network namespace. It is shared infrastructure: // the proxy binds to its host-side veth IP, the bypass monitor reads @@ -258,7 +284,7 @@ pub async fn run_sandbox( // it via setns(). The RAII handle lives in this frame for the duration // of the sandbox. 
#[cfg(target_os = "linux")] - let netns = if network_enabled { + let netns = if network_enabled && !sidecar_network_enforcement { openshell_supervisor_process::netns::create_netns_for_proxy(&policy)? } else { None @@ -328,6 +354,34 @@ pub async fn run_sandbox( None }; + let _gateway_forward = if network_enabled && sidecar_network_enforcement { + let endpoint = openshell_endpoint_for_proxy.as_deref().ok_or_else(|| { + miette::miette!("sidecar network enforcement requires an OpenShell gateway endpoint") + })?; + Some(start_gateway_forward_from_env(endpoint).await?) + } else { + None + }; + + #[cfg(target_os = "linux")] + if network_enabled && sidecar_network_enforcement { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return Err(miette::miette!( + "sidecar network enforcement requires proxy network mode" + )); + } + if let Some(path) = sidecar_ready_file.as_deref() { + write_supervisor_ready(path)?; + } + } + + #[cfg(not(target_os = "linux"))] + if network_enabled && sidecar_network_enforcement { + return Err(miette::miette!( + "sidecar network enforcement is only supported on Linux" + )); + } + // Spawn the denial-aggregator flush task. The aggregator drains denial // events from the proxy + bypass monitor, batches them, and ships // summaries to the gateway via `SubmitPolicyAnalysis`. 
@@ -478,8 +532,17 @@ pub async fn run_sandbox( } } + let process_policy = process_policy_for_topology(&policy, sidecar_network_enforcement)?; + let exit_code = if process_enabled { - let ca_file_paths = networking.as_ref().and_then(|n| n.ca_file_paths.clone()); + let ca_file_paths = networking + .as_ref() + .and_then(|n| n.ca_file_paths.clone()) + .or_else(|| { + sidecar_network_enforcement + .then(sidecar_ca_file_paths) + .flatten() + }); openshell_supervisor_process::run::run_process( program, @@ -490,7 +553,8 @@ pub async fn run_sandbox( sandbox_id.as_deref(), openshell_endpoint.as_deref(), ssh_socket_path, - &policy, + &process_policy, + process_enforcement_mode, entrypoint_pid, provider_credentials, provider_env, @@ -551,6 +615,205 @@ async fn wait_for_shutdown_signal() { } } +fn sidecar_network_enforcement_enabled() -> bool { + std::env::var(openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE) + .is_ok_and(|value| value == SIDECAR_NETWORK_ENFORCEMENT_MODE) +} + +fn process_enforcement_mode() -> ProcessEnforcementMode { + match std::env::var(openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE) + .unwrap_or_else(|_| "full".to_string()) + .as_str() + { + "network-only" => ProcessEnforcementMode::NetworkOnly, + _ => ProcessEnforcementMode::Full, + } +} + +fn supervisor_ready_file() -> Option { + std::env::var(openshell_core::sandbox_env::SUPERVISOR_READY_FILE) + .ok() + .filter(|value| !value.is_empty()) +} + +fn entrypoint_pid_file() -> Option { + std::env::var(openshell_core::sandbox_env::ENTRYPOINT_PID_FILE) + .ok() + .filter(|value| !value.is_empty()) +} + +fn spawn_entrypoint_pid_file_watcher(path: String, entrypoint_pid: Arc) { + tokio::spawn(async move { + let pid_path = std::path::PathBuf::from(&path); + loop { + match std::fs::read_to_string(&pid_path) { + Ok(contents) => match contents.trim().parse::() { + Ok(pid) if pid > 0 => { + entrypoint_pid.store(pid, Ordering::Release); + info!(path, pid, "Loaded sidecar workload entrypoint PID"); + return; + 
} + Ok(_) | Err(_) => { + debug!(path, contents = %contents.trim(), "Ignoring invalid entrypoint PID file contents"); + } + }, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => { + debug!(path, error = %err, "Failed to read entrypoint PID file"); + } + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + }); +} + +async fn wait_for_supervisor_ready(path: &str) -> Result<()> { + let ready_path = std::path::Path::new(path); + let deadline = tokio::time::Instant::now() + Duration::from_secs(SIDECAR_READY_TIMEOUT_SECS); + loop { + if ready_path.exists() { + info!(path, "Network supervisor sidecar is ready"); + return Ok(()); + } + if tokio::time::Instant::now() >= deadline { + return Err(miette::miette!( + "timed out waiting for network supervisor sidecar readiness file {path}" + )); + } + tokio::time::sleep(Duration::from_millis(250)).await; + } +} + +#[cfg(target_os = "linux")] +fn write_supervisor_ready(path: &str) -> Result<()> { + let ready_path = std::path::Path::new(path); + if let Some(parent) = ready_path.parent() { + std::fs::create_dir_all(parent).into_diagnostic()?; + } + std::fs::write(ready_path, b"ready\n").into_diagnostic()?; + info!(path, "Network supervisor sidecar readiness file written"); + Ok(()) +} + +fn sidecar_ca_file_paths() -> Option<(std::path::PathBuf, std::path::PathBuf)> { + let tls_dir = std::env::var(openshell_core::sandbox_env::PROXY_TLS_DIR) + .unwrap_or_else(|_| SIDECAR_TLS_DIR.to_string()); + let cert = std::path::Path::new(&tls_dir).join(SIDECAR_CA_CERT); + let bundle = std::path::Path::new(&tls_dir).join(SIDECAR_CA_BUNDLE); + (cert.exists() && bundle.exists()).then_some((cert, bundle)) +} + +fn process_policy_for_topology( + policy: &SandboxPolicy, + sidecar_network_enforcement: bool, +) -> Result { + let mut process_policy = policy.clone(); + if sidecar_network_enforcement && matches!(process_policy.network.mode, NetworkMode::Proxy) { + let proxy = process_policy + .network + .proxy + 
.get_or_insert(ProxyPolicy { http_addr: None }); + if proxy.http_addr.is_none() { + proxy.http_addr = Some(SIDECAR_PROCESS_PROXY_ADDR.parse().into_diagnostic()?); + } + } + Ok(process_policy) +} + +struct GatewayForwardHandle { + task: tokio::task::JoinHandle<()>, +} + +impl Drop for GatewayForwardHandle { + fn drop(&mut self) { + self.task.abort(); + } +} + +async fn start_gateway_forward_from_env(endpoint: &str) -> Result { + let listen_addr = + std::env::var(openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR).map_err(|_| { + miette::miette!( + "{} is required for sidecar gateway forwarding", + openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR + ) + })?; + start_gateway_forward(&listen_addr, endpoint).await +} + +async fn start_gateway_forward(listen_addr: &str, endpoint: &str) -> Result { + let upstream = gateway_tcp_addr(endpoint)?; + let listener = TcpListener::bind(listen_addr).await.into_diagnostic()?; + info!( + listen_addr, + upstream, "Gateway loopback TCP forward started for sidecar topology" + ); + + let task = tokio::spawn(async move { + loop { + let (mut inbound, peer) = match listener.accept().await { + Ok(accepted) => accepted, + Err(e) => { + warn!(error = %e, "Gateway forward accept failed"); + continue; + } + }; + let upstream = upstream.clone(); + tokio::spawn(async move { + let mut outbound = match TcpStream::connect(&upstream).await { + Ok(stream) => stream, + Err(e) => { + warn!(peer = %peer, upstream, error = %e, "Gateway forward connect failed"); + return; + } + }; + if let Err(e) = copy_bidirectional(&mut inbound, &mut outbound).await { + debug!(peer = %peer, error = %e, "Gateway forward connection closed with error"); + } + }); + } + }); + + Ok(GatewayForwardHandle { task }) +} + +fn gateway_tcp_addr(endpoint: &str) -> Result { + let (scheme, rest) = endpoint + .split_once("://") + .ok_or_else(|| miette::miette!("gateway endpoint must include a URL scheme"))?; + let default_port = match scheme { + "http" => 80, + "https" => 443, + other => 
{ + return Err(miette::miette!( + "unsupported gateway endpoint scheme '{other}' for sidecar forwarding" + )); + } + }; + let authority = rest.split('/').next().unwrap_or(rest); + if authority.is_empty() { + return Err(miette::miette!("gateway endpoint is missing a host")); + } + if authority.starts_with('[') { + let closing = authority + .find(']') + .ok_or_else(|| miette::miette!("invalid bracketed IPv6 gateway endpoint"))?; + let host = &authority[..=closing]; + let port = authority[closing + 1..] + .strip_prefix(':') + .and_then(|value| value.parse::().ok()) + .unwrap_or(default_port); + return Ok(format!("{host}:{port}")); + } + let (host, port) = match authority.rsplit_once(':') { + Some((host, port)) if !host.is_empty() => { + (host, port.parse::().unwrap_or(default_port)) + } + _ => (authority, default_port), + }; + Ok(format!("{host}:{port}")) +} + /// Flush aggregated denial summaries to the gateway via `SubmitPolicyAnalysis`. async fn flush_proposals_to_gateway( endpoint: &str, @@ -1960,8 +2223,24 @@ fn format_setting_value(es: &openshell_core::proto::EffectiveSetting) -> String )] mod tests { use super::*; + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkMode, NetworkPolicy, ProcessPolicy, ProxyPolicy, + }; use std::sync::atomic::{AtomicBool, Ordering}; + fn proxy_policy(http_addr: Option) -> SandboxPolicy { + SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy { + mode: NetworkMode::Proxy, + proxy: Some(ProxyPolicy { http_addr }), + }, + landlock: LandlockPolicy::default(), + process: ProcessPolicy::default(), + } + } + fn effective_bool(value: bool) -> openshell_core::proto::EffectiveSetting { openshell_core::proto::EffectiveSetting { value: Some(openshell_core::proto::SettingValue { @@ -1973,6 +2252,73 @@ mod tests { } } + #[test] + fn sidecar_process_policy_sets_loopback_proxy_addr() { + let policy = proxy_policy(None); + + let process_policy = 
process_policy_for_topology(&policy, true).unwrap(); + + let http_addr = process_policy + .network + .proxy + .and_then(|proxy| proxy.http_addr) + .expect("sidecar process policy should set proxy address"); + assert_eq!(http_addr.to_string(), SIDECAR_PROCESS_PROXY_ADDR); + assert!( + policy + .network + .proxy + .as_ref() + .expect("original policy should keep proxy config") + .http_addr + .is_none(), + "process policy normalization must not mutate the network policy" + ); + } + + #[test] + fn non_sidecar_process_policy_preserves_proxy_addr() { + let policy = proxy_policy(None); + + let process_policy = process_policy_for_topology(&policy, false).unwrap(); + + assert!( + process_policy + .network + .proxy + .and_then(|proxy| proxy.http_addr) + .is_none() + ); + } + + #[test] + fn gateway_tcp_addr_uses_explicit_port() { + assert_eq!( + gateway_tcp_addr("https://openshell-gateway.openshell.svc:8080").unwrap(), + "openshell-gateway.openshell.svc:8080" + ); + } + + #[test] + fn gateway_tcp_addr_uses_scheme_default_port() { + assert_eq!( + gateway_tcp_addr("https://openshell-gateway.openshell.svc").unwrap(), + "openshell-gateway.openshell.svc:443" + ); + assert_eq!( + gateway_tcp_addr("http://openshell-gateway.openshell.svc").unwrap(), + "openshell-gateway.openshell.svc:80" + ); + } + + #[test] + fn gateway_tcp_addr_preserves_ipv6_brackets() { + assert_eq!( + gateway_tcp_addr("https://[fd00::1]:8443").unwrap(), + "[fd00::1]:8443" + ); + } + #[test] fn apply_ocsf_json_setting_enables_from_initial_settings_snapshot() { let enabled = AtomicBool::new(false); diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 91b145c2e..b5481f675 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -35,15 +35,26 @@ const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; /// Default `--mode` value: run both supervisor leaves in a single binary. 
const DEFAULT_MODE: &str = "network,process"; +const SIDECAR_STATE_DIR: &str = "/run/openshell-sidecar"; +const SIDECAR_TLS_DIR: &str = "/etc/openshell-tls/proxy"; +#[cfg(target_os = "linux")] +const CLIENT_TLS_DIR: &str = "/etc/openshell-tls/client"; +#[cfg(target_os = "linux")] +const SIDECAR_CLIENT_TLS_SUBDIR: &str = "client"; +#[cfg(target_os = "linux")] +const CLIENT_TLS_FILES: [&str; 3] = ["ca.crt", "tls.crt", "tls.key"]; /// Which supervisor leaves are enabled in this process. /// /// Parsed from a comma-separated `--mode` value, e.g. `network`, -/// `process`, or `network,process`. At least one must be set. +/// `process`, or `network,process`. `network-init` is a one-shot setup mode +/// used by the Kubernetes sidecar topology and cannot be combined with other +/// mode components. At least one must be set. #[derive(Clone, Copy, Debug)] struct Mode { network: bool, process: bool, + network_init: bool, } impl std::str::FromStr for Mode { @@ -53,20 +64,27 @@ impl std::str::FromStr for Mode { let mut mode = Self { network: false, process: false, + network_init: false, }; for part in s.split(',').map(str::trim).filter(|p| !p.is_empty()) { match part { "network" => mode.network = true, "process" => mode.process = true, + "network-init" => mode.network_init = true, other => { return Err(format!( - "unknown mode component '{other}' (expected 'network' and/or 'process')" + "unknown mode component '{other}' (expected 'network', 'process', or 'network-init')" )); } } } - if !mode.network && !mode.process { - return Err("--mode must enable at least one of: network, process".into()); + if mode.network_init && (mode.network || mode.process) { + return Err("--mode=network-init cannot be combined with other components".into()); + } + if !mode.network && !mode.process && !mode.network_init { + return Err( + "--mode must enable at least one of: network, process, network-init".into(), + ); } Ok(mode) } @@ -149,9 +167,28 @@ struct Args { /// "network" and/or "process". 
Defaults to both (single-binary /// topology). Use --mode=network for a network-only sidecar, or /// --mode=process for a process-only supervisor when network - /// enforcement runs in another pod. + /// enforcement runs in another pod. Use --mode=network-init only in + /// the Kubernetes init container that prepares sidecar nftables. #[arg(long, default_value = DEFAULT_MODE)] mode: Mode, + + /// UID that the long-running Kubernetes network sidecar will run as. + /// `--mode=network-init` installs nftables rules that exempt this UID. + #[arg(long, env = "OPENSHELL_PROXY_UID", default_value_t = 1337)] + proxy_uid: u32, + + /// GID assigned to shared sidecar state directories. Defaults to + /// `--proxy-uid` when omitted. + #[arg(long, env = "OPENSHELL_PROXY_GID")] + proxy_gid: Option, + + /// Shared state directory between the network init container and sidecar. + #[arg(long, env = "OPENSHELL_SIDECAR_STATE_DIR", default_value = SIDECAR_STATE_DIR)] + sidecar_state_dir: String, + + /// Shared TLS work directory between the network init container and sidecar. 
+ #[arg(long, env = "OPENSHELL_PROXY_TLS_DIR", default_value = SIDECAR_TLS_DIR)] + sidecar_tls_dir: String, } /// Copy the running executable to `dest`, creating parent directories as @@ -194,6 +231,141 @@ fn copy_self(dest: &str) -> Result<()> { Ok(()) } +#[cfg(target_os = "linux")] +fn prepare_sidecar_directory(path: &Path, uid: u32, gid: u32, mode: u32) -> Result<()> { + use miette::Context as _; + use nix::unistd::{Gid, Uid, chown}; + use std::os::unix::fs::PermissionsExt; + + std::fs::create_dir_all(path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to create sidecar directory {}", path.display()))?; + let mut perms = std::fs::metadata(path).into_diagnostic()?.permissions(); + perms.set_mode(mode); + std::fs::set_permissions(path, perms) + .into_diagnostic() + .wrap_err_with(|| format!("failed to chmod sidecar directory {}", path.display()))?; + chown(path, Some(Uid::from_raw(uid)), Some(Gid::from_raw(gid))) + .into_diagnostic() + .wrap_err_with(|| { + format!( + "failed to chown sidecar directory {} to {uid}:{gid}", + path.display() + ) + })?; + Ok(()) +} + +#[cfg(target_os = "linux")] +fn copy_sidecar_client_tls_if_present( + source_dir: &Path, + sidecar_tls_dir: &Path, + uid: u32, + gid: u32, +) -> Result<()> { + use miette::Context as _; + use nix::unistd::{Gid, Uid, chown}; + use std::os::unix::fs::PermissionsExt; + + if !source_dir.exists() { + return Ok(()); + } + + let dest_dir = sidecar_tls_dir.join(SIDECAR_CLIENT_TLS_SUBDIR); + prepare_sidecar_directory(&dest_dir, uid, gid, 0o750)?; + for file_name in CLIENT_TLS_FILES { + let source = source_dir.join(file_name); + if !source.exists() { + return Err(miette::miette!( + "client TLS source file is missing: {}", + source.display() + )); + } + let dest = dest_dir.join(file_name); + std::fs::copy(&source, &dest) + .into_diagnostic() + .wrap_err_with(|| { + format!( + "failed to copy client TLS file {} to {}", + source.display(), + dest.display() + ) + })?; + let mut perms = 
std::fs::metadata(&dest).into_diagnostic()?.permissions(); + perms.set_mode(0o400); + std::fs::set_permissions(&dest, perms) + .into_diagnostic() + .wrap_err_with(|| { + format!("failed to chmod copied client TLS file {}", dest.display()) + })?; + chown(&dest, Some(Uid::from_raw(uid)), Some(Gid::from_raw(gid))) + .into_diagnostic() + .wrap_err_with(|| { + format!( + "failed to chown copied client TLS file {} to {uid}:{gid}", + dest.display() + ) + })?; + } + + Ok(()) +} + +#[cfg(target_os = "linux")] +fn run_network_init( + proxy_user_id: u32, + proxy_primary_group_id: u32, + sidecar_state_dir: &str, + sidecar_tls_dir: &str, +) -> Result<()> { + if proxy_user_id < openshell_policy::MIN_SANDBOX_UID { + return Err(miette::miette!( + "--proxy-uid must be at least {}", + openshell_policy::MIN_SANDBOX_UID + )); + } + if proxy_primary_group_id < openshell_policy::MIN_SANDBOX_UID { + return Err(miette::miette!( + "--proxy-gid must be at least {}", + openshell_policy::MIN_SANDBOX_UID + )); + } + + let sidecar_state_dir = Path::new(sidecar_state_dir); + let sidecar_tls_dir = Path::new(sidecar_tls_dir); + prepare_sidecar_directory( + sidecar_state_dir, + proxy_user_id, + proxy_primary_group_id, + 0o775, + )?; + prepare_sidecar_directory( + sidecar_tls_dir, + proxy_user_id, + proxy_primary_group_id, + 0o755, + )?; + copy_sidecar_client_tls_if_present( + Path::new(CLIENT_TLS_DIR), + sidecar_tls_dir, + proxy_user_id, + proxy_primary_group_id, + )?; + openshell_supervisor_process::netns::install_sidecar_bypass_rules(proxy_user_id) +} + +#[cfg(not(target_os = "linux"))] +fn run_network_init( + _proxy_uid: u32, + _proxy_gid: u32, + _sidecar_state_dir: &str, + _sidecar_tls_dir: &str, +) -> Result<()> { + Err(miette::miette!( + "--mode=network-init is only supported on Linux" + )) +} + fn main() -> Result<()> { // Handle `copy-self ` before clap so it works without any of the // sandbox flags. 
Kubernetes init containers invoke this path to seed an @@ -222,6 +394,16 @@ fn main() -> Result<()> { let args = Args::parse(); + if args.mode.network_init { + let proxy_gid = args.proxy_gid.unwrap_or(args.proxy_uid); + return run_network_init( + args.proxy_uid, + proxy_gid, + &args.sidecar_state_dir, + &args.sidecar_tls_dir, + ); + } + // Try to open a rolling log file; fall back to stderr-only logging if it fails // (e.g., /var/log is not writable in custom workload images). // Rotates daily, keeps the 3 most recent files to bound disk usage. @@ -421,4 +603,24 @@ mod tests { let final_path = dest_dir.join("openshell-sandbox"); assert!(final_path.exists(), "binary should land inside dest dir"); } + + #[test] + fn mode_parses_network_init_standalone() { + let mode = "network-init".parse::().unwrap(); + assert!(mode.network_init); + assert!(!mode.network); + assert!(!mode.process); + } + + #[test] + fn mode_rejects_combined_network_init() { + let err = "network-init,network".parse::().unwrap_err(); + assert!(err.contains("cannot be combined")); + } + + #[test] + fn mode_rejects_empty_value() { + let err = "".parse::().unwrap_err(); + assert!(err.contains("at least one")); + } } diff --git a/crates/openshell-supervisor-network/data/sandbox-policy.rego b/crates/openshell-supervisor-network/data/sandbox-policy.rego index efcdf0732..d70c69b74 100644 --- a/crates/openshell-supervisor-network/data/sandbox-policy.rego +++ b/crates/openshell-supervisor-network/data/sandbox-policy.rego @@ -19,6 +19,10 @@ allow_network if { network_policy_for_request } +binary_identity_required if { + object.get(object.get(data, "runtime", {}), "require_binary_identity", true) +} + # --- Deny reasons (specific diagnostics for debugging policy denials) --- deny_reason := "missing input.network" if { @@ -131,6 +135,12 @@ endpoint_allowed(policy, network) if { endpoint.ports[_] == network.port } +# Binary matching can be relaxed by trusted runtime configuration. 
In that +# mode, network policies are endpoint/L7 scoped and ignore policy.binaries. +binary_allowed(_, _) if { + not binary_identity_required +} + # Binary matching: exact path. # SHA256 integrity is enforced in Rust via trust-on-first-use (TOFU) cache, # not in Rego. The proxy computes and caches binary hashes at runtime. @@ -161,6 +171,10 @@ binary_allowed(policy, exec) if { glob.match(b.path, ["/"], p) } +user_declared_binary_allowed(_, _) if { + not binary_identity_required +} + user_declared_binary_allowed(policy, exec) if { some b b := policy.binaries[_] diff --git a/crates/openshell-supervisor-network/src/identity.rs b/crates/openshell-supervisor-network/src/identity.rs index fce568f41..5e89c3503 100644 --- a/crates/openshell-supervisor-network/src/identity.rs +++ b/crates/openshell-supervisor-network/src/identity.rs @@ -100,23 +100,34 @@ impl BinaryIdentityCache { /// Returns `Ok(hash)` if it matches, `Err` if the hash changed (binary tampered). #[cfg_attr(not(target_os = "linux"), allow(dead_code))] pub fn verify_or_cache(&self, path: &Path) -> Result { - self.verify_or_cache_with_hasher(path, procfs::file_sha256) + self.verify_or_cache_with_paths(path, path, procfs::file_sha256) } - fn verify_or_cache_with_hasher(&self, path: &Path, mut hash_file: F) -> Result + #[cfg(target_os = "linux")] + pub fn verify_or_cache_process_exe(&self, display_path: &Path, pid: u32) -> Result { + let proc_exe = PathBuf::from(format!("/proc/{pid}/exe")); + self.verify_or_cache_with_paths(display_path, &proc_exe, procfs::file_sha256) + } + + fn verify_or_cache_with_paths( + &self, + cache_path: &Path, + access_path: &Path, + mut hash_file: F, + ) -> Result where F: FnMut(&Path) -> Result, { let start = std::time::Instant::now(); - let metadata = std::fs::metadata(path) - .map_err(|error| miette::miette!("Failed to stat {}: {error}", path.display()))?; + let metadata = std::fs::metadata(access_path) + .map_err(|error| miette::miette!("Failed to stat {}: {error}", 
cache_path.display()))?; let fingerprint = FileFingerprint::from_metadata(&metadata); let cached = self .hashes .lock() .map_err(|_| miette::miette!("Binary identity cache lock poisoned"))? - .get(path) + .get(cache_path) .cloned(); if let Some(cached_binary) = &cached @@ -125,7 +136,7 @@ impl BinaryIdentityCache { debug!( " verify_or_cache: {}ms CACHE HIT path={}", start.elapsed().as_millis(), - path.display() + cache_path.display() ); return Ok(cached_binary.hash.clone()); } @@ -133,29 +144,29 @@ impl BinaryIdentityCache { debug!( " verify_or_cache: CACHE MISS size={} path={}", metadata.len(), - path.display() + cache_path.display() ); - let current_hash = hash_file(path)?; + let current_hash = hash_file(access_path)?; let mut hashes = self .hashes .lock() .map_err(|_| miette::miette!("Binary identity cache lock poisoned"))?; - if let Some(existing) = hashes.get(path) + if let Some(existing) = hashes.get(cache_path) && existing.hash != current_hash { return Err(miette::miette!( "Binary integrity violation: {} hash changed (cached: {}, current: {})", - path.display(), + cache_path.display(), existing.hash, current_hash )); } hashes.insert( - path.to_path_buf(), + cache_path.to_path_buf(), CachedBinary { hash: current_hash.clone(), fingerprint, @@ -165,7 +176,7 @@ impl BinaryIdentityCache { debug!( " verify_or_cache TOTAL (cold): {}ms path={}", start.elapsed().as_millis(), - path.display() + cache_path.display() ); Ok(current_hash) @@ -212,13 +223,13 @@ mod tests { let mut hash_calls = 0; let hash1 = cache - .verify_or_cache_with_hasher(tmp.path(), |path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) .unwrap(); let hash2 = cache - .verify_or_cache_with_hasher(tmp.path(), |path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -238,7 +249,7 @@ mod tests { let mut hash_calls = 0; let hash1 = cache - .verify_or_cache_with_hasher(tmp.path(), 
|path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -254,7 +265,7 @@ mod tests { .unwrap(); let hash2 = cache - .verify_or_cache_with_hasher(tmp.path(), |path| { + .verify_or_cache_with_paths(tmp.path(), tmp.path(), |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -275,7 +286,7 @@ mod tests { let mut hash_calls = 0; cache - .verify_or_cache_with_hasher(&path, |path| { + .verify_or_cache_with_paths(&path, &path, |path| { hash_calls += 1; procfs::file_sha256(path) }) @@ -292,7 +303,7 @@ mod tests { .set_modified(original_mtime) .unwrap(); - let result = cache.verify_or_cache_with_hasher(&path, |path| { + let result = cache.verify_or_cache_with_paths(&path, &path, |path| { hash_calls += 1; procfs::file_sha256(path) }); @@ -301,6 +312,28 @@ mod tests { assert_eq!(hash_calls, 2); } + #[test] + fn display_path_can_differ_from_access_path() { + let mut tmp = tempfile::NamedTempFile::new().unwrap(); + tmp.write_all(b"binary content").unwrap(); + tmp.flush().unwrap(); + let display_path = Path::new("/usr/bin/python3"); + + let cache = BinaryIdentityCache::new(); + let hash = cache + .verify_or_cache_with_paths(display_path, tmp.path(), procfs::file_sha256) + .unwrap(); + + assert!(!hash.is_empty()); + assert!( + cache + .hashes + .lock() + .unwrap() + .contains_key(Path::new("/usr/bin/python3")) + ); + } + #[test] fn hash_mismatch_returns_error() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/openshell-supervisor-network/src/opa.rs b/crates/openshell-supervisor-network/src/opa.rs index fbab5fedd..850c38320 100644 --- a/crates/openshell-supervisor-network/src/opa.rs +++ b/crates/openshell-supervisor-network/src/opa.rs @@ -18,6 +18,7 @@ use std::sync::{ Arc, Mutex, atomic::{AtomicU64, Ordering}, }; +use tracing::info; /// Baked-in rego rules for OPA policy evaluation. 
/// These rules define the network access decision logic and static config @@ -55,6 +56,49 @@ pub struct NetworkInput { pub cmdline_paths: Vec, } +pub(crate) fn network_binary_identity_required() -> bool { + std::env::var(openshell_core::sandbox_env::NETWORK_BINARY_IDENTITY).map_or(true, |value| { + !matches!( + value.as_str(), + "relaxed" | "disabled" | "endpoint-only" | "false" | "0" + ) + }) +} + +fn inject_runtime_policy_data(data: &mut serde_json::Value, require_binary_identity: bool) { + let Some(obj) = data.as_object_mut() else { + return; + }; + obj.insert( + "runtime".to_string(), + serde_json::json!({ + "require_binary_identity": require_binary_identity, + }), + ); +} + +fn emit_binary_identity_mode(require_binary_identity: bool, source: &str) { + info!( + require_binary_identity, + source, "Configured OPA runtime binary identity mode" + ); + openshell_ocsf::ocsf_emit!( + openshell_ocsf::ConfigStateChangeBuilder::new(openshell_ocsf::ctx::ctx()) + .severity(openshell_ocsf::SeverityId::Informational) + .status(openshell_ocsf::StatusId::Success) + .state(openshell_ocsf::StateId::Enabled, "configured") + .unmapped( + "require_binary_identity", + serde_json::json!(require_binary_identity) + ) + .unmapped("source", serde_json::json!(source)) + .message(format!( + "OPA runtime binary identity mode configured [source:{source} require_binary_identity:{require_binary_identity}]" + )) + .build() + ); +} + /// Sandbox configuration extracted from OPA data at startup. 
pub struct SandboxConfig { pub filesystem: FilesystemPolicy, @@ -146,7 +190,9 @@ impl OpaEngine { engine .add_policy_from_file(policy_path) .map_err(|e| miette::miette!("{e}"))?; - let data_json = preprocess_yaml_data(&yaml_str)?; + let require_binary_identity = network_binary_identity_required(); + emit_binary_identity_mode(require_binary_identity, "files"); + let data_json = preprocess_yaml_data(&yaml_str, require_binary_identity)?; engine .add_data_json(&data_json) .map_err(|e| miette::miette!("{e}"))?; @@ -160,11 +206,24 @@ impl OpaEngine { /// /// Preprocesses the YAML data to expand access presets and validate L7 config. pub fn from_strings(policy: &str, data_yaml: &str) -> Result { + Self::from_strings_with_binary_identity_required( + policy, + data_yaml, + network_binary_identity_required(), + ) + } + + pub(crate) fn from_strings_with_binary_identity_required( + policy: &str, + data_yaml: &str, + require_binary_identity: bool, + ) -> Result { let mut engine = regorus::Engine::new(); engine .add_policy("policy.rego".into(), policy.into()) .map_err(|e| miette::miette!("{e}"))?; - let data_json = preprocess_yaml_data(data_yaml)?; + emit_binary_identity_mode(require_binary_identity, "strings"); + let data_json = preprocess_yaml_data(data_yaml, require_binary_identity)?; engine .add_data_json(&data_json) .map_err(|e| miette::miette!("{e}"))?; @@ -193,11 +252,25 @@ impl OpaEngine { /// gap between user-specified symlink paths (e.g., `/usr/bin/python3`) and /// kernel-resolved canonical paths (e.g., `/usr/bin/python3.11`). 
pub fn from_proto_with_pid(proto: &ProtoSandboxPolicy, entrypoint_pid: u32) -> Result { + Self::from_proto_with_pid_and_binary_identity_required( + proto, + entrypoint_pid, + network_binary_identity_required(), + ) + } + + fn from_proto_with_pid_and_binary_identity_required( + proto: &ProtoSandboxPolicy, + entrypoint_pid: u32, + require_binary_identity: bool, + ) -> Result { + emit_binary_identity_mode(require_binary_identity, "proto"); let data_json_str = proto_to_opa_data_json(proto, entrypoint_pid); // Parse back to Value for preprocessing, then re-serialize let mut data: serde_json::Value = serde_json::from_str(&data_json_str) .map_err(|e| miette::miette!("internal: failed to parse proto JSON: {e}"))?; + inject_runtime_policy_data(&mut data, require_binary_identity); // Validate BEFORE expanding presets let (errors, warnings) = crate::l7::validate_l7_policies(&data); @@ -720,9 +793,10 @@ fn parse_process_policy(val: ®orus::Value) -> ProcessPolicy { } /// Preprocess YAML policy data: parse, normalize, validate, expand access presets, return JSON. -fn preprocess_yaml_data(yaml_str: &str) -> Result { +fn preprocess_yaml_data(yaml_str: &str, require_binary_identity: bool) -> Result { let mut data: serde_json::Value = serde_yml::from_str(yaml_str) .map_err(|e| miette::miette!("failed to parse YAML data: {e}"))?; + inject_runtime_policy_data(&mut data, require_binary_identity); // Normalize port → ports for all endpoints so Rego always sees "ports" array. 
normalize_endpoint_ports(&mut data); @@ -2264,6 +2338,88 @@ process: assert!(eval_l7(&engine, &input)); } + #[test] + fn l7_get_allowed_by_rules_when_binary_identity_relaxed() { + let engine = + OpaEngine::from_strings_with_binary_identity_required(TEST_POLICY, L7_TEST_DATA, false) + .expect("Failed to load relaxed L7 test data"); + let mut input = l7_input("api.example.com", 8080, "GET", "/repos/myorg/foo"); + input["exec"]["path"] = "".into(); + assert!(eval_l7(&engine, &input)); + } + + #[test] + fn relaxed_binary_identity_preserves_matched_policy_and_l7_for_proto() { + let mut network_policies = std::collections::HashMap::new(); + network_policies.insert( + "test_l7".to_string(), + NetworkPolicyRule { + name: "test_l7".to_string(), + endpoints: vec![NetworkEndpoint { + host: "host.k3d.internal".to_string(), + port: 56123, + protocol: "rest".to_string(), + enforcement: "enforce".to_string(), + rules: vec![L7Rule { + allow: Some(L7Allow { + method: "GET".to_string(), + path: "/allowed".to_string(), + command: String::new(), + query: std::collections::HashMap::new(), + operation_type: String::new(), + operation_name: String::new(), + fields: Vec::new(), + params: std::collections::HashMap::new(), + }), + }], + allowed_ips: vec!["192.168.0.0/16".to_string()], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }, + ); + let proto = ProtoSandboxPolicy { + version: 1, + filesystem: Some(ProtoFs { + include_workdir: true, + read_only: vec![], + read_write: vec![], + }), + landlock: Some(openshell_core::proto::LandlockPolicy { + compatibility: "best_effort".to_string(), + }), + process: Some(ProtoProc { + run_as_user: "sandbox".to_string(), + run_as_group: "sandbox".to_string(), + }), + network_policies, + }; + let engine = OpaEngine::from_proto_with_pid_and_binary_identity_required(&proto, 0, false) + .expect("engine from relaxed proto"); + let network_input = NetworkInput { + host: 
"host.k3d.internal".into(), + port: 56123, + binary_path: PathBuf::new(), + binary_sha256: String::new(), + ancestors: vec![], + cmdline_paths: vec![], + }; + let action = engine.evaluate_network_action(&network_input).unwrap(); + assert_eq!( + action, + NetworkAction::Allow { + matched_policy: Some("test_l7".to_string()) + } + ); + + let mut input = l7_input("host.k3d.internal", 56123, "GET", "/allowed"); + input["exec"]["path"] = "".into(); + assert!(eval_l7(&engine, &input)); + } + #[test] fn l7_post_allowed_by_rules() { let engine = l7_engine(); @@ -4592,6 +4748,46 @@ process: ); } + #[test] + fn relaxed_binary_identity_allows_declared_endpoint_without_binary_match() { + let engine = OpaEngine::from_strings_with_binary_identity_required( + TEST_POLICY, + INFERENCE_TEST_DATA, + false, + ) + .expect("Failed to load relaxed binary identity test data"); + let input = NetworkInput { + host: "api.anthropic.com".into(), + port: 443, + binary_path: PathBuf::from("/tmp/unlisted-agent"), + binary_sha256: "unused".into(), + ancestors: vec![], + cmdline_paths: vec![], + }; + + let action = engine.evaluate_network_action(&input).unwrap(); + assert_eq!( + action, + NetworkAction::Allow { + matched_policy: Some("claude_code".to_string()) + }, + ); + assert!( + engine.query_exact_declared_endpoint_host(&input).unwrap(), + "relaxed identity should preserve exact declared endpoint handling" + ); + + let undeclared = NetworkInput { + host: "api.openai.com".into(), + ..input + }; + let action = engine.evaluate_network_action(&undeclared).unwrap(); + assert!( + matches!(action, NetworkAction::Deny { .. 
}), + "relaxed identity must not allow undeclared endpoints" + ); + } + #[test] fn unknown_endpoint_returns_deny() { let engine = inference_engine(); diff --git a/crates/openshell-supervisor-network/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs index 0d2c8c025..5debc2926 100644 --- a/crates/openshell-supervisor-network/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -42,6 +42,8 @@ const TUNNEL_PROTOCOL_PEEK_POLL: std::time::Duration = std::time::Duration::from const TUNNEL_PROTOCOL_PEEK_POLL: std::time::Duration = std::time::Duration::from_millis(1); const INFERENCE_LOCAL_HOST: &str = "inference.local"; const INFERENCE_LOCAL_PORT: u16 = 443; +#[cfg(target_os = "linux")] +const SIDECAR_SUPERVISOR_TOPOLOGY: &str = "sidecar"; /// Hostnames injected by compute drivers as `/etc/hosts` aliases for the host /// machine. Traffic to these names is eligible for the trusted-gateway SSRF @@ -1426,7 +1428,7 @@ fn resolve_owner_identity( })?; let bin_hash = identity_cache - .verify_or_cache(&bin_path) + .verify_or_cache_process_exe(&bin_path, owner_pid) .map_err(|e| IdentityError { reason: format!("binary integrity check failed: {e}"), binary: Some(bin_path.clone()), @@ -1434,11 +1436,15 @@ fn resolve_owner_identity( ancestors: vec![], })?; - let ancestors = crate::procfs::collect_ancestor_binaries(owner_pid, entrypoint_pid); + let ancestor_identities = collect_ancestor_identities(owner_pid, entrypoint_pid); + let ancestors: Vec = ancestor_identities + .iter() + .map(|(_, path)| path.clone()) + .collect(); - for ancestor in &ancestors { + for (ancestor_pid, ancestor) in &ancestor_identities { identity_cache - .verify_or_cache(ancestor) + .verify_or_cache_process_exe(ancestor, *ancestor_pid) .map_err(|e| IdentityError { reason: format!( "ancestor integrity check failed for {}: {e}", @@ -1463,6 +1469,31 @@ fn resolve_owner_identity( }) } +#[cfg(target_os = "linux")] +fn collect_ancestor_identities(start_pid: u32, stop_pid: u32) -> Vec<(u32, 
PathBuf)> { + const MAX_DEPTH: usize = 64; + let mut ancestors = Vec::new(); + let mut current = start_pid; + + for _ in 0..MAX_DEPTH { + let parent_pid = match crate::procfs::read_ppid(current) { + Some(parent) if parent > 0 && parent != current => parent, + _ => break, + }; + + if let Ok(path) = crate::procfs::binary_path(parent_pid.cast_signed()) { + ancestors.push((parent_pid, path)); + } + + if parent_pid == stop_pid || parent_pid == 1 { + break; + } + current = parent_pid; + } + + ancestors +} + /// Resolve the identity of the process owning a TCP peer connection. /// /// Walks `/proc//net/tcp` to find the socket inode, locates @@ -1573,8 +1604,17 @@ fn evaluate_opa_tcp( } }; - let pid = entrypoint_pid.load(Ordering::Acquire); - if pid == 0 { + if !crate::opa::network_binary_identity_required() { + let result = evaluate_endpoint_only_opa(engine, host, port); + debug!( + "evaluate_opa_tcp endpoint-only: host={host} port={port} action={:?}", + result.action + ); + return result; + } + + let entrypoint_pid = entrypoint_pid.load(Ordering::Acquire); + let Some(proc_net_anchor_pid) = proc_net_anchor_pid(entrypoint_pid) else { return deny( "entrypoint process not yet spawned".into(), None, @@ -1582,12 +1622,12 @@ fn evaluate_opa_tcp( vec![], vec![], ); - } + }; let total_start = std::time::Instant::now(); let peer_port = peer_addr.port(); - let identity = match resolve_process_identity(pid, peer_port, identity_cache) { + let identity = match resolve_process_identity(proc_net_anchor_pid, peer_port, identity_cache) { Ok(id) => id, Err(err) => { return deny( @@ -1641,6 +1681,52 @@ fn evaluate_opa_tcp( result } +#[cfg(target_os = "linux")] +fn proc_net_anchor_pid(entrypoint_pid: u32) -> Option { + if entrypoint_pid != 0 { + return Some(entrypoint_pid); + } + sidecar_topology_enabled().then(std::process::id) +} + +#[cfg(target_os = "linux")] +fn sidecar_topology_enabled() -> bool { + std::env::var(openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY) + .is_ok_and(|value| 
value == SIDECAR_SUPERVISOR_TOPOLOGY) +} + +fn evaluate_endpoint_only_opa(engine: &OpaEngine, host: &str, port: u16) -> ConnectDecision { + let input = crate::opa::NetworkInput { + host: host.to_string(), + port, + binary_path: PathBuf::new(), + binary_sha256: String::new(), + ancestors: vec![], + cmdline_paths: vec![], + }; + + match engine.evaluate_network_action_with_generation(&input) { + Ok((action, generation)) => ConnectDecision { + action, + generation, + binary: None, + binary_pid: None, + ancestors: vec![], + cmdline_paths: vec![], + }, + Err(e) => ConnectDecision { + action: NetworkAction::Deny { + reason: format!("policy evaluation error: {e}"), + }, + generation: engine.current_generation(), + binary: None, + binary_pid: None, + ancestors: vec![], + cmdline_paths: vec![], + }, + } +} + /// Non-Linux stub: OPA identity binding requires /proc. #[cfg(not(target_os = "linux"))] fn evaluate_opa_tcp( @@ -1648,9 +1734,13 @@ fn evaluate_opa_tcp( engine: &OpaEngine, _identity_cache: &BinaryIdentityCache, _entrypoint_pid: &AtomicU32, - _host: &str, - _port: u16, + host: &str, + port: u16, ) -> ConnectDecision { + if !crate::opa::network_binary_identity_required() { + return evaluate_endpoint_only_opa(engine, host, port); + } + ConnectDecision { action: NetworkAction::Deny { reason: "identity binding unavailable on this platform".into(), @@ -2152,14 +2242,24 @@ fn query_l7_route_snapshot( }; match engine.query_endpoint_configs_with_generation(&input) { - Ok((vals, generation)) => Some(L7RouteSnapshot { - configs: vals + Ok((vals, generation)) => { + let configs: Vec<_> = vals .into_iter() .filter_map(|val| crate::l7::parse_l7_config(&val)) .map(|config| L7ConfigSnapshot { config }) - .collect(), - generation, - }), + .collect(); + debug!( + host, + port, + generation, + config_count = configs.len(), + "Forward proxy L7 route lookup complete" + ); + Some(L7RouteSnapshot { + configs, + generation, + }) + } Err(e) => { let event = 
NetworkActivityBuilder::new(openshell_ocsf::ctx::ctx()) .activity(ActivityId::Fail) @@ -3337,10 +3437,29 @@ async fn handle_forward_proxy( } }; let policy_str = matched_policy.as_deref().unwrap_or("-"); + debug!( + host = %host_lc, + port, + binary = %binary_str, + binary_pid = %pid_str, + matched_policy = %policy_str, + decision_generation = decision.generation, + current_generation = opa_engine.current_generation(), + action = ?decision.action, + "Forward proxy L4 policy decision" + ); let sandbox_entrypoint_pid = entrypoint_pid.load(Ordering::Acquire); let forward_generation_guard = match opa_engine.generation_guard(decision.generation) { Ok(guard) => guard, Err(e) => { + warn!( + host = %host_lc, + port, + decision_generation = decision.generation, + current_generation = opa_engine.current_generation(), + error = %e, + "Forward proxy rejected request because policy generation changed after L4 decision" + ); emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); emit_activity_simple(activity_tx, true, "policy_stale"); respond( @@ -3401,6 +3520,15 @@ async fn handle_forward_proxy( && !route.configs.is_empty() { if route.generation != forward_generation_guard.captured_generation() { + warn!( + host = %host_lc, + port, + decision_generation = decision.generation, + guard_generation = forward_generation_guard.captured_generation(), + route_generation = route.generation, + current_generation = opa_engine.current_generation(), + "Forward proxy rejected request because L7 route lookup used a different policy generation" + ); emit_l7_tunnel_close_after_policy_change( &host_lc, port, @@ -3426,6 +3554,14 @@ async fn handle_forward_proxy( let tunnel_engine = match opa_engine.clone_engine_for_tunnel(route.generation) { Ok(engine) => engine, Err(e) => { + warn!( + host = %host_lc, + port, + route_generation = route.generation, + current_generation = opa_engine.current_generation(), + error = %e, + "Forward proxy rejected request because L7 tunnel engine could not be 
cloned" + ); emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); emit_activity_simple(activity_tx, true, "policy_stale"); respond( @@ -4105,6 +4241,14 @@ async fn handle_forward_proxy( }; if let Err(e) = forward_generation_guard.ensure_current() { + warn!( + host = %host_lc, + port, + captured_generation = forward_generation_guard.captured_generation(), + current_generation = forward_generation_guard.current_generation(), + error = %e, + "Forward proxy rejected request because policy changed before upstream connect" + ); emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); emit_activity_simple(activity_tx, true, "policy_stale"); respond( @@ -4243,6 +4387,14 @@ async fn handle_forward_proxy( }; if let Err(e) = forward_generation_guard.ensure_current() { + warn!( + host = %host_lc, + port, + captured_generation = forward_generation_guard.captured_generation(), + current_generation = forward_generation_guard.current_generation(), + error = %e, + "Forward proxy rejected request because policy changed before relay" + ); emit_l7_tunnel_close_after_policy_change(&host_lc, port, e); respond( client, @@ -4379,6 +4531,46 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use tokio::net::{TcpListener, TcpStream}; + #[test] + fn endpoint_only_opa_allows_declared_endpoint_without_process_identity() { + let policy = include_str!("../data/sandbox-policy.rego"); + let data = r#" +version: 1 +network_policies: + test_l7: + name: test_l7 + endpoints: + - host: host.k3d.internal + port: 56123 + protocol: rest + enforcement: enforce + rules: + - allow: + method: GET + path: /allowed + binaries: + - path: /usr/bin/curl +"#; + let engine = OpaEngine::from_strings_with_binary_identity_required(policy, data, false) + .expect("relaxed engine"); + + let decision = evaluate_endpoint_only_opa(&engine, "host.k3d.internal", 56123); + assert_eq!( + decision.action, + NetworkAction::Allow { + matched_policy: Some("test_l7".to_string()), + } + ); + 
assert!(decision.binary.is_none()); + assert!(decision.ancestors.is_empty()); + + let denied = evaluate_endpoint_only_opa(&engine, "api.example.com", 443); + assert!( + matches!(denied.action, NetworkAction::Deny { .. }), + "endpoint-only mode must still deny undeclared endpoints" + ); + } + fn websocket_l7_config( protocol: crate::l7::L7Protocol, websocket_credential_rewrite: bool, diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs index 9553e0673..8e17758bd 100644 --- a/crates/openshell-supervisor-network/src/run.rs +++ b/crates/openshell-supervisor-network/src/run.rs @@ -201,7 +201,9 @@ pub async fn run_networking( let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { match SandboxCa::generate() { Ok(ca) => { - let tls_dir = std::path::Path::new("/etc/openshell-tls"); + let tls_dir = std::env::var(openshell_core::sandbox_env::PROXY_TLS_DIR) + .unwrap_or_else(|_| "/etc/openshell-tls".to_string()); + let tls_dir = std::path::Path::new(&tls_dir); let system_ca_bundle = read_system_ca_bundle(); match write_ca_files(&ca, tls_dir, &system_ca_bundle) { Ok(paths) => { diff --git a/crates/openshell-supervisor-process/src/netns/mod.rs b/crates/openshell-supervisor-process/src/netns/mod.rs index cc7b1d84c..86a5406ad 100644 --- a/crates/openshell-supervisor-process/src/netns/mod.rs +++ b/crates/openshell-supervisor-process/src/netns/mod.rs @@ -467,6 +467,123 @@ pub fn create_netns_for_proxy( } } +/// Install pod-network bypass enforcement for Kubernetes sidecar topology. +/// +/// This runs in the current network namespace, not in a per-workload netns. +/// The rules allow loopback and the sidecar proxy UID, then reject direct +/// TCP/UDP egress from other UIDs so traffic must use the sidecar's local +/// proxy. +/// +/// # Errors +/// +/// Returns an error when `nft` is unavailable or the ruleset cannot be loaded. 
+pub fn install_sidecar_bypass_rules(proxy_uid: u32) -> Result<()> { + match install_sidecar_nft_bypass_rules(proxy_uid) { + Ok(()) => Ok(()), + Err(nft_error) => { + warn!( + error = %nft_error, + "Failed to install nftables sidecar rules; trying iptables-legacy fallback" + ); + install_sidecar_iptables_legacy_bypass_rules(proxy_uid).map_err(|iptables_error| { + miette::miette!( + "sidecar nft ruleset load failed: {nft_error}; sidecar iptables-legacy fallback failed: {iptables_error}" + ) + }) + } + } +} + +fn install_sidecar_nft_bypass_rules(proxy_uid: u32) -> Result<()> { + let nft_cmd = find_nft().ok_or_else(|| { + miette::miette!( + "trusted nft helper not found; sidecar network enforcement requires nftables" + ) + })?; + let log_prefix = Some("openshell:sidecar-bypass:"); + let ruleset = nft_ruleset::generate_sidecar_bypass_ruleset(proxy_uid, log_prefix); + run_nft_current_namespace(&nft_cmd, &ruleset) +} + +const SIDECAR_IPTABLES_CHAIN: &str = "OPENSHELL_SIDECAR_BYPASS"; + +fn install_sidecar_iptables_legacy_bypass_rules(proxy_uid: u32) -> Result<()> { + let iptables_cmd = find_iptables_legacy().ok_or_else(|| { + miette::miette!( + "trusted iptables-legacy helper not found; sidecar network enforcement fallback unavailable" + ) + })?; + + cleanup_sidecar_iptables_legacy_rules(&iptables_cmd); + + let proxy_uid_arg = proxy_uid.to_string(); + let commands: Vec> = vec![ + vec!["-N", SIDECAR_IPTABLES_CHAIN], + vec!["-A", SIDECAR_IPTABLES_CHAIN, "-o", "lo", "-j", "ACCEPT"], + vec![ + "-A", + SIDECAR_IPTABLES_CHAIN, + "-m", + "conntrack", + "--ctstate", + "ESTABLISHED,RELATED", + "-j", + "ACCEPT", + ], + vec![ + "-A", + SIDECAR_IPTABLES_CHAIN, + "-m", + "owner", + "--uid-owner", + &proxy_uid_arg, + "-j", + "ACCEPT", + ], + vec![ + "-A", + SIDECAR_IPTABLES_CHAIN, + "-p", + "tcp", + "-j", + "REJECT", + "--reject-with", + "tcp-reset", + ], + vec![ + "-A", + SIDECAR_IPTABLES_CHAIN, + "-p", + "udp", + "-j", + "REJECT", + "--reject-with", + "icmp-port-unreachable", + ], + 
vec!["-A", "OUTPUT", "-j", SIDECAR_IPTABLES_CHAIN], + ]; + + for args in commands { + if let Err(e) = run_iptables_legacy_current_namespace(&iptables_cmd, &args) { + cleanup_sidecar_iptables_legacy_rules(&iptables_cmd); + return Err(e); + } + } + + Ok(()) +} + +fn cleanup_sidecar_iptables_legacy_rules(iptables_cmd: &str) { + while run_iptables_legacy_current_namespace( + iptables_cmd, + &["-D", "OUTPUT", "-j", SIDECAR_IPTABLES_CHAIN], + ) + .is_ok() + {} + let _ = run_iptables_legacy_current_namespace(iptables_cmd, &["-F", SIDECAR_IPTABLES_CHAIN]); + let _ = run_iptables_legacy_current_namespace(iptables_cmd, &["-X", SIDECAR_IPTABLES_CHAIN]); +} + /// Run an `ip` command on the host. fn run_ip(args: &[&str]) -> Result<()> { let ip_path = find_trusted_binary("ip", IP_SEARCH_PATHS)?; @@ -490,6 +607,62 @@ fn run_ip(args: &[&str]) -> Result<()> { Ok(()) } +fn run_iptables_legacy_current_namespace(iptables_cmd: &str, args: &[&str]) -> Result<()> { + debug!( + command = %format!("{iptables_cmd} {}", args.join(" ")), + "Running iptables-legacy sidecar command" + ); + + let output = Command::new(iptables_cmd) + .args(args) + .output() + .into_diagnostic()?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(miette::miette!( + "{iptables_cmd} {} failed: {}", + args.join(" "), + stderr.trim() + )); + } + + Ok(()) +} + +fn run_nft_current_namespace(nft_cmd: &str, ruleset: &str) -> Result<()> { + use std::io::Write; + let mut tmp = tempfile::Builder::new() + .prefix("openshell-sidecar-nft-") + .suffix(".conf") + .tempfile() + .into_diagnostic()?; + tmp.write_all(ruleset.as_bytes()).into_diagnostic()?; + let ruleset_path = tmp.path().to_string_lossy().to_string(); + + debug!( + command = %format!("{nft_cmd} -f {ruleset_path}"), + "Loading nftables sidecar ruleset" + ); + + let output = Command::new(nft_cmd) + .args(["-f", &ruleset_path]) + .output() + .into_diagnostic()?; + + drop(tmp); + + if !output.status.success() { + let 
stderr = String::from_utf8_lossy(&output.stderr); + return Err(miette::miette!( + "sidecar nft ruleset load failed: {}", + stderr.trim() + )); + } + + Ok(()) +} + /// Run an `ip` command inside a network namespace via `nsenter --net=`. /// /// We use `nsenter` instead of `ip netns exec` because `ip netns exec` @@ -605,6 +778,11 @@ fn enable_nf_log_all_netns() { /// Well-known paths where nft may be installed. const NFT_SEARCH_PATHS: &[&str] = &["/usr/sbin/nft", "/sbin/nft", "/usr/bin/nft"]; +const IPTABLES_LEGACY_SEARCH_PATHS: &[&str] = &[ + "/usr/sbin/iptables-legacy", + "/sbin/iptables-legacy", + "/usr/bin/iptables-legacy", +]; fn find_trusted_binary<'a>(name: &str, paths: &'a [&str]) -> Result<&'a str> { paths @@ -629,6 +807,12 @@ fn find_nft() -> Option { .map(String::from) } +fn find_iptables_legacy() -> Option { + find_trusted_binary("iptables-legacy", IPTABLES_LEGACY_SEARCH_PATHS) + .ok() + .map(String::from) +} + #[cfg(test)] mod tests { use super::*; @@ -668,6 +852,16 @@ mod tests { } } + #[test] + fn iptables_legacy_search_paths_are_absolute() { + for path in IPTABLES_LEGACY_SEARCH_PATHS { + assert!( + path.starts_with('/'), + "IPTABLES_LEGACY_SEARCH_PATHS entry must be absolute: {path}" + ); + } + } + #[test] #[ignore = "requires root privileges"] fn test_create_and_drop_namespace() { diff --git a/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs b/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs index ba7aeb936..d7ec5132e 100644 --- a/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs +++ b/crates/openshell-supervisor-process/src/netns/nft_ruleset.rs @@ -53,6 +53,46 @@ pub fn generate_bypass_ruleset(host_ip: &str, proxy_port: u16, log_prefix: Optio ) } +/// Generate a pod-network ruleset for Kubernetes sidecar enforcement. +/// +/// The network sidecar and the process supervisor share a pod network +/// namespace. 
The sidecar runs as `proxy_uid` and owns external egress; +/// sandbox traffic must use loopback services hosted by that sidecar +/// (gateway forward and HTTP CONNECT proxy). +pub fn generate_sidecar_bypass_ruleset(proxy_uid: u32, log_prefix: Option<&str>) -> String { + let log_tcp = log_prefix + .map(|p| { + format!( + "\n tcp flags syn limit rate 5/second burst 10 packets log prefix \"{p}\" flags skuid" + ) + }) + .unwrap_or_default(); + let log_udp = log_prefix + .map(|p| { + format!( + "\n meta l4proto udp limit rate 5/second burst 10 packets log prefix \"{p}\" flags skuid" + ) + }) + .unwrap_or_default(); + + format!( + r#"table inet openshell_sidecar_bypass {{ + chain output {{ + type filter hook output priority 0; policy accept; + + oifname "lo" accept + ct state established,related accept + meta skuid {proxy_uid} accept{log_tcp} + meta nfproto ipv4 meta l4proto tcp reject with icmp type port-unreachable + meta nfproto ipv6 meta l4proto tcp reject with icmpv6 type port-unreachable{log_udp} + meta nfproto ipv4 meta l4proto udp reject with icmp type port-unreachable + meta nfproto ipv6 meta l4proto udp reject with icmpv6 type port-unreachable + }} +}} +"# + ) +} + #[cfg(test)] mod tests { use super::*; @@ -145,4 +185,27 @@ mod tests { "UDP log rule must come before UDP reject rule" ); } + + #[test] + fn sidecar_ruleset_allows_supervisor_uid_and_loopback() { + let ruleset = generate_sidecar_bypass_ruleset(1337, None); + assert!(ruleset.contains("table inet openshell_sidecar_bypass")); + assert!(ruleset.contains("oifname \"lo\" accept")); + assert!(ruleset.contains("meta skuid 1337 accept")); + } + + #[test] + fn sidecar_ruleset_rejects_tcp_and_udp_egress() { + let ruleset = generate_sidecar_bypass_ruleset(0, Some("openshell:sidecar:test:")); + assert!(ruleset.contains("meta nfproto ipv4 meta l4proto tcp reject")); + assert!(ruleset.contains("meta nfproto ipv6 meta l4proto tcp reject")); + assert!(ruleset.contains("meta nfproto ipv4 meta l4proto udp reject")); 
+ assert!(ruleset.contains("meta nfproto ipv6 meta l4proto udp reject")); + assert_eq!( + ruleset + .matches("log prefix \"openshell:sidecar:test:\"") + .count(), + 2 + ); + } } diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index fcd7ae69c..bd8be04c8 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -28,10 +28,32 @@ use std::sync::OnceLock; use tokio::process::{Child, Command}; use tracing::{debug, info}; +/// Process/filesystem enforcement performed by the process supervisor. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ProcessEnforcementMode { + /// Preserve the existing supervisor behavior: prepare filesystem policy, + /// drop privileges, and apply Landlock/seccomp to workload processes. + Full, + /// Preserve process launch and SSH/session behavior, but skip controls + /// that require root or extra Linux capabilities. Kubernetes sidecar mode + /// uses this when network policy is enforced by the network sidecar. 
+ NetworkOnly, +} + +impl ProcessEnforcementMode { + #[must_use] + pub const fn enforces_process_controls(self) -> bool { + matches!(self, Self::Full) + } +} + const SUPERVISOR_ONLY_ENV_VARS: &[&str] = &[ openshell_core::sandbox_env::SANDBOX_TOKEN, openshell_core::sandbox_env::SANDBOX_TOKEN_FILE, openshell_core::sandbox_env::K8S_SA_TOKEN_FILE, + openshell_core::sandbox_env::TLS_CA, + openshell_core::sandbox_env::TLS_CERT, + openshell_core::sandbox_env::TLS_KEY, openshell_core::sandbox_env::PROVIDER_SPIFFE_WORKLOAD_API_SOCKET, ]; @@ -443,6 +465,7 @@ impl ProcessHandle { workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, netns: Option<&NetworkNamespace>, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, @@ -453,6 +476,7 @@ impl ProcessHandle { workdir, interactive, policy, + enforcement_mode, netns.and_then(NetworkNamespace::ns_fd), ca_paths, provider_env, @@ -465,12 +489,14 @@ impl ProcessHandle { /// /// Returns an error if the process fails to start. #[cfg(not(target_os = "linux"))] + #[allow(clippy::too_many_arguments)] pub fn spawn( program: &str, args: &[String], workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, ) -> Result { @@ -480,6 +506,7 @@ impl ProcessHandle { workdir, interactive, policy, + enforcement_mode, ca_paths, provider_env, ) @@ -493,6 +520,7 @@ impl ProcessHandle { workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, netns_fd: Option, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, @@ -552,18 +580,30 @@ impl ProcessHandle { // process where the tracing subscriber is functional. The child's // pre_exec context cannot reliably emit structured logs. 
#[cfg(target_os = "linux")] - sandbox::linux::log_sandbox_readiness(policy, workdir); + if enforcement_mode.enforces_process_controls() { + sandbox::linux::log_sandbox_readiness(policy, workdir); + } // Phase 1 (as root): Prepare Landlock ruleset by opening PathFds. // This MUST happen before drop_privileges() so that root-only paths // (e.g. mode 700 directories) can be opened. See issue #803. #[cfg(target_os = "linux")] - let prepared_sandbox = sandbox::linux::prepare(policy, workdir) - .map_err(|err| miette::miette!("Failed to prepare sandbox: {err}"))?; + let prepared_sandbox = if enforcement_mode.enforces_process_controls() { + Some( + sandbox::linux::prepare(policy, workdir) + .map_err(|err| miette::miette!("Failed to prepare sandbox: {err}"))?, + ) + } else { + None + }; #[cfg(target_os = "linux")] - let supervisor_identity_mount = supervisor_identity_mount_from_env().map_err(|err| { - miette::miette!("Failed to prepare supervisor identity isolation: {err}") - })?; + let supervisor_identity_mount = if enforcement_mode.enforces_process_controls() { + supervisor_identity_mount_from_env().map_err(|err| { + miette::miette!("Failed to prepare supervisor identity isolation: {err}") + })? + } else { + None + }; // Set up process group for signal handling (non-interactive mode only). // In interactive mode, we inherit the parent's process group to maintain @@ -575,7 +615,7 @@ impl ProcessHandle { // Wrap in Option so we can .take() it out of the FnMut closure. // pre_exec is only called once (after fork, before exec). #[cfg(target_os = "linux")] - let mut prepared_sandbox = Some(prepared_sandbox); + let mut prepared_sandbox = prepared_sandbox; #[allow(unsafe_code)] unsafe { cmd.pre_exec(move || { @@ -600,8 +640,10 @@ impl ProcessHandle { // Drop privileges. initgroups/setgid/setuid need access to // /etc/group and /etc/passwd which would be blocked if // Landlock were already enforced. 
- drop_privileges(&policy) - .map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + drop_privileges(&policy) + .map_err(|err| std::io::Error::other(err.to_string()))?; + } harden_child_process().map_err(|err| std::io::Error::other(err.to_string()))?; @@ -629,12 +671,14 @@ impl ProcessHandle { } #[cfg(not(target_os = "linux"))] + #[allow(clippy::too_many_arguments)] fn spawn_impl( program: &str, args: &[String], workdir: Option<&str>, interactive: bool, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, ca_paths: Option<&(PathBuf, PathBuf)>, provider_env: &HashMap, ) -> Result { @@ -697,13 +741,17 @@ impl ProcessHandle { // Drop privileges before applying sandbox restrictions. // initgroups/setgid/setuid need access to /etc/group and /etc/passwd // which may be blocked by Landlock. - drop_privileges(&policy) - .map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + drop_privileges(&policy) + .map_err(|err| std::io::Error::other(err.to_string()))?; + } harden_child_process().map_err(|err| std::io::Error::other(err.to_string()))?; - sandbox::apply(&policy, workdir.as_deref()) - .map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + sandbox::apply(&policy, workdir.as_deref()) + .map_err(|err| std::io::Error::other(err.to_string()))?; + } Ok(()) }); diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index e16f11892..bd3fea91f 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -33,7 +33,7 @@ use openshell_core::denial::DenialEvent; #[cfg(target_os = "linux")] use crate::managed_children; -use crate::process::ProcessHandle; +use crate::process::{ProcessEnforcementMode, ProcessHandle}; fn ocsf_ctx() -> &'static openshell_ocsf::SandboxContext { openshell_ocsf::ctx::ctx() 
@@ -57,6 +57,7 @@ pub async fn run_process( openshell_endpoint: Option<&str>, ssh_socket_path: Option, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, entrypoint_pid: Arc, provider_credentials: ProviderCredentialState, provider_env: std::collections::HashMap, @@ -71,21 +72,26 @@ pub async fn run_process( // /etc/group so the "sandbox" entry matches. Must run before // validate_sandbox_user so passwd lookups see the correct identity. #[cfg(unix)] - crate::process::update_sandbox_passwd_entries()?; + if enforcement_mode.enforces_process_controls() { + crate::process::update_sandbox_passwd_entries()?; + } // Validate that the sandbox user exists in the image. All sandbox images // must include a "sandbox" user for privilege dropping; failing fast here // beats silently running children as root. #[cfg(unix)] - crate::process::validate_sandbox_user(policy)?; - #[cfg(unix)] - crate::process::validate_sandbox_group(policy)?; + if enforcement_mode.enforces_process_controls() { + crate::process::validate_sandbox_user(policy)?; + crate::process::validate_sandbox_group(policy)?; + } // Create read_write directories and chown newly-created ones to the // sandbox user/group. Runs as the supervisor (root) before the child // is forked so the workload sees writable paths it owns. #[cfg(unix)] - crate::process::prepare_filesystem(policy)?; + if enforcement_mode.enforces_process_controls() { + crate::process::prepare_filesystem(policy)?; + } // Eagerly fetch initial settings and install the agent skill if the // proposals flag is on at startup, rather than waiting for the policy @@ -206,31 +212,10 @@ pub async fn run_process( // their env so cooperative tools (curl, npm, Node) route through the // CONNECT proxy. Linux uses the netns host_ip; on other targets fall back // to the policy-declared http_addr directly. 
- let ssh_proxy_url = if matches!(policy.network.mode, NetworkMode::Proxy) { - #[cfg(target_os = "linux")] - { - netns.map(|ns| { - let port = policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map_or(3128, |addr| addr.port()); - format!("http://{}:{port}", ns.host_ip()) - }) - } - #[cfg(not(target_os = "linux"))] - { - policy - .network - .proxy - .as_ref() - .and_then(|p| p.http_addr) - .map(|addr| format!("http://{addr}")) - } - } else { - None - }; + #[cfg(target_os = "linux")] + let ssh_proxy_url = ssh_proxy_url_for_policy(policy, netns.map(NetworkNamespace::host_ip)); + #[cfg(not(target_os = "linux"))] + let ssh_proxy_url = ssh_proxy_url_for_policy(policy, None); let ssh_socket_path: Option = ssh_socket_path.map(std::path::PathBuf::from); if let Some(listen_path) = ssh_socket_path.clone() { @@ -259,6 +244,7 @@ pub async fn run_process( ca_paths, provider_credentials_clone, user_env_clone, + enforcement_mode, ) .await { @@ -325,6 +311,7 @@ pub async fn run_process( workdir, interactive, policy, + enforcement_mode, netns, ca_file_paths.as_ref(), &provider_env, @@ -337,12 +324,16 @@ pub async fn run_process( workdir, interactive, policy, + enforcement_mode, ca_file_paths.as_ref(), &provider_env, )?; // Store the entrypoint PID so the proxy can resolve TCP peer identity entrypoint_pid.store(handle.pid(), Ordering::Release); + if let Some(path) = entrypoint_pid_file() { + write_entrypoint_pid_file(&path, handle.pid())?; + } ocsf_emit!( ProcessActivityBuilder::new(ocsf_ctx()) .activity(ActivityId::Open) @@ -395,6 +386,42 @@ pub async fn run_process( Ok(status.code()) } +fn entrypoint_pid_file() -> Option { + std::env::var(openshell_core::sandbox_env::ENTRYPOINT_PID_FILE) + .ok() + .filter(|value| !value.is_empty()) +} + +fn write_entrypoint_pid_file(path: &str, pid: u32) -> Result<()> { + let pid_path = std::path::Path::new(path); + if let Some(parent) = pid_path.parent() { + std::fs::create_dir_all(parent).into_diagnostic()?; + } + 
std::fs::write(pid_path, format!("{pid}\n")).into_diagnostic()?; + info!( + path, + pid, "Published workload entrypoint PID for network sidecar" + ); + Ok(()) +} + +fn ssh_proxy_url_for_policy( + policy: &SandboxPolicy, + netns_proxy_host: Option, +) -> Option { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return None; + } + + let proxy = policy.network.proxy.as_ref()?; + if let Some(host) = netns_proxy_host { + let port = proxy.http_addr.map_or(3128, |addr| addr.port()); + return Some(format!("http://{host}:{port}")); + } + + proxy.http_addr.map(|addr| format!("http://{addr}")) +} + /// Eagerly fetch initial settings and install the agent-driven policy /// proposal skill if the flag is on at startup. /// @@ -451,3 +478,53 @@ async fn install_initial_agent_skill(sandbox_id: Option<&str>, openshell_endpoin ); } } + +#[cfg(test)] +mod tests { + use super::*; + use openshell_core::policy::{ + FilesystemPolicy, LandlockPolicy, NetworkMode, NetworkPolicy, ProcessPolicy, ProxyPolicy, + }; + + fn policy(mode: NetworkMode, http_addr: Option) -> SandboxPolicy { + SandboxPolicy { + version: 1, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy { + mode, + proxy: http_addr.map(|http_addr| ProxyPolicy { + http_addr: Some(http_addr), + }), + }, + landlock: LandlockPolicy::default(), + process: ProcessPolicy::default(), + } + } + + #[test] + fn ssh_proxy_url_uses_policy_addr_without_netns() { + let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 3128).into())); + + assert_eq!( + ssh_proxy_url_for_policy(&policy, None).as_deref(), + Some("http://127.0.0.1:3128") + ); + } + + #[test] + fn ssh_proxy_url_prefers_netns_host_with_policy_port() { + let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 8080).into())); + + assert_eq!( + ssh_proxy_url_for_policy(&policy, Some([10, 200, 0, 1].into())).as_deref(), + Some("http://10.200.0.1:8080") + ); + } + + #[test] + fn ssh_proxy_url_skips_non_proxy_mode() { + let policy = 
policy(NetworkMode::Allow, Some(([127, 0, 0, 1], 3128).into())); + + assert_eq!(ssh_proxy_url_for_policy(&policy, None), None); + } +} diff --git a/crates/openshell-supervisor-process/src/ssh.rs b/crates/openshell-supervisor-process/src/ssh.rs index 62d10f374..c55a6d877 100644 --- a/crates/openshell-supervisor-process/src/ssh.rs +++ b/crates/openshell-supervisor-process/src/ssh.rs @@ -6,7 +6,7 @@ use crate::child_env; #[cfg(target_os = "linux")] use crate::managed_children; -use crate::process::{drop_privileges, is_supervisor_only_env_var}; +use crate::process::{ProcessEnforcementMode, drop_privileges, is_supervisor_only_env_var}; use crate::sandbox; use miette::{IntoDiagnostic, Result}; use nix::pty::{Winsize, openpty}; @@ -42,6 +42,7 @@ type SshServerInit = ( fn ssh_server_init( listen_path: &Path, ca_file_paths: &Option<(PathBuf, PathBuf)>, + enforcement_mode: ProcessEnforcementMode, ) -> Result { let mut rng = OsRng; let host_key = PrivateKey::random(&mut rng, Algorithm::Ed25519).into_diagnostic()?; @@ -55,13 +56,16 @@ fn ssh_server_init( let config = Arc::new(config); let ca_paths = ca_file_paths.as_ref().map(|p| Arc::new(p.clone())); - // Ensure the parent directory exists and is root-owned with 0700 - // permissions. The sandbox entrypoint runs as an unprivileged user; it - // must not be able to enter this directory and connect to the socket. + // In full enforcement mode the supervisor starts as root and can isolate + // the SSH socket in a root-only directory before spawning unprivileged + // children. In network-only sidecar mode the process supervisor itself + // runs as the sandbox UID, so the driver points the socket at a writable + // sidecar state volume and accepts that Unix permissions no longer isolate + // same-UID child processes from the socket. 
if let Some(parent) = listen_path.parent() { std::fs::create_dir_all(parent).into_diagnostic()?; #[cfg(unix)] - { + if enforcement_mode.enforces_process_controls() { use std::os::unix::fs::PermissionsExt; let perms = std::fs::Permissions::from_mode(0o700); std::fs::set_permissions(parent, perms).into_diagnostic()?; @@ -108,21 +112,23 @@ pub async fn run_ssh_server( ca_file_paths: Option<(PathBuf, PathBuf)>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> Result<()> { - let (listener, config, ca_paths) = match ssh_server_init(&listen_path, &ca_file_paths) { - Ok(v) => { - // Signal that the SSH server has bound the socket and is ready to - // accept connections. The parent task awaits this before spawning - // the entrypoint process, ensuring exec requests won't race - // against server startup. - let _ = ready_tx.send(Ok(())); - v - } - Err(err) => { - let _ = ready_tx.send(Err(err)); - return Ok(()); - } - }; + let (listener, config, ca_paths) = + match ssh_server_init(&listen_path, &ca_file_paths, enforcement_mode) { + Ok(v) => { + // Signal that the SSH server has bound the socket and is ready to + // accept connections. The parent task awaits this before spawning + // the entrypoint process, ensuring exec requests won't race + // against server startup. + let _ = ready_tx.send(Ok(())); + v + } + Err(err) => { + let _ = ready_tx.send(Err(err)); + return Ok(()); + } + }; loop { let (stream, _peer) = listener.accept().await.into_diagnostic()?; @@ -145,6 +151,7 @@ pub async fn run_ssh_server( ca_paths, provider_credentials, user_environment, + enforcement_mode, ) .await { @@ -172,6 +179,7 @@ async fn handle_connection( ca_file_paths: Option>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> Result<()> { // Access is gated by the Unix-socket filesystem permissions (root-only), // not by an application-level preface. 
The supervisor bridges the @@ -195,6 +203,7 @@ async fn handle_connection( ca_file_paths, provider_credentials, user_environment, + enforcement_mode, ); russh::server::run_stream(config, stream, handler) .await @@ -223,6 +232,7 @@ struct SshHandler { ca_file_paths: Option>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, channels: HashMap, } @@ -236,6 +246,7 @@ impl SshHandler { ca_file_paths: Option>, provider_credentials: ProviderCredentialState, user_environment: HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> Self { Self { policy, @@ -245,6 +256,7 @@ impl SshHandler { ca_file_paths, provider_credentials, user_environment, + enforcement_mode, channels: HashMap::new(), } } @@ -468,6 +480,7 @@ impl russh::server::Handler for SshHandler { self.ca_file_paths.clone(), &self.provider_credentials.child_env_with_gcp_resolved(), &self.user_environment, + self.enforcement_mode, )?; let state = self.channels.get_mut(&channel).ok_or_else(|| { anyhow::anyhow!("subsystem_request on unknown channel {channel:?}") @@ -564,6 +577,7 @@ impl SshHandler { self.ca_file_paths.clone(), &provider_env, &self.user_environment, + self.enforcement_mode, )?; state.pty_master = Some(pty_master); state.input_sender = Some(input_sender); @@ -582,6 +596,7 @@ impl SshHandler { self.ca_file_paths.clone(), &provider_env, &self.user_environment, + self.enforcement_mode, )?; state.input_sender = Some(input_sender); } @@ -748,6 +763,7 @@ fn spawn_pty_shell( ca_file_paths: Option>, provider_env: &HashMap, user_environment: &HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> anyhow::Result<(std::fs::File, mpsc::Sender>)> { let winsize = Winsize { ws_row: to_u16(pty.row_height.max(1)), @@ -806,12 +822,20 @@ fn spawn_pty_shell( // Probe Landlock availability from the parent process where tracing works. 
#[cfg(target_os = "linux")] - sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + if enforcement_mode.enforces_process_controls() { + sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + } // Phase 1 (as root): Prepare Landlock ruleset before drop_privileges. #[cfg(target_os = "linux")] - let prepared_sandbox = sandbox::linux::prepare(policy, workdir.as_deref()) - .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?; + let prepared_sandbox = if enforcement_mode.enforces_process_controls() { + Some( + sandbox::linux::prepare(policy, workdir.as_deref()) + .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?, + ) + } else { + None + }; #[cfg(unix)] { @@ -821,6 +845,7 @@ fn spawn_pty_shell( workdir.clone(), slave_fd, netns_fd, + enforcement_mode, #[cfg(target_os = "linux")] prepared_sandbox, )?; @@ -913,6 +938,7 @@ fn spawn_pipe_exec( ca_file_paths: Option>, provider_env: &HashMap, user_environment: &HashMap, + enforcement_mode: ProcessEnforcementMode, ) -> anyhow::Result>> { let mut cmd = command.map_or_else( || { @@ -955,12 +981,20 @@ fn spawn_pipe_exec( // Probe Landlock availability from the parent process where tracing works. #[cfg(target_os = "linux")] - sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + if enforcement_mode.enforces_process_controls() { + sandbox::linux::log_sandbox_readiness(policy, workdir.as_deref()); + } // Phase 1 (as root): Prepare Landlock ruleset before drop_privileges. 
#[cfg(target_os = "linux")] - let prepared_sandbox = sandbox::linux::prepare(policy, workdir.as_deref()) - .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?; + let prepared_sandbox = if enforcement_mode.enforces_process_controls() { + Some( + sandbox::linux::prepare(policy, workdir.as_deref()) + .map_err(|err| anyhow::anyhow!("Failed to prepare sandbox: {err}"))?, + ) + } else { + None + }; #[cfg(unix)] { @@ -969,6 +1003,7 @@ fn spawn_pipe_exec( policy.clone(), workdir.clone(), netns_fd, + enforcement_mode, #[cfg(target_os = "linux")] prepared_sandbox, )?; @@ -1068,7 +1103,9 @@ fn spawn_pipe_exec( mod unsafe_pty { #[cfg(not(target_os = "linux"))] use super::sandbox; - use super::{Command, RawFd, SandboxPolicy, Winsize, drop_privileges, setsid}; + use super::{ + Command, ProcessEnforcementMode, RawFd, SandboxPolicy, Winsize, drop_privileges, setsid, + }; #[cfg(unix)] use std::os::unix::process::CommandExt; @@ -1107,17 +1144,21 @@ mod unsafe_pty { _workdir: Option, slave_fd: RawFd, netns_fd: Option, - #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, + enforcement_mode: ProcessEnforcementMode, + #[cfg(target_os = "linux")] prepared: Option, ) -> anyhow::Result<()> { // Wrap in Option so we can .take() it out of the FnMut closure. // pre_exec is only called once (after fork, before exec). #[cfg(target_os = "linux")] - let mut prepared = Some(prepared); + let mut prepared = prepared; #[cfg(target_os = "linux")] - let supervisor_identity_mount = crate::process::supervisor_identity_mount_from_env() - .map_err(|err| { + let supervisor_identity_mount = if enforcement_mode.enforces_process_controls() { + crate::process::supervisor_identity_mount_from_env().map_err(|err| { anyhow::anyhow!("failed to prepare supervisor identity isolation: {err}") - })?; + })? 
+ } else { + None + }; unsafe { cmd.pre_exec(move || { setsid().map_err(|err| std::io::Error::other(err.to_string()))?; @@ -1126,6 +1167,7 @@ mod unsafe_pty { enter_netns_and_sandbox( netns_fd, &policy, + enforcement_mode, #[cfg(target_os = "linux")] supervisor_identity_mount, #[cfg(target_os = "linux")] @@ -1152,20 +1194,25 @@ mod unsafe_pty { policy: SandboxPolicy, _workdir: Option, netns_fd: Option, - #[cfg(target_os = "linux")] prepared: crate::sandbox::linux::PreparedSandbox, + enforcement_mode: ProcessEnforcementMode, + #[cfg(target_os = "linux")] prepared: Option, ) -> anyhow::Result<()> { #[cfg(target_os = "linux")] - let mut prepared = Some(prepared); + let mut prepared = prepared; #[cfg(target_os = "linux")] - let supervisor_identity_mount = crate::process::supervisor_identity_mount_from_env() - .map_err(|err| { + let supervisor_identity_mount = if enforcement_mode.enforces_process_controls() { + crate::process::supervisor_identity_mount_from_env().map_err(|err| { anyhow::anyhow!("failed to prepare supervisor identity isolation: {err}") - })?; + })? + } else { + None + }; unsafe { cmd.pre_exec(move || { enter_netns_and_sandbox( netns_fd, &policy, + enforcement_mode, #[cfg(target_os = "linux")] supervisor_identity_mount, #[cfg(target_os = "linux")] @@ -1179,6 +1226,7 @@ mod unsafe_pty { fn enter_netns_and_sandbox( netns_fd: Option, policy: &SandboxPolicy, + enforcement_mode: ProcessEnforcementMode, #[cfg(target_os = "linux")] supervisor_identity_mount: Option< &crate::process::SupervisorIdentityMountNamespace, >, @@ -1207,7 +1255,9 @@ mod unsafe_pty { // Drop privileges. initgroups/setgid/setuid need /etc/group and // /etc/passwd which would be blocked if Landlock were already enforced. 
- drop_privileges(policy).map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + drop_privileges(policy).map_err(|err| std::io::Error::other(err.to_string()))?; + } crate::process::harden_child_process() .map_err(|err| std::io::Error::other(err.to_string()))?; @@ -1220,7 +1270,9 @@ mod unsafe_pty { } #[cfg(not(target_os = "linux"))] - sandbox::apply(policy, None).map_err(|err| std::io::Error::other(err.to_string()))?; + if enforcement_mode.enforces_process_controls() { + sandbox::apply(policy, None).map_err(|err| std::io::Error::other(err.to_string()))?; + } Ok(()) } @@ -1681,21 +1733,24 @@ mod tests { policy, None, None, // no netns fd + ProcessEnforcementMode::Full, #[cfg(target_os = "linux")] - sandbox::linux::prepare( - &SandboxPolicy { - version: 0, - filesystem: FilesystemPolicy::default(), - network: NetworkPolicy::default(), - landlock: LandlockPolicy::default(), - process: ProcessPolicy { - run_as_user: None, - run_as_group: None, + Some( + sandbox::linux::prepare( + &SandboxPolicy { + version: 0, + filesystem: FilesystemPolicy::default(), + network: NetworkPolicy::default(), + landlock: LandlockPolicy::default(), + process: ProcessPolicy { + run_as_user: None, + run_as_group: None, + }, }, - }, - None, - ) - .expect("prepare should succeed in test environment"), + None, + ) + .expect("prepare should succeed in test environment"), + ), ) .expect("install pre_exec should succeed"); diff --git a/deploy/docker/Dockerfile.supervisor b/deploy/docker/Dockerfile.supervisor index c84cc70e9..c760bbc89 100644 --- a/deploy/docker/Dockerfile.supervisor +++ b/deploy/docker/Dockerfile.supervisor @@ -5,10 +5,10 @@ # Supervisor image build. # -# The final image is `scratch`: it only carries the static `openshell-sandbox` -# binary used by Docker extraction, Podman image volumes, and the Kubernetes -# init container copy-self path. 
A static musl binary lets the image stay -# `scratch` while still being executable as an init container. +# The final image carries the static `openshell-sandbox` binary used by Docker +# extraction, Podman image volumes, and the Kubernetes init container copy-self +# path. It also includes nftables so the Kubernetes supervisor sidecar can +# install pod-namespace egress enforcement rules. # # The Rust binary is built natively before this image build runs and staged at: # deploy/docker/.build/prebuilt-binaries//openshell-sandbox @@ -19,17 +19,16 @@ # target) and uploads it as an artifact, which is downloaded into the same # staging directory before the image build job runs. -FROM scratch AS supervisor +FROM alpine:3.22 AS supervisor ARG TARGETARCH -# --chmod=0550 drops world-execute and survives the actions/upload-artifact -# + download-artifact roundtrip (which strips exec perms). Ownership is left -# at root (0:0) deliberately: the Podman driver mounts this image as a -# read-only image volume into the sandbox container and drops DAC_OVERRIDE, -# so the container's UID 0 must own the binary to read+exec it. Mode 0550 -# (r-xr-x---) is the security win; the chown to a non-root UID was breaking -# Podman without buying anything since the container is always UID 0. -COPY --chmod=0550 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /openshell-sandbox +RUN apk add --no-cache nftables iptables iptables-legacy + +# --chmod=0555 restores execute bits after the actions/upload-artifact + +# download-artifact roundtrip strips them. Ownership stays root (0:0) for +# Podman image-volume mounts, while world-execute lets the Kubernetes +# network sidecar run this binary as the dedicated non-root proxy UID. 
+COPY --chmod=0555 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /openshell-sandbox ENTRYPOINT ["/openshell-sandbox"] diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index f3a86f884..0ed095915 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -237,8 +237,10 @@ add `ci/values-spire.yaml` to the OpenShell release values files. | supervisor.image.pullPolicy | string | `""` | Supervisor image pull policy. Defaults to the gateway image pull policy when empty. | | supervisor.image.repository | string | `"ghcr.io/nvidia/openshell/supervisor"` | Supervisor image repository. | | supervisor.image.tag | string | `""` | Supervisor image tag. Defaults to the chart appVersion when empty. | +| supervisor.processEnforcement | string | `"network-only"` | Process/filesystem controls applied by the agent process supervisor in non-combined topologies. "network-only" keeps the low-permission agent shape; "full" grants combined-mode process/filesystem controls. | +| supervisor.proxyUid | int | `1337` | UID for the long-running network sidecar in sidecar topology. The network init container installs nftables rules that exempt this UID. | | supervisor.sideloadMethod | string | `""` | How the supervisor binary is delivered into sandbox pods. Empty (default) = auto-detect from cluster version: K8s >= v1.35 -> "image-volume" (ImageVolume enabled by default; GA in v1.36) K8s < v1.35 -> "init-container" (copies via init container + emptyDir) On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, set this to "image-volume" explicitly. | -| supervisor.topology | string | `"combined"` | Supervisor pod topology for Kubernetes sandboxes. "combined" runs networking and process supervision in the agent container. | +| supervisor.topology | string | `"combined"` | Supervisor pod topology for Kubernetes sandboxes. "combined" runs the current single supervisor container in the agent pod. 
"sidecar" runs network enforcement in a dedicated sidecar and the process supervisor as a low-capability wrapper in the agent container. | | tolerations | list | `[]` | Tolerations for the gateway pod. | | workload.allowMultiReplicaStatefulSet | bool | `false` | Allow replicaCount > 1 while rendering a StatefulSet. Prefer workload.kind=deployment for external database-backed multi-replica gateways; this override exists for operators who explicitly require StatefulSet identity or storage semantics. | | workload.kind | string | `"statefulset"` | Gateway workload controller kind. Use `statefulset` for the default SQLite database, or `deployment` when server.externalDbSecret points at an external database. | diff --git a/deploy/helm/openshell/ci/values-sidecar.yaml b/deploy/helm/openshell/ci/values-sidecar.yaml new file mode 100644 index 000000000..dac9e810f --- /dev/null +++ b/deploy/helm/openshell/ci/values-sidecar.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# CI/dev overlay for exercising the Kubernetes supervisor sidecar topology. +# +# Merge after values.yaml and ci/values-skaffold.yaml: +# helm install ... -f values.yaml -f ci/values-skaffold.yaml -f ci/values-sidecar.yaml +# +# Or set: +# OPENSHELL_E2E_KUBE_EXTRA_VALUES=deploy/helm/openshell/ci/values-sidecar.yaml +# before running `mise run e2e:kubernetes`. 
+supervisor: + topology: sidecar diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index d571c3bd9..76a9e7a5b 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -119,6 +119,8 @@ deploy: # To enable SPIFFE/SPIRE provider token grants (requires the # spire-crds and spire releases above): #- ci/values-spire.yaml + # To exercise the Kubernetes supervisor sidecar topology: + #- ci/values-sidecar.yaml # To test multi-replica external PostgreSQL behavior: #- ci/values-high-availability.yaml setValueTemplates: @@ -126,3 +128,9 @@ deploy: image.tag: '{{.IMAGE_TAG_openshell_gateway}}' supervisor.image.repository: '{{.IMAGE_REPO_openshell_supervisor}}' supervisor.image.tag: '{{.IMAGE_TAG_openshell_supervisor}}' +profiles: + - name: sidecar + patches: + - op: add + path: /deploy/helm/releases/0/valuesFiles/- + value: ci/values-sidecar.yaml diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 0e75a311f..9637c3328 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -114,6 +114,8 @@ data: service_account_name = {{ include "openshell.sandboxServiceAccountName" . | quote }} supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . 
| quote }} supervisor_topology = {{ .Values.supervisor.topology | default "combined" | quote }} + process_enforcement = {{ .Values.supervisor.processEnforcement | default "network-only" | quote }} + proxy_uid = {{ .Values.supervisor.proxyUid | default 1337 }} sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} {{- if .Values.server.providerTokenGrants.spiffe.enabled }} provider_spiffe_workload_api_socket_path = {{ .Values.server.providerTokenGrants.spiffe.workloadApiSocketPath | quote }} diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml index 50051e003..509eb4279 100644 --- a/deploy/helm/openshell/tests/gateway_config_test.yaml +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -83,21 +83,39 @@ tests: path: data["gateway.toml"] pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?service_account_name\s*=\s*"openshell-sandbox"' - - it: renders combined supervisor topology by default under [openshell.drivers.kubernetes] + - it: renders supervisor topology under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + set: + supervisor.topology: sidecar + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"sidecar"' + + - it: renders process enforcement under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + set: + supervisor.processEnforcement: full + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?process_enforcement\s*=\s*"full"' + + - it: renders default process enforcement under [openshell.drivers.kubernetes] template: templates/gateway-config.yaml asserts: - matchRegex: path: data["gateway.toml"] - pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"combined"' + pattern: 
'(?ms)\[openshell\.drivers\.kubernetes\].*?process_enforcement\s*=\s*"network-only"' - - it: renders explicit combined supervisor topology under [openshell.drivers.kubernetes] + - it: renders proxy uid under [openshell.drivers.kubernetes] template: templates/gateway-config.yaml set: - supervisor.topology: combined + supervisor.proxyUid: 2200 asserts: - matchRegex: path: data["gateway.toml"] - pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"combined"' + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?proxy_uid\s*=\s*2200' - it: renders sandbox image pull secrets under [openshell.drivers.kubernetes] template: templates/gateway-config.yaml diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index fd73dd071..c670a97b8 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -45,8 +45,17 @@ supervisor: # set this to "image-volume" explicitly. sideloadMethod: "" # -- Supervisor pod topology for Kubernetes sandboxes. - # "combined" runs networking and process supervision in the agent container. + # "combined" runs the current single supervisor container in the agent pod. + # "sidecar" runs network enforcement in a dedicated sidecar and the process + # supervisor as a low-capability wrapper in the agent container. topology: "combined" + # -- Process/filesystem controls applied by the agent process supervisor in + # non-combined topologies. "network-only" keeps the low-permission agent + # shape; "full" grants combined-mode process/filesystem controls. + processEnforcement: "network-only" + # -- UID for the long-running network sidecar in sidecar topology. The + # network init container installs nftables rules that exempt this UID. + proxyUid: 1337 # -- Image pull secrets attached to gateway and helper pods. 
imagePullSecrets: [] diff --git a/docs/kubernetes/access-control.mdx b/docs/kubernetes/access-control.mdx index 8824b6de1..5409a4b11 100644 --- a/docs/kubernetes/access-control.mdx +++ b/docs/kubernetes/access-control.mdx @@ -5,7 +5,7 @@ title: "Access Control" sidebar-title: "Access Control" description: "Configure OIDC user authentication or reverse-proxy auth termination for a Kubernetes-deployed OpenShell gateway." keywords: "Generative AI, Cybersecurity, Kubernetes, Authentication, mTLS, OIDC, Keycloak, Entra ID, Okta, Gateway Auth" -position: 4 +position: 5 --- The OpenShell gateway supports two access-control models for human callers on Kubernetes: diff --git a/docs/kubernetes/ingress.mdx b/docs/kubernetes/ingress.mdx index 844167246..66178bc3a 100644 --- a/docs/kubernetes/ingress.mdx +++ b/docs/kubernetes/ingress.mdx @@ -5,7 +5,7 @@ title: "Ingress" sidebar-title: "Ingress" description: "Expose the OpenShell gateway externally using the Kubernetes Gateway API and a GRPCRoute." keywords: "Generative AI, Cybersecurity, Kubernetes, Gateway API, Envoy Gateway, GRPCRoute, Ingress, External Access" -position: 3 +position: 4 --- By default, the OpenShell gateway is only reachable inside the cluster. To let CLI clients connect without a `kubectl port-forward`, expose the gateway through an ingress. diff --git a/docs/kubernetes/managing-certificates.mdx b/docs/kubernetes/managing-certificates.mdx index 179388151..885a5c952 100644 --- a/docs/kubernetes/managing-certificates.mdx +++ b/docs/kubernetes/managing-certificates.mdx @@ -5,7 +5,7 @@ title: "Managing Certificates" sidebar-title: "Managing Certificates" description: "Configure the OpenShell Helm chart to use cert-manager for mTLS certificate issuance and automatic renewal." keywords: "Generative AI, Cybersecurity, Kubernetes, cert-manager, PKI, TLS, mTLS, Certificates" -position: 2 +position: 3 --- The OpenShell gateway uses mTLS certificates for transport between the gateway and sandbox supervisors. 
These certificates are not Kubernetes user authentication; configure OIDC or a trusted access proxy for user access. The Helm chart supports two ways to provision and manage the certificate bundle: diff --git a/docs/kubernetes/openshift.mdx b/docs/kubernetes/openshift.mdx index b8313bdfe..caf799b51 100644 --- a/docs/kubernetes/openshift.mdx +++ b/docs/kubernetes/openshift.mdx @@ -5,7 +5,7 @@ title: "OpenShift" sidebar-title: "OpenShift" description: "Install the OpenShell Helm chart on OpenShift, including the SCC binding and chart overrides required by OpenShift's Security Context Constraints." keywords: "Generative AI, Cybersecurity, Kubernetes, OpenShift, SCC, Security Context Constraints, Helm, Gateway, Installation" -position: 5 +position: 6 --- diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index f6051b123..cc886c168 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -161,6 +161,7 @@ The most commonly changed values are: | `pkiInitJob.serverDnsNames` / `certManager.serverDnsNames` | Additional gateway server DNS SANs. Wildcard SANs also enable sandbox service URLs under that domain. | | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect based on cluster version: clusters running Kubernetes 1.35 or later use `image-volume` (ImageVolume GA in 1.36); older clusters use `init-container`. Set explicitly to `image-volume` on Kubernetes 1.33 or 1.34 with the ImageVolume feature gate enabled, or to `init-container` to force the legacy path on any version. | | `supervisor.topology` | Sandbox pod topology. Refer to [Topology](/kubernetes/topology). | +| `supervisor.proxyUid` | Non-root UID for the long-running network sidecar when `supervisor.topology=sidecar`. The UID must not match the sandbox UID. 
| Use a values file for repeatable deployments: @@ -244,7 +245,7 @@ The gateway exposes `/healthz` for process liveness and `/readyz` for dependency ## Next Steps -- To review Kubernetes sandbox topology, refer to [Topology](/kubernetes/topology). +- To choose between combined and sidecar sandbox pods, refer to [Topology](/kubernetes/topology). - To enable automatic certificate rotation with cert-manager, refer to [Managing Certificates](/kubernetes/managing-certificates). - To expose the gateway externally without port-forwarding, refer to [Ingress](/kubernetes/ingress). - To configure OIDC or reverse-proxy authentication, refer to [Access Control](/kubernetes/access-control). diff --git a/docs/kubernetes/topology.mdx b/docs/kubernetes/topology.mdx index f3f69cf6e..5bf942e35 100644 --- a/docs/kubernetes/topology.mdx +++ b/docs/kubernetes/topology.mdx @@ -3,37 +3,36 @@ # SPDX-License-Identifier: Apache-2.0 title: "Kubernetes Sandbox Topology" sidebar-title: "Topology" -description: "Review the default combined supervisor topology for Kubernetes sandbox pods." -keywords: "Generative AI, Cybersecurity, Kubernetes, Sandboxing, RuntimeClass" +description: "Choose between combined and sidecar supervisor topology for Kubernetes sandbox pods." +keywords: "Generative AI, Cybersecurity, Kubernetes, Sandboxing, Sidecar, Network Policy, RuntimeClass" position: 2 --- -Kubernetes sandbox pods run the OpenShell supervisor in `combined` topology by -default. Combined topology keeps network, filesystem, and process controls in -the agent pod so the supervisor can enforce the complete OpenShell sandbox -contract before launching the workload. +Kubernetes sandbox pods can run the OpenShell supervisor in `combined` or +`sidecar` topology. Choose the topology based on which controls you need inside +the pod and how much privilege your cluster allows on the agent container. ## Choose a Topology The default `combined` topology preserves the full OpenShell enforcement model. 
-Use it when you need OpenShell to apply all sandbox controls inside the workload -pod and your cluster policy permits the required Linux capabilities. +Use `sidecar` only when you accept network-focused enforcement in exchange for a +lower-privilege agent container. | Topology | Use when | Main tradeoff | |---|---|---| | `combined` | You need OpenShell network, filesystem, and process controls in the sandbox workload. | The agent container carries the Linux capabilities the supervisor needs. | - -Additional Kubernetes sandbox topologies are still being designed. Until they -are documented as supported configuration values, `combined` is the only -supported value for `supervisor.topology`. +| `sidecar` | You need the agent container to run as non-root without added Linux capabilities, and network policy is the primary control. | Defaults to network-only process supervision unless you opt in to `supervisor.processEnforcement=full`. | ## Privilege Model -The long-running container permissions for `combined` topology are: +The long-running container permissions differ by topology: | Topology | Pod or container | UID/GID | Privilege escalation | Capabilities | Result | |---|---|---|---|---|---| | `combined` | Agent container, which also runs the supervisor | Not forced by topology | Not explicitly disabled by the driver | Adds `SYS_ADMIN`, `NET_ADMIN`, `SYS_PTRACE`, and `SYSLOG`; adds `SETUID`, `SETGID`, and `DAC_READ_SEARCH` when user namespaces are enabled | Full supervisor controls run in the agent container. | +| `sidecar` | Agent container, process-only supervisor (`network-only`) | `sandbox_uid:sandbox_gid` | `false` | Drops `ALL` | Agent and workload run without added Linux capabilities. | +| `sidecar` | Agent container, process-only supervisor (`full`) | Root supervisor | Not explicitly disabled by the driver | Adds combined-mode capabilities | Agent keeps combined-mode process/filesystem guards.
| +| `sidecar` | Network supervisor sidecar | `proxyUid:sandbox_gid` | `false` | Drops `ALL` | Long-running proxy sidecar is also non-root without added capabilities. | Short-lived setup containers still have the permissions needed to prepare the pod: @@ -41,6 +40,7 @@ pod: | Topology | Setup container | UID/GID | Privilege escalation | Capabilities | Purpose | |---|---|---|---|---|---| | `combined` | Supervisor install init container | `0` | Not set | Not set | Copies the supervisor binary into the agent container volume. | +| `sidecar` | Network init container | `0` | `false` | Drops `ALL`; adds `NET_ADMIN`, `NET_RAW`, `CHOWN`, and `FOWNER` | Installs pod-local nftables rules and prepares shared sidecar state. | ## Combined Topology @@ -48,6 +48,26 @@ Combined topology is the original Kubernetes mode and remains the default. The agent container starts the OpenShell supervisor, and the supervisor launches the workload after applying sandbox setup. +```mermaid +flowchart TB + Sandbox["agents.x-k8s.io Sandbox"] + + subgraph Pod["Sandbox pod"] + subgraph Agent["agent container"] + Supervisor["OpenShell supervisor
network + process + filesystem"] + Workload["Agent workload"] + end + end + + Gateway["OpenShell Gateway"] + External["External services"] + + Sandbox --> Pod + Supervisor --> Workload + Supervisor -->|"gateway callback / SSH relay"| Gateway + Supervisor -->|"policy-enforced egress"| External +``` + Combined topology keeps these controls in one supervisor path: - Network endpoint and L7 policy enforcement. @@ -61,12 +81,90 @@ controls from the agent container, Kubernetes grants that container elevated Linux capabilities. Use this mode when you need the complete OpenShell sandbox contract and your cluster policy permits those capabilities. +## Sidecar Topology + +Sidecar topology splits the supervisor into a network sidecar and a +low-privilege process supervisor in the agent container. + +```mermaid +flowchart TB + Sandbox["agents.x-k8s.io Sandbox"] + + subgraph Pod["Sandbox pod"] + Init["network init container
root setup capabilities"] + State["shared state + TLS volumes"] + NetNS["pod network namespace"] + + subgraph Agent["agent container"] + ProcessSupervisor["process supervisor<br/>network-only by default"] + Workload["Agent workload"] + end + + NetworkSidecar["network supervisor sidecar
proxyUid"] + end + + Gateway["OpenShell Gateway"] + External["External services"] + + Sandbox --> Pod + Init -->|"installs nftables rules"| NetNS + ProcessSupervisor --> Workload + Workload -->|"egress redirected on loopback"| NetworkSidecar + NetworkSidecar -->|"gateway forwarding"| Gateway + NetworkSidecar -->|"policy-enforced egress"| External + ProcessSupervisor --- State + NetworkSidecar --- State +``` + +The pod contains these OpenShell-managed pieces: + +| Component | Runs as | Purpose | +|---|---|---| +| Network init container | Root with setup capabilities | Installs pod-level nftables rules and prepares shared sidecar state. | +| Network sidecar | `supervisor.proxyUid` | Runs the proxy, enforces network policy, writes proxy TLS material, and forwards gateway traffic on loopback. | +| Agent container | Resolved sandbox UID/GID | Runs the process supervisor and launches the user workload. | + +In this topology, the agent container defaults to `runAsNonRoot: true`, +`allowPrivilegeEscalation: false`, and `capabilities.drop: ["ALL"]`. Set +`supervisor.processEnforcement=full` only when you want combined-mode +process/filesystem guards and accept the added agent-container permissions. The +long-running network sidecar always drops all Linux capabilities. The root init +container keeps the setup capabilities needed to configure pod networking. + +Sidecar mode preserves gateway session behavior, including SSH connectivity, +because the process supervisor still owns the session relay. The network sidecar +handles outbound enforcement and forwards the process supervisor's gateway +traffic to the real gateway endpoint. + + +Sidecar mode defaults the process supervisor to `network-only`. OpenShell still +enforces network endpoint and L7 policy through the sidecar, but the process +supervisor does not apply Landlock filesystem policy, process privilege +dropping, or process/binary identity checks unless you opt in to +`supervisor.processEnforcement=full`. 
+ + +## Credential Exposure + +Sidecar topology uses pod `fsGroup` and group-readable projected credentials so +the non-root process supervisor can authenticate to the gateway. This includes +the projected ServiceAccount token used for sandbox token bootstrap and the +sandbox client TLS secret. + +Treat the agent container as trusted with respect to those in-pod gateway +credentials. Use `combined` topology when that credential exposure is not +acceptable for your deployment. + ## RuntimeClass Isolation -RuntimeClass isolation can add a stronger container boundary for the sandbox -workload when the cluster supports it. Runtime classes do not replace the -combined topology's supervisor controls; they add another isolation boundary -around the same supervised workload. +Sidecar topology pairs well with runtime classes such as gVisor or Kata +Containers when the cluster supports them. A sandboxed runtime strengthens the +container boundary while OpenShell focuses on network policy enforcement from +the sidecar. + +Runtime classes do not re-enable the OpenShell filesystem and process controls +that sidecar mode relaxes. Use them as an additional workload boundary, not as a +replacement for the combined topology's full supervisor controls. 
You can set a default runtime class in the Kubernetes driver configuration or override it per sandbox with driver config: @@ -77,24 +175,37 @@ openshell sandbox create \ -- claude ``` -## Configure Combined Mode +## Enable Sidecar Mode -For direct gateway TOML configuration, leave `supervisor_topology` unset, or -set it to `combined`, to use the default single-container supervisor path: +For direct gateway TOML configuration, set the Kubernetes driver fields: ```toml [openshell.drivers.kubernetes] -supervisor_topology = "combined" +supervisor_topology = "sidecar" +process_enforcement = "network-only" +proxy_uid = 1337 ``` -When the Helm chart renders `gateway.toml`, leave `supervisor.topology` unset, -or set it to `combined`, to produce the same driver configuration: +`proxy_uid` must be a non-root UID and must not match the sandbox UID. +The network init container exempts this UID from proxy redirection so the +sidecar can reach the gateway. +Set `process_enforcement="full"` only when you want the agent process supervisor +to keep combined-mode process/filesystem guards and accept the added +agent-container permissions. + +When the Helm chart renders `gateway.toml`, set the equivalent chart values: ```yaml supervisor: - topology: combined + topology: sidecar + processEnforcement: network-only + proxyUid: 1337 ``` +Leave `supervisor_topology` unset, or set it to `combined`, to keep the +original single-container supervisor path. For Helm installs, leave +`supervisor.topology` unset or set it to `combined`. + ## Next Steps - To install OpenShell on Kubernetes, refer to [Setup](/kubernetes/setup). 
diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index 88b82870d..fd125231e 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -177,9 +177,17 @@ supervisor_image_pull_policy = "IfNotPresent" # Use the image volume on Kubernetes >= 1.35 (GA in 1.36); switch to "init-container" # on older clusters or where the ImageVolume feature gate is off. supervisor_sideload_method = "image-volume" -# "combined" runs networking and process supervision in the sandbox agent -# container and preserves the existing Kubernetes sandbox behavior. +# "combined" runs the existing single supervisor container with full process, +# filesystem, and network enforcement in the agent container. "sidecar" moves +# pod-level network enforcement and gateway forwarding into a network sidecar. supervisor_topology = "combined" +# Process/filesystem controls for non-combined topologies. "network-only" +# keeps the low-permission agent shape; "full" grants combined-mode +# process/filesystem controls to the agent process supervisor. +process_enforcement = "network-only" +# UID used by the long-running network sidecar. In sidecar topology the +# network init container installs nftables rules that exempt this UID. +proxy_uid = 1337 grpc_endpoint = "https://openshell-gateway.agents.svc:8080" ssh_socket_path = "/run/openshell/ssh.sock" client_tls_secret_name = "openshell-client-tls" diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index f860d64d4..3dc305154 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -304,10 +304,30 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA | `supervisor_image` | `supervisor.image.repository` / `supervisor.image.tag` | Set the supervisor image that provides the `openshell-sandbox` binary. 
| | `supervisor_image_pull_policy` | `supervisor.image.pullPolicy` | Set the Kubernetes image pull policy for the supervisor image. | | `supervisor_sideload_method` | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect from cluster version. Set to `image-volume` to mount the supervisor OCI image directly as a volume (requires Kubernetes 1.33+ with the ImageVolume feature gate; GA in 1.36), or `init-container` to copy it through an init container on older clusters. | +| `supervisor_topology` | `supervisor.topology` | Set `combined` for the default single supervisor path, or `sidecar` to move pod-level network enforcement and gateway forwarding into a dedicated sidecar. | +| `process_enforcement` | `supervisor.processEnforcement` | Process/filesystem controls for non-combined topologies. `network-only` keeps the low-permission agent shape. `full` grants combined-mode process/filesystem controls to the agent process supervisor. | +| `proxy_uid` | `supervisor.proxyUid` | UID used by the long-running network sidecar in `sidecar` topology. The network init container exempts this UID from proxy redirection. | | `app_armor_profile` | `server.appArmorProfile` | Set the sandbox agent container's AppArmor profile. Helm defaults this to `Unconfined` so AppArmor-enabled nodes do not block supervisor network namespace setup. Set the Helm value to an empty string to omit the field, or use `RuntimeDefault` or `Localhost/` for operator-managed profiles. | | `workspace_default_storage_size` | `server.workspaceDefaultStorageSize` | Set the default workspace PVC size for new sandboxes. | | `sa_token_ttl_secs` | `server.sandboxJwt.k8sSaTokenTtlSecs` | Set the projected ServiceAccount token TTL used for the bootstrap token exchange. 
| +In `combined` topology, the agent container carries the Linux capabilities +needed by the supervisor for network namespace setup, Landlock filesystem +policy, process privilege changes, and network policy enforcement. In `sidecar` +topology, the agent container runs as the resolved sandbox UID/GID with no added +Linux capabilities. A root init container performs the nftables setup, and the +long-running sidecar runs non-root with no added Linux capabilities. Sidecar +mode keeps gateway session and SSH behavior, but the process supervisor runs in +`network-only` mode by default: filesystem policy, process privilege dropping, +and process/binary identity checks are not applied by the process container. +Set `process_enforcement = "full"` only when you want those combined-mode +process/filesystem guards and accept the added agent-container permissions. + +Sidecar mode uses pod `fsGroup` so the non-root process supervisor can read the +projected ServiceAccount token and sandbox client TLS secret required for +gateway authentication. Treat the workload container as trusted with respect to +those in-pod gateway credentials. + The Kubernetes driver creates namespaced `agents.x-k8s.io` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. It detects the served Sandbox API at runtime, caches the selected API version for the gateway process, and uses `v1beta1` when available before falling back to `v1alpha1`, so supported Agent Sandbox installations work without version-specific operator configuration. The Agent Sandbox controller turns those resources into sandbox pods and related storage. If Agent Sandbox is upgraded in place, restart the OpenShell gateway after the controller and CRD rollout completes so the gateway can detect the served API versions again. 
#######################################
# Select the container engine used for the Kubernetes e2e host-side fixture
# containers and export the choice as CONTAINER_ENGINE.
#
# Globals:
#   CONTAINER_ENGINE           (read, exported) Explicit override. Matched
#                              case-insensitively and re-exported in lowercase.
#   KUBE_CONTEXT               (read) Kube context name. Only k3d-* and kind-*
#                              contexts trigger auto-detection.
#   KIND_EXPERIMENTAL_PROVIDER (read) For kind-* contexts, "podman" (any case)
#                              selects podman; any other value selects docker.
# Outputs:
#   A one-line notice on stdout when the engine is auto-detected; an error on
#   stderr when the override is invalid.
# Returns:
#   0 when an engine was selected or when the context is not recognized (in
#   which case CONTAINER_ENGINE is left untouched). Exits the shell with
#   status 2 when CONTAINER_ENGINE is set to anything but docker or podman.
#######################################
configure_fixture_container_engine() {
  local selected_engine=""

  # An explicit override always wins; it is only validated and normalized.
  if [ -n "${CONTAINER_ENGINE:-}" ]; then
    selected_engine="$(printf '%s' "${CONTAINER_ENGINE}" | tr '[:upper:]' '[:lower:]')"
    case "${selected_engine}" in
      docker|podman)
        export CONTAINER_ENGINE="${selected_engine}"
        return
        ;;
      *)
        echo "ERROR: CONTAINER_ENGINE=${CONTAINER_ENGINE} is invalid; expected docker or podman" >&2
        exit 2
        ;;
    esac
  fi

  # KUBE_CONTEXT is defaulted like the other globals so that an unset value
  # falls through to the no-op arm instead of aborting under `set -u`.
  case "${KUBE_CONTEXT:-}" in
    k3d-*)
      selected_engine="docker"
      ;;
    kind-*)
      case "$(printf '%s' "${KIND_EXPERIMENTAL_PROVIDER:-}" | tr '[:upper:]' '[:lower:]')" in
        podman)
          selected_engine="podman"
          ;;
        *)
          selected_engine="docker"
          ;;
      esac
      ;;
    *)
      # Unrecognized context: leave CONTAINER_ENGINE as the caller had it.
      return
      ;;
  esac

  export CONTAINER_ENGINE="${selected_engine}"
  echo "Using ${CONTAINER_ENGINE} for Kubernetes e2e host-side fixture containers."
}
["e2e:kubernetes:db"] description = "Run Kubernetes e2e with all database backend scenarios (SQLite and external PostgreSQL with existingSecret)" env = { OPENSHELL_E2E_KUBE_DB_SCENARIOS = "1" }