From 0928bf7b926f5599b725c617cd0b4e8e8d379bec Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 2 Jul 2026 14:14:07 -0700 Subject: [PATCH] feat(kubernetes): add proxy-pod supervisor topology Add the Kubernetes proxy-pod topology with one supervisor Deployment and Service per sandbox, NetworkPolicy confinement, proxy-pod Helm/Skaffold configuration, topology documentation, and focused supervisor identity tests. Signed-off-by: Taylor Mutch --- .../skills/debug-openshell-cluster/SKILL.md | 19 +- .agents/skills/helm-dev-environment/SKILL.md | 47 +- Cargo.lock | 4 + Cargo.toml | 2 +- architecture/gateway.md | 8 +- crates/openshell-core/src/sandbox_env.rs | 20 + crates/openshell-driver-kubernetes/Cargo.toml | 1 + crates/openshell-driver-kubernetes/README.md | 14 +- .../openshell-driver-kubernetes/src/config.rs | 36 +- .../openshell-driver-kubernetes/src/driver.rs | 1723 +++++++++++++++-- crates/openshell-sandbox/src/lib.rs | 60 +- crates/openshell-sandbox/src/main.rs | 27 +- crates/openshell-server/src/auth/k8s_sa.rs | 284 ++- .../src/l7/tls.rs | 36 + .../openshell-supervisor-network/src/proxy.rs | 61 +- .../openshell-supervisor-network/src/run.rs | 50 +- .../openshell-supervisor-process/Cargo.toml | 1 + .../src/netns/mod.rs | 2 +- .../src/process.rs | 73 +- .../openshell-supervisor-process/src/run.rs | 59 +- deploy/helm/openshell/README.md | 4 +- .../helm/openshell/ci/values-proxy-pod.yaml | 18 + deploy/helm/openshell/skaffold.yaml | 10 + .../openshell/templates/gateway-config.yaml | 2 +- deploy/helm/openshell/templates/role.yaml | 46 +- .../openshell/tests/gateway_config_test.yaml | 9 + .../tests/sandbox_namespace_test.yaml | 43 + deploy/helm/openshell/values.yaml | 7 +- docs/kubernetes/setup.mdx | 9 +- docs/kubernetes/topology.mdx | 145 +- docs/reference/gateway-config.mdx | 7 +- docs/reference/sandbox-compute-drivers.mdx | 21 +- e2e/with-kube-gateway.sh | 15 + tasks/helm.toml | 15 + tasks/scripts/helm-k3s-local.sh | 4 + tasks/test.toml | 5 + 36 files changed, 
2516 insertions(+), 371 deletions(-) create mode 100644 deploy/helm/openshell/ci/values-proxy-pod.yaml diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 08e423013..fcb890257 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -274,8 +274,8 @@ If `supervisor_topology = "sidecar"` is rendered, sandbox pods should have an `openshell-supervisor-network` container running `--mode=network`. The init container owns nftables setup and should be the only sidecar topology container with `NET_ADMIN`. It also needs `CHOWN`/`FOWNER` to hand shared emptyDir state -to `sidecar_proxy_uid`. The long-running network sidecar runs as -`sidecar_proxy_uid` with primary GID `0` so it can read the root-owned, +to `proxy_uid`. The long-running network sidecar runs as +`proxy_uid` with primary GID `0` so it can read the root-owned, group-readable projected service-account token. In sidecar topology the `openshell-sa-token` projected volume should render `defaultMode: 288` (`0440`); if the proxy logs `failed to read K8s SA token`, verify this token mode and the @@ -284,6 +284,21 @@ workload entrypoint PID to `OPENSHELL_ENTRYPOINT_PID_FILE` (`/run/openshell-sidecar/entrypoint.pid` by default), and the network sidecar should read it for binary-scoped policy decisions; if allowed network rules are all denied, inspect that file and the network sidecar logs. + +If `supervisor_topology = "proxy-pod"` is rendered, each sandbox should have a +separate supervisor Deployment with one supervisor pod, a headless supervisor +Service, a proxy CA Secret, and two per-sandbox NetworkPolicies. The agent pod +should have `openshell.ai/sandbox-role=agent`; the supervisor pod should have +`openshell.ai/sandbox-role=supervisor`; both should share the same +`openshell.ai/sandbox-id`. The supervisor Deployment must have a controlling +`Sandbox` ownerReference. 
The Deployment pod template must carry the +`openshell.io/sandbox-id` annotation so the TokenReview bootstrap path can mint +a sandbox JWT. For supervisor pods, the gateway validates the +`Pod -> ReplicaSet -> Deployment -> Sandbox` owner chain, so missing +`apps/replicasets get` RBAC can also break bootstrap. If the agent cannot reach +the gateway, check DNS to the headless Service, the agent egress NetworkPolicy +DNS exception for kube-dns/CoreDNS, and the supervisor ingress NetworkPolicy +allowing only that agent pod on ports `3128` and `18080`. Inspect all three when sandbox registration or egress enforcement fails: ```bash diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 7d6ad7cd5..a2a34f8c0 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -65,10 +65,21 @@ mise run helm:skaffold:run mise run helm:skaffold:run:sidecar ``` -Both commands build the `gateway` and `supervisor` images and deploy the OpenShell Helm +**Supervisor proxy-pod topology** (build once and leave running): +```bash +mise run helm:skaffold:run:proxy-pod +``` + +All Skaffold commands build the `gateway` and `supervisor` images and deploy the OpenShell Helm chart. The sidecar profile renders an `openshell-network-init` init container for nftables setup and a non-root `openshell-supervisor-network` runtime sidecar for -proxying. The `pkiInitJob` hook (a pre-install Job that runs `openshell-gateway +proxying. The proxy-pod profile renders network supervision in a separate +supervisor Deployment with one pod and relies on Kubernetes NetworkPolicy +enforcement so the agent pod can reach only its paired supervisor plus DNS. The +default local k3s/k3d cluster keeps k3s's embedded NetworkPolicy controller +enabled; if you replace the CNI, install a policy-enforcing CNI before using +proxy-pod. 
The +`pkiInitJob` hook (a pre-install Job that runs `openshell-gateway generate-certs`) generates mTLS secrets on first install. Envoy Gateway opt-in; see the Optional Add-ons section below. @@ -79,6 +90,31 @@ The gateway Service uses ClusterIP. Access is via Envoy Gateway (port `8080`) or create the Secret named `openshell-ha-pg` with a `uri` key, then run `mise run helm:skaffold:run` or `mise run helm:skaffold:dev`. +### Kubernetes e2e profiles + +Run the default Kubernetes e2e environment: + +```bash +mise run e2e:kubernetes +``` + +Run the sidecar topology e2e environment: + +```bash +mise run e2e:kubernetes:sidecar +``` + +Run the proxy-pod topology e2e environment: + +```bash +mise run e2e:kubernetes:proxy-pod +``` + +The proxy-pod e2e task applies `ci/values-proxy-pod.yaml` through +`OPENSHELL_E2E_KUBE_EXTRA_VALUES`. Use an existing cluster with NetworkPolicy +enforcement, or let the wrapper create the default local k3d/k3s cluster with +k3s's embedded NetworkPolicy controller enabled. + ### TLS behaviour `ci/values-skaffold.yaml` sets `server.disableTls: true`, so Skaffold-based deploys run @@ -140,6 +176,12 @@ For a sidecar-profile deployment: mise run helm:skaffold:delete:sidecar ``` +For a proxy-pod-profile deployment: + +```bash +mise run helm:skaffold:delete:proxy-pod +``` + ### Delete the cluster entirely ```bash @@ -265,6 +307,7 @@ for dependencies still declared in `Chart.yaml`. 
| `deploy/helm/openshell/ci/values-high-availability.yaml` | HA test overlay (`replicaCount: 2` with external PostgreSQL Secret) | | `deploy/helm/openshell/ci/values-keycloak.yaml` | Keycloak OIDC overlay | | `deploy/helm/openshell/ci/values-sidecar.yaml` | Supervisor sidecar topology overlay for Kubernetes e2e/dev | +| `deploy/helm/openshell/ci/values-proxy-pod.yaml` | Supervisor proxy-pod topology overlay for Kubernetes e2e/dev; requires NetworkPolicy enforcement | | `deploy/helm/openshell/ci/values-spire.yaml` | SPIFFE/SPIRE provider token grant overlay | | `deploy/helm/openshell/ci/values-spire-stack.yaml` | SPIRE hardened chart values for local dev | | `deploy/helm/openshell/ci/values-tls-disabled.yaml` | Lint-only: TLS + auth disabled (reverse-proxy edge termination) | diff --git a/Cargo.lock b/Cargo.lock index e94eb56f0..d9f6d1160 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3683,6 +3683,7 @@ dependencies = [ "openshell-policy", "prost", "prost-types", + "rcgen", "serde", "serde_json", "temp-env", @@ -3993,6 +3994,7 @@ dependencies = [ "seccompiler", "serde_json", "sha2 0.10.9", + "temp-env", "tempfile", "tokio", "tokio-stream", @@ -4808,6 +4810,7 @@ dependencies = [ "ring", "rustls-pki-types", "time", + "x509-parser", "yasna", ] @@ -7776,6 +7779,7 @@ dependencies = [ "lazy_static", "nom", "oid-registry", + "ring", "rusticata-macros", "thiserror 1.0.69", "time", diff --git a/Cargo.toml b/Cargo.toml index f450cd5c8..c469cf1bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,7 @@ http-body-util = "0.1" tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "tls12", "ring"] } rustls = { version = "0.23", default-features = false, features = ["std", "logging", "tls12", "ring"] } rustls-pemfile = "2" -rcgen = { version = "0.13", features = ["crypto", "pem"] } +rcgen = { version = "0.13", features = ["crypto", "pem", "x509-parser"] } webpki-roots = "1" # CLI diff --git a/architecture/gateway.md b/architecture/gateway.md index 
d873b2a10..9b0e70977 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -64,9 +64,11 @@ Podman, and VM drivers deliver the initial token through supervisor-only runtime material; Kubernetes supervisors exchange a projected ServiceAccount token through `IssueSandboxToken`. The gateway validates that projected token with Kubernetes `TokenReview`, requires the configured sandbox service account, -checks the returned pod binding against the live pod UID, and verifies the pod's -controlling `Sandbox` ownerReference against the live Sandbox CR UID and -sandbox-id label before minting the gateway JWT. The bootstrap path accepts +checks the returned pod binding against the live pod UID, and verifies the +pod's ownership against the live Sandbox CR UID and sandbox-id label before +minting the gateway JWT. Agent pods must be directly controlled by the +`Sandbox` CR. Proxy-pod supervisor pods may be controlled through the Kubernetes +`Pod -> ReplicaSet -> Deployment -> Sandbox` chain. The bootstrap path accepts both `agents.x-k8s.io/v1beta1` ownerReferences from newer Agent Sandbox controllers and `agents.x-k8s.io/v1alpha1` ownerReferences from existing deployments. Supervisors renew gateway JWTs in memory before expiry only while diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index ae3a21787..d1ac71580 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -55,6 +55,10 @@ pub const NETWORK_BINARY_IDENTITY: &str = "OPENSHELL_NETWORK_BINARY_IDENTITY"; /// File written by the network supervisor when sidecar networking is ready. pub const SUPERVISOR_READY_FILE: &str = "OPENSHELL_SUPERVISOR_READY_FILE"; +/// TCP address the process supervisor waits for before starting when the +/// network supervisor runs outside the agent process. 
+pub const SUPERVISOR_READY_ADDR: &str = "OPENSHELL_SUPERVISOR_READY_ADDR"; + /// File written by the process supervisor with the workload entrypoint PID and /// read by the network sidecar for process/binary-bound network policy checks. pub const ENTRYPOINT_PID_FILE: &str = "OPENSHELL_ENTRYPOINT_PID_FILE"; @@ -66,10 +70,26 @@ pub const GATEWAY_FORWARD_ADDR: &str = "OPENSHELL_GATEWAY_FORWARD_ADDR"; /// gateway through a loopback TCP forward. pub const GATEWAY_TLS_SERVER_NAME: &str = "OPENSHELL_GATEWAY_TLS_SERVER_NAME"; +/// Explicit URL injected into sandbox child processes for proxy-mode egress. +/// +/// Kubernetes proxy-pod topology uses a headless Service DNS name, which +/// cannot be represented by the policy's `SocketAddr` proxy field. +pub const PROXY_URL: &str = "OPENSHELL_PROXY_URL"; + +/// Explicit listener address for the network supervisor's HTTP CONNECT proxy. +pub const PROXY_BIND_ADDR: &str = "OPENSHELL_PROXY_BIND_ADDR"; + /// Directory where the network supervisor writes the proxy CA files consumed /// by workload child processes. pub const PROXY_TLS_DIR: &str = "OPENSHELL_PROXY_TLS_DIR"; +/// Optional CA certificate PEM path used by the network supervisor instead of +/// generating an ephemeral CA. +pub const PROXY_CA_CERT_PATH: &str = "OPENSHELL_PROXY_CA_CERT_PATH"; + +/// Optional CA private key PEM path paired with [`PROXY_CA_CERT_PATH`]. +pub const PROXY_CA_KEY_PATH: &str = "OPENSHELL_PROXY_CA_KEY_PATH"; + /// Path to the CA certificate for mTLS communication with the gateway. 
pub const TLS_CA: &str = "OPENSHELL_TLS_CA"; diff --git a/crates/openshell-driver-kubernetes/Cargo.toml b/crates/openshell-driver-kubernetes/Cargo.toml index 2c02f864a..002635a71 100644 --- a/crates/openshell-driver-kubernetes/Cargo.toml +++ b/crates/openshell-driver-kubernetes/Cargo.toml @@ -34,6 +34,7 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true } thiserror = { workspace = true } miette = { workspace = true } +rcgen = { workspace = true } [dev-dependencies] temp-env = "0.3" diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 452b26484..f26b877c9 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -65,8 +65,18 @@ In this mode OpenShell preserves gateway session and SSH behavior, but the process supervisor defaults to network-only mode and does not apply Landlock filesystem policy, process privilege dropping, or process/binary identity checks. Network endpoint and L7 policy remain enforced by the network sidecar. -Set `process_enforcement = "full"` only when you want combined-mode -process/filesystem guards and accept the added agent-container permissions. + +The `proxy-pod` supervisor topology runs network enforcement and gateway +forwarding in a separate supervisor Deployment with one pod. The agent pod runs +only the process-mode supervisor and reaches the supervisor through a +per-sandbox headless Service. The driver creates an owner-referenced supervisor +Deployment with one replica plus Service, proxy CA Secret, and NetworkPolicy +resources so agent egress is limited to its paired supervisor pod plus DNS. If +the supervisor pod is deleted, the Deployment recreates it. + +Set `process_enforcement = "full"` in sidecar or proxy-pod topology only when +you want combined-mode process/filesystem guards and accept the added +agent-container permissions. 
Sidecar mode uses the pod `fsGroup` to make the projected service-account token and sandbox client TLS secret group-readable so the non-root process supervisor diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index a0d3920cd..2551776cb 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -15,7 +15,7 @@ pub const DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME: &str = "default"; /// Default storage size for the workspace PVC. pub const DEFAULT_WORKSPACE_STORAGE_SIZE: &str = "2Gi"; -/// Default UID for the long-running Kubernetes network supervisor sidecar. +/// Default UID for the long-running Kubernetes network proxy. pub const DEFAULT_PROXY_UID: u32 = 1337; /// How the supervisor binary is delivered into sandbox pods. @@ -65,6 +65,9 @@ pub enum SupervisorTopology { /// Run network supervision in a privileged sidecar and process supervision /// as a low-capability wrapper in the agent container. Sidecar, + /// Run network supervision in a separate supervisor pod and process + /// supervision as a low-capability wrapper in the agent pod. + ProxyPod, } impl std::fmt::Display for SupervisorTopology { @@ -72,6 +75,7 @@ impl std::fmt::Display for SupervisorTopology { match self { Self::Combined => f.write_str("combined"), Self::Sidecar => f.write_str("sidecar"), + Self::ProxyPod => f.write_str("proxy-pod"), } } } @@ -83,22 +87,23 @@ impl FromStr for SupervisorTopology { match s { "combined" => Ok(Self::Combined), "sidecar" => Ok(Self::Sidecar), + "proxy-pod" => Ok(Self::ProxyPod), other => Err(format!("unknown supervisor topology '{other}'")), } } } -/// Process/filesystem controls applied by the process supervisor in split -/// Kubernetes topologies. +/// Process/filesystem controls applied by the process supervisor in +/// non-combined Kubernetes topologies. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] pub enum ProcessEnforcementMode { - /// Preserve process launch and session relay behavior, but leave - /// filesystem/process guards to the network supervisor topology. + /// Preserve process launch and session relay behavior while network + /// enforcement is handled by the sidecar or proxy pod. #[default] NetworkOnly, - /// Run the process supervisor with the same process/filesystem controls as - /// combined topology. + /// Run the process supervisor with combined-mode process/filesystem + /// controls. Full, } @@ -255,9 +260,10 @@ pub struct KubernetesComputeConfig { /// non-combined topologies. `network-only` keeps the low-permission agent /// shape; `full` grants the agent supervisor combined-mode controls. pub process_enforcement: ProcessEnforcementMode, - /// UID used by the long-running network sidecar in `sidecar` topology. - /// The network init container installs nftables rules that exempt this - /// UID, so it must not match the sandbox workload UID. + /// UID used by the long-running network proxy in sidecar and proxy-pod + /// topologies. In sidecar topology, the network init container installs + /// nftables rules that exempt this UID, so it must not match the sandbox + /// workload UID. 
pub proxy_uid: u32, pub grpc_endpoint: String, pub ssh_socket_path: String, @@ -565,6 +571,16 @@ mod tests { assert_eq!(cfg.supervisor_topology, SupervisorTopology::Combined); } + #[test] + fn serde_override_supervisor_topology_proxy_pod() { + let json = serde_json::json!({ + "supervisor_topology": "proxy-pod" + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.supervisor_topology, SupervisorTopology::ProxyPod); + assert_eq!(cfg.supervisor_topology.to_string(), "proxy-pod"); + } + #[test] fn serde_override_process_enforcement_full() { let json = serde_json::json!({ diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index ae45b3f50..67f260f60 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -10,7 +10,9 @@ use crate::config::{ SupervisorSideloadMethod, SupervisorTopology, }; use futures::{Stream, StreamExt, TryStreamExt}; -use k8s_openapi::api::core::v1::{Event as KubeEventObj, Namespace, Node}; +use k8s_openapi::api::apps::v1::Deployment; +use k8s_openapi::api::core::v1::{Event as KubeEventObj, Namespace, Node, Secret, Service}; +use k8s_openapi::api::networking::v1::NetworkPolicy; use kube::api::{Api, ApiResource, DeleteParams, ListParams, PostParams}; use kube::core::gvk::GroupVersionKind; use kube::core::{DynamicObject, ObjectMeta}; @@ -33,7 +35,9 @@ use openshell_core::proto::compute::v1::{ watch_sandboxes_event, }; use openshell_core::proto_struct::{struct_to_json_object, value_to_json}; +use rcgen::{CertificateParams, DnType, IsCa, KeyPair, KeyUsagePurpose}; use serde::Deserialize; +use serde::de::DeserializeOwned; use std::collections::BTreeMap; use std::pin::Pin; use std::sync::Arc; @@ -566,6 +570,7 @@ impl KubernetesComputeDriver { supervisor_topology: self.config.supervisor_topology, process_enforcement: self.config.process_enforcement, proxy_uid: self.config.proxy_uid, + 
namespace: &self.config.namespace, service_account_name: &self.config.service_account_name, sandbox_id: &sandbox.id, sandbox_name: &sandbox.name, @@ -585,7 +590,7 @@ impl KubernetesComputeDriver { sandbox_uid: resolved_user_id, sandbox_gid: resolved_group_id, }; - validate_sidecar_proxy_identity(¶ms)?; + validate_proxy_identity(¶ms)?; let mut obj = DynamicObject::new(name, &agent_sandbox_api.resource); // Copy only the SCC-related annotations onto the Sandbox CR for @@ -615,19 +620,19 @@ impl KubernetesComputeDriver { }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); - match tokio::time::timeout( + let created = match tokio::time::timeout( KUBE_API_TIMEOUT, agent_sandbox_api.api.create(&PostParams::default(), &obj), ) .await { - Ok(Ok(_result)) => { + Ok(Ok(result)) => { info!( sandbox_id = %sandbox.id, sandbox_name = %name, "Sandbox created in Kubernetes successfully" ); - Ok(()) + result } Ok(Err(err)) => { warn!( @@ -636,7 +641,7 @@ impl KubernetesComputeDriver { error = %err, "Failed to create sandbox in Kubernetes" ); - Err(KubernetesDriverError::from_kube(err)) + return Err(KubernetesDriverError::from_kube(err)); } Err(_elapsed) => { warn!( @@ -645,12 +650,197 @@ impl KubernetesComputeDriver { timeout_secs = KUBE_API_TIMEOUT.as_secs(), "Timed out creating sandbox in Kubernetes" ); - Err(KubernetesDriverError::Message(format!( + return Err(KubernetesDriverError::Message(format!( "timed out after {}s waiting for Kubernetes API", KUBE_API_TIMEOUT.as_secs() - ))) + ))); } + }; + + if self.config.supervisor_topology == SupervisorTopology::ProxyPod + && let Err(err) = self + .create_proxy_pod_resources( + sandbox, + sandbox.spec.as_ref(), + ¶ms, + &created, + &agent_sandbox_api.resource.api_version, + ) + .await + { + warn!( + sandbox_id = %sandbox.id, + sandbox_name = %name, + error = %err, + "Failed to create proxy-pod resources; deleting Sandbox CR" + ); + self.cleanup_proxy_pod_resources(name).await; + let _ = tokio::time::timeout( + 
KUBE_API_TIMEOUT, + agent_sandbox_api.api.delete(name, &DeleteParams::default()), + ) + .await; + return Err(err); } + + Ok(()) + } + + async fn create_proxy_pod_resources( + &self, + sandbox: &Sandbox, + spec: Option<&SandboxSpec>, + params: &SandboxPodParams<'_>, + sandbox_cr: &DynamicObject, + sandbox_api_version: &str, + ) -> Result<(), KubernetesDriverError> { + let names = proxy_pod_resource_names(&sandbox.name); + let template_environment = spec + .and_then(|spec| spec.template.as_ref()) + .map(|template| template.environment.clone()) + .unwrap_or_default(); + let spec_environment = spec_pod_env(spec); + let deployment_owner_ref = + proxy_pod_owner_reference(sandbox_cr, sandbox_api_version, true)?; + let dependent_owner_ref = + proxy_pod_owner_reference(sandbox_cr, sandbox_api_version, false)?; + let (ca_cert_pem, ca_key_pem) = generate_proxy_pod_ca()?; + + let secret = proxy_pod_ca_secret( + &names, + params, + dependent_owner_ref.clone(), + &ca_cert_pem, + &ca_key_pem, + ); + let service = proxy_pod_supervisor_service(&names, params, dependent_owner_ref.clone()); + let agent_egress = + proxy_pod_agent_egress_network_policy(&names, params, dependent_owner_ref.clone()); + let supervisor_ingress = + proxy_pod_supervisor_ingress_network_policy(&names, params, dependent_owner_ref); + let supervisor_deployment = proxy_pod_supervisor_deployment( + &names, + &template_environment, + &spec_environment, + params, + deployment_owner_ref, + ); + + let secrets: Api = Api::namespaced(self.client.clone(), &self.config.namespace); + let services: Api = Api::namespaced(self.client.clone(), &self.config.namespace); + let policies: Api = + Api::namespaced(self.client.clone(), &self.config.namespace); + let deployments: Api = + Api::namespaced(self.client.clone(), &self.config.namespace); + + tokio::time::timeout( + KUBE_API_TIMEOUT, + secrets.create(&PostParams::default(), &secret), + ) + .await + .map_err(|_| { + KubernetesDriverError::Message(format!( + "timed out after 
{}s creating proxy-pod CA secret", + KUBE_API_TIMEOUT.as_secs() + )) + })? + .map_err(KubernetesDriverError::from_kube)?; + tokio::time::timeout( + KUBE_API_TIMEOUT, + services.create(&PostParams::default(), &service), + ) + .await + .map_err(|_| { + KubernetesDriverError::Message(format!( + "timed out after {}s creating proxy-pod service", + KUBE_API_TIMEOUT.as_secs() + )) + })? + .map_err(KubernetesDriverError::from_kube)?; + tokio::time::timeout( + KUBE_API_TIMEOUT, + policies.create(&PostParams::default(), &agent_egress), + ) + .await + .map_err(|_| { + KubernetesDriverError::Message(format!( + "timed out after {}s creating proxy-pod agent egress NetworkPolicy", + KUBE_API_TIMEOUT.as_secs() + )) + })? + .map_err(KubernetesDriverError::from_kube)?; + tokio::time::timeout( + KUBE_API_TIMEOUT, + policies.create(&PostParams::default(), &supervisor_ingress), + ) + .await + .map_err(|_| { + KubernetesDriverError::Message(format!( + "timed out after {}s creating proxy-pod supervisor ingress NetworkPolicy", + KUBE_API_TIMEOUT.as_secs() + )) + })? + .map_err(KubernetesDriverError::from_kube)?; + tokio::time::timeout( + KUBE_API_TIMEOUT, + deployments.create(&PostParams::default(), &supervisor_deployment), + ) + .await + .map_err(|_| { + KubernetesDriverError::Message(format!( + "timed out after {}s creating proxy-pod supervisor deployment", + KUBE_API_TIMEOUT.as_secs() + )) + })? 
+ .map_err(KubernetesDriverError::from_kube)?; + + info!( + sandbox_id = %sandbox.id, + sandbox_name = %sandbox.name, + supervisor_deployment = %names.supervisor_deployment, + service = %names.service, + "Created proxy-pod supervisor resources" + ); + Ok(()) + } + + async fn cleanup_proxy_pod_resources(&self, sandbox_name: &str) { + let names = proxy_pod_resource_names(sandbox_name); + let secrets: Api = Api::namespaced(self.client.clone(), &self.config.namespace); + let services: Api = Api::namespaced(self.client.clone(), &self.config.namespace); + let policies: Api = + Api::namespaced(self.client.clone(), &self.config.namespace); + let deployments: Api = + Api::namespaced(self.client.clone(), &self.config.namespace); + + let _ = tokio::time::timeout( + KUBE_API_TIMEOUT, + deployments.delete(&names.supervisor_deployment, &DeleteParams::default()), + ) + .await; + let _ = tokio::time::timeout( + KUBE_API_TIMEOUT, + policies.delete( + &names.supervisor_ingress_network_policy, + &DeleteParams::default(), + ), + ) + .await; + let _ = tokio::time::timeout( + KUBE_API_TIMEOUT, + policies.delete(&names.agent_egress_network_policy, &DeleteParams::default()), + ) + .await; + let _ = tokio::time::timeout( + KUBE_API_TIMEOUT, + services.delete(&names.service, &DeleteParams::default()), + ) + .await; + let _ = tokio::time::timeout( + KUBE_API_TIMEOUT, + secrets.delete(&names.proxy_ca_secret, &DeleteParams::default()), + ) + .await; } pub async fn delete_sandbox(&self, name: &str) -> Result { @@ -663,6 +853,9 @@ impl KubernetesComputeDriver { let agent_sandbox_api = self .supported_agent_sandbox_api(self.client.clone()) .await?; + if self.config.supervisor_topology == SupervisorTopology::ProxyPod { + self.cleanup_proxy_pod_resources(name).await; + } match tokio::time::timeout( KUBE_API_TIMEOUT, agent_sandbox_api.api.delete(name, &DeleteParams::default()), @@ -1069,6 +1262,18 @@ const SIDECAR_CLIENT_TLS_MOUNT_PATH: &str = "/etc/openshell-tls/proxy/client"; /// gateway endpoint 
using its own network privileges. const SIDECAR_GATEWAY_FORWARD_ADDR: &str = "127.0.0.1:18080"; +const LABEL_SANDBOX_ROLE: &str = "openshell.ai/sandbox-role"; +const SANDBOX_ROLE_AGENT: &str = "agent"; +const SANDBOX_ROLE_SUPERVISOR: &str = "supervisor"; +const PROXY_POD_PROXY_PORT: u16 = 3128; +const PROXY_POD_GATEWAY_FORWARD_PORT: u16 = 18080; +const PROXY_POD_GATEWAY_FORWARD_ADDR: &str = "0.0.0.0:18080"; +const PROXY_POD_NETWORK_ENFORCEMENT_MODE: &str = "proxy-pod"; +const PROXY_POD_CA_SECRET_MOUNT_PATH: &str = "/var/run/openshell-proxy-ca"; +const PROXY_POD_CA_CERT_FILE: &str = "openshell-ca.pem"; +const PROXY_POD_CA_KEY_FILE: &str = "openshell-ca-key.pem"; +const PROXY_POD_SSH_SOCKET_FILE: &str = "/tmp/openshell/ssh.sock"; + /// Build the emptyDir volume that holds the supervisor binary. /// /// The init container writes the binary here; the agent container reads it. @@ -1308,6 +1513,95 @@ fn gateway_tls_server_name(grpc_endpoint: &str) -> Option { .map(str::to_string) } +#[derive(Debug, Clone)] +struct ProxyPodResourceNames { + supervisor_deployment: String, + service: String, + proxy_ca_secret: String, + agent_egress_network_policy: String, + supervisor_ingress_network_policy: String, +} + +fn proxy_pod_resource_names(sandbox_name: &str) -> ProxyPodResourceNames { + ProxyPodResourceNames { + supervisor_deployment: dns_label_name("os-sup", sandbox_name), + service: dns_label_name("os-svc", sandbox_name), + proxy_ca_secret: dns_label_name("os-ca", sandbox_name), + agent_egress_network_policy: dns_label_name("os-eg", sandbox_name), + supervisor_ingress_network_policy: dns_label_name("os-ing", sandbox_name), + } +} + +fn dns_label_name(prefix: &str, name: &str) -> String { + let mut hash = 0xcbf2_9ce4_8422_2325_u64; + for byte in name.as_bytes() { + hash ^= u64::from(*byte); + hash = hash.wrapping_mul(0x0000_0100_0000_01b3); + } + let suffix_hash = hash & 0xffff_ffff; + let suffix = format!("{suffix_hash:08x}"); + let mut sanitized = name + .chars() + .map(|c| { 
+ let c = c.to_ascii_lowercase(); + if c.is_ascii_alphanumeric() || c == '-' { + c + } else { + '-' + } + }) + .collect::(); + sanitized = sanitized + .trim_matches('-') + .split('-') + .filter(|part| !part.is_empty()) + .collect::>() + .join("-"); + if sanitized.is_empty() { + sanitized = "sandbox".to_string(); + } + let max_base_len = 63usize.saturating_sub(prefix.len() + suffix.len() + 2); + if sanitized.len() > max_base_len { + sanitized.truncate(max_base_len); + sanitized = sanitized.trim_matches('-').to_string(); + } + format!("{prefix}-{sanitized}-{suffix}") +} + +fn proxy_pod_service_dns(service_name: &str, namespace: &str) -> String { + format!("{service_name}.{namespace}.svc.cluster.local") +} + +fn proxy_pod_process_gateway_endpoint(service_dns: &str, grpc_endpoint: &str) -> String { + if grpc_endpoint.is_empty() { + String::new() + } else if grpc_endpoint.starts_with("https://") { + format!("https://{service_dns}:{PROXY_POD_GATEWAY_FORWARD_PORT}") + } else { + format!("http://{service_dns}:{PROXY_POD_GATEWAY_FORWARD_PORT}") + } +} + +fn proxy_pod_proxy_url(service_dns: &str) -> String { + format!("http://{service_dns}:{PROXY_POD_PROXY_PORT}") +} + +fn apply_host_gateway_aliases( + spec: &mut serde_json::Map, + host_gateway_ip: &str, +) { + if host_gateway_ip.is_empty() { + return; + } + spec.insert( + "hostAliases".to_string(), + serde_json::json!([{ + "ip": host_gateway_ip, + "hostnames": ["host.docker.internal", "host.openshell.internal"] + }]), + ); +} + fn copy_log_level_env( env: &mut Vec, template_environment: &std::collections::HashMap, @@ -1682,131 +1976,434 @@ fn apply_supervisor_sidecar_topology( )); } -/// Apply workspace persistence transforms to an already-built pod template. -/// -/// This injects: -/// 1. A volume mount on the agent container at `/sandbox`. -/// 2. An init container (same image) that seeds the PVC with the image's -/// original `/sandbox` contents on first use. 
-/// -/// The PVC volume itself is **not** added here — the Sandbox CRD controller -/// automatically creates a volume for each entry in `volumeClaimTemplates` -/// (following the `StatefulSet` convention). Adding one here would create a -/// duplicate volume name and fail pod validation. -/// -/// The init container mounts the PVC at a temporary path so it can still see -/// the image's `/sandbox` directory. It checks for a sentinel file and skips -/// the copy if the PVC was already initialised. -#[allow(clippy::similar_names)] -fn apply_workspace_persistence( - pod_template: &mut serde_json::Value, +fn proxy_pod_ca_source_volume_mount() -> serde_json::Value { + serde_json::json!({ + "name": "openshell-proxy-pod-ca-source", + "mountPath": PROXY_POD_CA_SECRET_MOUNT_PATH, + "readOnly": true + }) +} + +fn proxy_pod_ca_tls_volume_mount() -> serde_json::Value { + serde_json::json!({ + "name": "openshell-proxy-pod-tls", + "mountPath": SIDECAR_TLS_MOUNT_PATH, + }) +} + +fn proxy_pod_ca_init_container( image: &str, image_pull_policy: &str, sandbox_gid: u32, +) -> serde_json::Value { + let copy_cmd = format!( + "set -eu; \ + mkdir -p {SIDECAR_TLS_MOUNT_PATH}; \ + cp {PROXY_POD_CA_SECRET_MOUNT_PATH}/{PROXY_POD_CA_CERT_FILE} {SIDECAR_TLS_MOUNT_PATH}/{PROXY_POD_CA_CERT_FILE}; \ + bundle={SIDECAR_TLS_MOUNT_PATH}/ca-bundle.pem; \ + found=0; \ + for path in /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt /etc/ssl/ca-bundle.pem /etc/ssl/cert.pem; do \ + if [ -f \"$path\" ]; then cat \"$path\" > \"$bundle\"; found=1; break; fi; \ + done; \ + if [ \"$found\" = 0 ]; then : > \"$bundle\"; fi; \ + printf '\\n' >> \"$bundle\"; \ + cat {PROXY_POD_CA_SECRET_MOUNT_PATH}/{PROXY_POD_CA_CERT_FILE} >> \"$bundle\"" + ); + let mut init_spec = serde_json::json!({ + "name": "openshell-proxy-ca-install", + "image": image, + "command": ["sh", "-c", copy_cmd], + "securityContext": { + "runAsUser": 0, + "runAsGroup": sandbox_gid, + "allowPrivilegeEscalation": false, + 
"capabilities": { + "drop": ["ALL"] + } + }, + "volumeMounts": [ + proxy_pod_ca_source_volume_mount(), + proxy_pod_ca_tls_volume_mount(), + ] + }); + if !image_pull_policy.is_empty() { + init_spec["imagePullPolicy"] = serde_json::json!(image_pull_policy); + } + init_spec +} + +fn apply_proxy_pod_affinity( + spec: &mut serde_json::Map, + sandbox_id: &str, +) { + if sandbox_id.is_empty() { + return; + } + + let affinity = spec + .entry("affinity".to_string()) + .or_insert_with(|| serde_json::json!({})); + if !affinity.is_object() { + *affinity = serde_json::json!({}); + } + let affinity = affinity + .as_object_mut() + .expect("affinity was converted to object"); + let pod_affinity = affinity + .entry("podAffinity".to_string()) + .or_insert_with(|| serde_json::json!({})); + if !pod_affinity.is_object() { + *pod_affinity = serde_json::json!({}); + } + let pod_affinity = pod_affinity + .as_object_mut() + .expect("podAffinity was converted to object"); + let required = pod_affinity + .entry("requiredDuringSchedulingIgnoredDuringExecution".to_string()) + .or_insert_with(|| serde_json::json!([])); + if !required.is_array() { + *required = serde_json::json!([]); + } + if let Some(required) = required.as_array_mut() { + required.push(serde_json::json!({ + "labelSelector": { + "matchLabels": proxy_pod_match_labels(sandbox_id, SANDBOX_ROLE_SUPERVISOR) + }, + "topologyKey": "kubernetes.io/hostname" + })); + } +} + +fn apply_supervisor_proxy_pod_topology( + pod_template: &mut serde_json::Value, + params: &SandboxPodParams<'_>, ) { let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { return; }; - // fsGroup is a pod-level field — it instructs kubelet to chown mounted - // volumes to this GID. It is invalid at the container securityContext level. 
- let pod_sc = spec + let pod_security_context = spec .entry("securityContext") .or_insert_with(|| serde_json::json!({})); - if let Some(pod_sc_obj) = pod_sc.as_object_mut() { - pod_sc_obj.insert("fsGroup".to_string(), serde_json::json!(sandbox_gid)); + if let Some(sc) = pod_security_context.as_object_mut() { + sc.insert("fsGroup".to_string(), serde_json::json!(params.sandbox_gid)); } - // 1. Add workspace volume mount to the agent container - let containers = spec.get_mut("containers").and_then(|v| v.as_array_mut()); - if let Some(containers) = containers { - let mut target_index = None; - for (i, c) in containers.iter().enumerate() { - if c.get("name").and_then(|v| v.as_str()) == Some("agent") { - target_index = Some(i); - break; - } - } - let index = target_index.unwrap_or(0); + apply_supervisor_binary_source( + spec, + params.supervisor_image, + params.supervisor_image_pull_policy, + params.supervisor_sideload_method, + ); - if let Some(container) = containers.get_mut(index).and_then(|v| v.as_object_mut()) { - let volume_mounts = container - .entry("volumeMounts") - .or_insert_with(|| serde_json::json!([])) - .as_array_mut(); - if let Some(volume_mounts) = volume_mounts { - volume_mounts.push(serde_json::json!({ - "name": WORKSPACE_VOLUME_NAME, - "mountPath": WORKSPACE_MOUNT_PATH - })); + apply_proxy_pod_affinity(spec, params.sandbox_id); + + let names = proxy_pod_resource_names(params.sandbox_name); + let service_dns = proxy_pod_service_dns(&names.service, params.namespace); + + let volumes = spec + .entry("volumes") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volumes) = volumes { + volumes.push(serde_json::json!({ + "name": "openshell-proxy-pod-ca-source", + "secret": { + "secretName": names.proxy_ca_secret, + "defaultMode": 0o444, + "items": [{ + "key": PROXY_POD_CA_CERT_FILE, + "path": PROXY_POD_CA_CERT_FILE, + }] } - } + })); + volumes.push(serde_json::json!({ + "name": "openshell-proxy-pod-tls", + "emptyDir": {} + })); } - 
// 3. Add the init container that seeds the PVC from the image + let image = spec + .get("containers") + .and_then(|v| v.as_array()) + .and_then(|containers| containers.first()) + .and_then(|container| container.get("image")) + .and_then(|value| value.as_str()) + .unwrap_or(params.default_image) + .to_string(); let init_containers = spec .entry("initContainers") .or_insert_with(|| serde_json::json!([])) .as_array_mut(); if let Some(init_containers) = init_containers { - // The init container mounts the PVC at a temp path so it can still - // read the image's original /sandbox contents. It copies them into - // the PVC only when the sentinel file is absent. - // - // Prefer a tar stream over `cp -a`: some sandbox images contain - // self-referential symlinks under `/sandbox/.uv`, and GNU cp can - // fail while seeding the PVC even though preserving the symlink as-is - // is valid. `tar` copies the tree without dereferencing those links. - // Archive only the contents, not the `/sandbox` directory entry - // itself, so extraction never tries to chmod the PVC mount root. - // Extract without restoring owner, mode, or timestamps so the - // non-root init container can seed kubelet-owned PVCs. - // - // The inner `[ -d ... ]` guard handles custom images that don't have - // a /sandbox directory — the copy is skipped but the sentinel is - // still written so subsequent starts are instant. - let copy_cmd = format!( - "if [ ! -f {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL} ]; then \ - if [ -d {WORKSPACE_MOUNT_PATH} ]; then \ - tmp=$(mktemp) && rm -f \"$tmp\" && \ - (cd {WORKSPACE_MOUNT_PATH} && find . 
-mindepth 1 -maxdepth 1 -exec tar -cf \"$tmp\" {{}} +) && \ - if [ -f \"$tmp\" ]; then \ - tar -C {WORKSPACE_INIT_MOUNT_PATH} --no-same-owner --no-same-permissions --touch -xf \"$tmp\" && \ - rm -f \"$tmp\"; \ - fi; \ - fi && \ - touch {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL}; \ - fi" - ); - - let mut init_spec = serde_json::json!({ - "name": WORKSPACE_INIT_CONTAINER_NAME, - "image": image, - "command": ["sh", "-c", copy_cmd], - "securityContext": { - "runAsUser": 0, - }, - "volumeMounts": [{ - "name": WORKSPACE_VOLUME_NAME, - "mountPath": WORKSPACE_INIT_MOUNT_PATH - }] - }); - if !image_pull_policy.is_empty() { - init_spec["imagePullPolicy"] = serde_json::json!(image_pull_policy); - } - init_containers.push(init_spec); + init_containers.push(proxy_pod_ca_init_container( + &image, + params.image_pull_policy, + params.sandbox_gid, + )); } -} -/// Build the default `volumeClaimTemplates` array for sandbox pods. -/// -/// Provides a single PVC named "workspace" that backs the `/sandbox` -/// directory. The init container seeds it from the image on first use. 
-fn default_workspace_volume_claim_templates(storage_size: &str) -> serde_json::Value { - let size = if storage_size.is_empty() { - DEFAULT_WORKSPACE_STORAGE_SIZE - } else { - storage_size + let Some(containers) = spec.get_mut("containers").and_then(|v| v.as_array_mut()) else { + return; + }; + let target_index = containers + .iter() + .position(|c| c.get("name").and_then(|v| v.as_str()) == Some("agent")) + .unwrap_or(0); + if let Some(container) = containers + .get_mut(target_index) + .and_then(|v| v.as_object_mut()) + { + container.insert( + "command".to_string(), + serde_json::json!([ + format!("{}/openshell-sandbox", SUPERVISOR_MOUNT_PATH), + "--mode=process" + ]), + ); + + let security_context = container + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(sc) = security_context.as_object_mut() { + match params.process_enforcement { + ProcessEnforcementMode::NetworkOnly => { + sc.insert( + "runAsUser".to_string(), + serde_json::json!(params.sandbox_uid), + ); + sc.insert( + "runAsGroup".to_string(), + serde_json::json!(params.sandbox_gid), + ); + sc.insert("runAsNonRoot".to_string(), serde_json::json!(true)); + sc.insert( + "allowPrivilegeEscalation".to_string(), + serde_json::json!(false), + ); + sc.insert( + "capabilities".to_string(), + serde_json::json!({ + "drop": ["ALL"] + }), + ); + } + ProcessEnforcementMode::Full => { + sc.insert("runAsUser".to_string(), serde_json::json!(0)); + sc.remove("runAsGroup"); + sc.remove("runAsNonRoot"); + sc.remove("allowPrivilegeEscalation"); + sc.entry("capabilities".to_string()).or_insert_with(|| { + serde_json::json!({ + "add": ["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"] + }) + }); + } + } + } + + let volume_mounts = container + .entry("volumeMounts") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volume_mounts) = volume_mounts { + volume_mounts.push(supervisor_volume_mount()); + volume_mounts.push(proxy_pod_ca_tls_volume_mount()); + } + + let env 
= container + .entry("env") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(env) = env { + let process_endpoint = + proxy_pod_process_gateway_endpoint(&service_dns, params.grpc_endpoint); + upsert_env( + env, + openshell_core::sandbox_env::ENDPOINT, + &process_endpoint, + ); + if let Some(server_name) = gateway_tls_server_name(params.grpc_endpoint) { + upsert_env( + env, + openshell_core::sandbox_env::GATEWAY_TLS_SERVER_NAME, + &server_name, + ); + } + upsert_env( + env, + openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY, + "proxy-pod", + ); + upsert_env( + env, + openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE, + PROXY_POD_NETWORK_ENFORCEMENT_MODE, + ); + upsert_env( + env, + openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE, + ¶ms.process_enforcement.to_string(), + ); + upsert_env( + env, + openshell_core::sandbox_env::SSH_SOCKET_PATH, + PROXY_POD_SSH_SOCKET_FILE, + ); + upsert_env( + env, + openshell_core::sandbox_env::PROXY_URL, + &proxy_pod_proxy_url(&service_dns), + ); + upsert_env( + env, + openshell_core::sandbox_env::SUPERVISOR_READY_ADDR, + &format!("{service_dns}:{PROXY_POD_PROXY_PORT}"), + ); + upsert_env( + env, + openshell_core::sandbox_env::PROXY_TLS_DIR, + SIDECAR_TLS_MOUNT_PATH, + ); + upsert_env( + env, + openshell_core::sandbox_env::SANDBOX_UID, + ¶ms.sandbox_uid.to_string(), + ); + upsert_env( + env, + openshell_core::sandbox_env::SANDBOX_GID, + ¶ms.sandbox_gid.to_string(), + ); + } + } +} + +/// Apply workspace persistence transforms to an already-built pod template. +/// +/// This injects: +/// 1. A volume mount on the agent container at `/sandbox`. +/// 2. An init container (same image) that seeds the PVC with the image's +/// original `/sandbox` contents on first use. +/// +/// The PVC volume itself is **not** added here — the Sandbox CRD controller +/// automatically creates a volume for each entry in `volumeClaimTemplates` +/// (following the `StatefulSet` convention). 
Adding one here would create a +/// duplicate volume name and fail pod validation. +/// +/// The init container mounts the PVC at a temporary path so it can still see +/// the image's `/sandbox` directory. It checks for a sentinel file and skips +/// the copy if the PVC was already initialised. +#[allow(clippy::similar_names)] +fn apply_workspace_persistence( + pod_template: &mut serde_json::Value, + image: &str, + image_pull_policy: &str, + sandbox_gid: u32, +) { + let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else { + return; + }; + + // fsGroup is a pod-level field — it instructs kubelet to chown mounted + // volumes to this GID. It is invalid at the container securityContext level. + let pod_sc = spec + .entry("securityContext") + .or_insert_with(|| serde_json::json!({})); + if let Some(pod_sc_obj) = pod_sc.as_object_mut() { + pod_sc_obj.insert("fsGroup".to_string(), serde_json::json!(sandbox_gid)); + } + + // 1. Add workspace volume mount to the agent container + let containers = spec.get_mut("containers").and_then(|v| v.as_array_mut()); + if let Some(containers) = containers { + let mut target_index = None; + for (i, c) in containers.iter().enumerate() { + if c.get("name").and_then(|v| v.as_str()) == Some("agent") { + target_index = Some(i); + break; + } + } + let index = target_index.unwrap_or(0); + + if let Some(container) = containers.get_mut(index).and_then(|v| v.as_object_mut()) { + let volume_mounts = container + .entry("volumeMounts") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(volume_mounts) = volume_mounts { + volume_mounts.push(serde_json::json!({ + "name": WORKSPACE_VOLUME_NAME, + "mountPath": WORKSPACE_MOUNT_PATH + })); + } + } + } + + // 3. 
Add the init container that seeds the PVC from the image + let init_containers = spec + .entry("initContainers") + .or_insert_with(|| serde_json::json!([])) + .as_array_mut(); + if let Some(init_containers) = init_containers { + // The init container mounts the PVC at a temp path so it can still + // read the image's original /sandbox contents. It copies them into + // the PVC only when the sentinel file is absent. + // + // Prefer a tar stream over `cp -a`: some sandbox images contain + // self-referential symlinks under `/sandbox/.uv`, and GNU cp can + // fail while seeding the PVC even though preserving the symlink as-is + // is valid. `tar` copies the tree without dereferencing those links. + // Archive only the contents, not the `/sandbox` directory entry + // itself, so extraction never tries to chmod the PVC mount root. + // Extract without restoring owner, mode, or timestamps so the + // non-root init container can seed kubelet-owned PVCs. + // + // The inner `[ -d ... ]` guard handles custom images that don't have + // a /sandbox directory — the copy is skipped but the sentinel is + // still written so subsequent starts are instant. + let copy_cmd = format!( + "if [ ! -f {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL} ]; then \ + if [ -d {WORKSPACE_MOUNT_PATH} ]; then \ + tmp=$(mktemp) && rm -f \"$tmp\" && \ + (cd {WORKSPACE_MOUNT_PATH} && find . 
-mindepth 1 -maxdepth 1 -exec tar -cf \"$tmp\" {{}} +) && \ + if [ -f \"$tmp\" ]; then \ + tar -C {WORKSPACE_INIT_MOUNT_PATH} --no-same-owner --no-same-permissions --touch -xf \"$tmp\" && \ + rm -f \"$tmp\"; \ + fi; \ + fi && \ + touch {WORKSPACE_INIT_MOUNT_PATH}/{WORKSPACE_SENTINEL}; \ + fi" + ); + + let mut init_spec = serde_json::json!({ + "name": WORKSPACE_INIT_CONTAINER_NAME, + "image": image, + "command": ["sh", "-c", copy_cmd], + "securityContext": { + "runAsUser": 0, + }, + "volumeMounts": [{ + "name": WORKSPACE_VOLUME_NAME, + "mountPath": WORKSPACE_INIT_MOUNT_PATH + }] + }); + if !image_pull_policy.is_empty() { + init_spec["imagePullPolicy"] = serde_json::json!(image_pull_policy); + } + init_containers.push(init_spec); + } +} + +/// Build the default `volumeClaimTemplates` array for sandbox pods. +/// +/// Provides a single PVC named "workspace" that backs the `/sandbox` +/// directory. The init container seeds it from the image on first use. +fn default_workspace_volume_claim_templates(storage_size: &str) -> serde_json::Value { + let size = if storage_size.is_empty() { + DEFAULT_WORKSPACE_STORAGE_SIZE + } else { + storage_size }; serde_json::json!([{ "metadata": { @@ -1834,6 +2431,7 @@ struct SandboxPodParams<'a> { supervisor_topology: SupervisorTopology, process_enforcement: ProcessEnforcementMode, proxy_uid: u32, + namespace: &'a str, service_account_name: &'a str, sandbox_id: &'a str, sandbox_name: &'a str, @@ -1868,6 +2466,7 @@ impl Default for SandboxPodParams<'_> { supervisor_topology: SupervisorTopology::default(), process_enforcement: ProcessEnforcementMode::default(), proxy_uid: DEFAULT_PROXY_UID, + namespace: "default", service_account_name: DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, sandbox_id: "", sandbox_name: "", @@ -1888,14 +2487,15 @@ impl Default for SandboxPodParams<'_> { } } -fn validate_sidecar_proxy_identity( - params: &SandboxPodParams<'_>, -) -> Result<(), KubernetesDriverError> { - if params.supervisor_topology == 
SupervisorTopology::Sidecar - && params.proxy_uid == params.sandbox_uid +fn validate_proxy_identity(params: &SandboxPodParams<'_>) -> Result<(), KubernetesDriverError> { + if matches!( + params.supervisor_topology, + SupervisorTopology::Sidecar | SupervisorTopology::ProxyPod + ) && params.proxy_uid == params.sandbox_uid { + let topology = params.supervisor_topology.to_string(); return Err(KubernetesDriverError::Precondition(format!( - "proxy_uid ({}) must not match sandbox_uid ({}) in sidecar topology", + "proxy_uid ({}) must not match sandbox_uid ({}) in {topology} topology", params.proxy_uid, params.sandbox_uid ))); } @@ -2004,7 +2604,8 @@ fn sandbox_template_to_k8s_with_gpu_requirements( .iter() .map(|(key, value)| (key.clone(), serde_json::Value::String(value.clone()))) .collect::>(); - if params.provider_spiffe_enabled { + let proxy_pod_topology = params.supervisor_topology == SupervisorTopology::ProxyPod; + if params.provider_spiffe_enabled || proxy_pod_topology { pod_labels.insert( LABEL_MANAGED_BY.to_string(), serde_json::Value::String(LABEL_MANAGED_BY_VALUE.to_string()), @@ -2016,6 +2617,12 @@ fn sandbox_template_to_k8s_with_gpu_requirements( ); } } + if proxy_pod_topology { + pod_labels.insert( + LABEL_SANDBOX_ROLE.to_string(), + serde_json::Value::String(SANDBOX_ROLE_AGENT.to_string()), + ); + } if !pod_labels.is_empty() { metadata.insert("labels".to_string(), serde_json::Value::Object(pod_labels)); } @@ -2200,7 +2807,7 @@ fn sandbox_template_to_k8s_with_gpu_requirements( if !params.client_tls_secret_name.is_empty() { let client_tls_default_mode = match params.supervisor_topology { SupervisorTopology::Combined => 0o400, - SupervisorTopology::Sidecar => 0o440, + SupervisorTopology::Sidecar | SupervisorTopology::ProxyPod => 0o440, }; volumes.push(serde_json::json!({ "name": "openshell-client-tls", @@ -2226,7 +2833,7 @@ fn sandbox_template_to_k8s_with_gpu_requirements( // supervisor containers run with the sandbox GID and need group-read access. 
let sa_token_default_mode = match params.supervisor_topology { SupervisorTopology::Combined => 0o400, - SupervisorTopology::Sidecar => 0o440, + SupervisorTopology::Sidecar | SupervisorTopology::ProxyPod => 0o440, }; volumes.push(serde_json::json!({ "name": "openshell-sa-token", @@ -2244,15 +2851,7 @@ fn sandbox_template_to_k8s_with_gpu_requirements( spec.insert("volumes".to_string(), serde_json::Value::Array(volumes)); // Add hostAliases so sandbox pods can reach the Docker host. - if !params.host_gateway_ip.is_empty() { - spec.insert( - "hostAliases".to_string(), - serde_json::json!([{ - "ip": params.host_gateway_ip, - "hostnames": ["host.docker.internal", "host.openshell.internal"] - }]), - ); - } + apply_host_gateway_aliases(&mut spec, params.host_gateway_ip); let mut template_value = serde_json::Map::new(); if !metadata.is_empty() { @@ -2281,6 +2880,9 @@ fn sandbox_template_to_k8s_with_gpu_requirements( params, ); } + SupervisorTopology::ProxyPod => { + apply_supervisor_proxy_pod_topology(&mut result, params); + } } // Inject workspace persistence (init container + PVC volume mount) so @@ -2324,64 +2926,567 @@ fn apply_pod_driver_config( *tolerations = serde_json::Value::Array(config.tolerations.clone()); } } -} - -fn apply_agent_driver_resources( - container: &mut serde_json::Map, - resources: &KubernetesContainerResourceConfig, -) { - if resources.requests.is_empty() && resources.limits.is_empty() { - return; +} + +fn apply_agent_driver_resources( + container: &mut serde_json::Map, + resources: &KubernetesContainerResourceConfig, +) { + if resources.requests.is_empty() && resources.limits.is_empty() { + return; + } + + let target = container + .entry("resources".to_string()) + .or_insert_with(|| serde_json::json!({})); + apply_resource_quantity_map(target, "requests", &resources.requests); + apply_resource_quantity_map(target, "limits", &resources.limits); +} + +fn merge_string_map(target: &mut serde_json::Value, values: &BTreeMap) { + if !target.is_object() 
{ + *target = serde_json::json!({}); + } + let target = target + .as_object_mut() + .expect("target was converted to object"); + for (key, value) in values { + target + .entry(key.clone()) + .or_insert_with(|| serde_json::json!(value)); + } +} + +fn apply_resource_quantity_map( + target: &mut serde_json::Value, + section: &str, + values: &BTreeMap, +) { + if values.is_empty() { + return; + } + if !target.is_object() { + *target = serde_json::json!({}); + } + let target = target + .as_object_mut() + .expect("target was converted to object"); + let section_value = target + .entry(section.to_string()) + .or_insert_with(|| serde_json::json!({})); + merge_string_map(section_value, values); +} + +fn image_pull_secret_refs(secrets: &[String]) -> Vec { + secrets + .iter() + .map(|secret| secret.trim()) + .filter(|secret| !secret.is_empty()) + .map(|secret| serde_json::json!({ "name": secret })) + .collect() +} + +fn k8s_object(value: serde_json::Value) -> T +where + T: DeserializeOwned, +{ + serde_json::from_value(value).expect("driver rendered an invalid Kubernetes object") +} + +fn generate_proxy_pod_ca() -> Result<(String, String), KubernetesDriverError> { + let ca_key = KeyPair::generate().map_err(|err| { + KubernetesDriverError::Message(format!("failed to generate CA key: {err}")) + })?; + + let mut params = CertificateParams::default(); + params.is_ca = IsCa::Ca(rcgen::BasicConstraints::Unconstrained); + params + .distinguished_name + .push(DnType::CommonName, "OpenShell Proxy Pod Sandbox CA"); + params + .distinguished_name + .push(DnType::OrganizationName, "OpenShell"); + params.key_usages = vec![KeyUsagePurpose::KeyCertSign, KeyUsagePurpose::CrlSign]; + + let ca_cert = params.self_signed(&ca_key).map_err(|err| { + KubernetesDriverError::Message(format!("failed to generate CA certificate: {err}")) + })?; + Ok((ca_cert.pem(), ca_key.serialize_pem())) +} + +fn proxy_pod_owner_reference( + sandbox_cr: &DynamicObject, + api_version: &str, + controller: bool, +) -> 
Result { + let name = + sandbox_cr.metadata.name.as_deref().ok_or_else(|| { + KubernetesDriverError::Message("created Sandbox is missing name".into()) + })?; + let uid = + sandbox_cr.metadata.uid.as_deref().ok_or_else(|| { + KubernetesDriverError::Message("created Sandbox is missing uid".into()) + })?; + Ok(serde_json::json!({ + "apiVersion": sandbox_cr + .types + .as_ref() + .map_or(api_version, |types| types.api_version.as_str()), + "kind": SANDBOX_KIND, + "name": name, + "uid": uid, + "controller": controller, + "blockOwnerDeletion": false, + })) +} + +fn proxy_pod_labels(sandbox_id: &str, role: &str) -> serde_json::Value { + let mut labels = serde_json::Map::new(); + labels.insert( + LABEL_MANAGED_BY.to_string(), + serde_json::json!(LABEL_MANAGED_BY_VALUE), + ); + labels.insert(LABEL_SANDBOX_ID.to_string(), serde_json::json!(sandbox_id)); + labels.insert(LABEL_SANDBOX_ROLE.to_string(), serde_json::json!(role)); + serde_json::Value::Object(labels) +} + +fn proxy_pod_match_labels(sandbox_id: &str, role: &str) -> serde_json::Value { + let mut labels = serde_json::Map::new(); + labels.insert(LABEL_SANDBOX_ID.to_string(), serde_json::json!(sandbox_id)); + labels.insert(LABEL_SANDBOX_ROLE.to_string(), serde_json::json!(role)); + serde_json::Value::Object(labels) +} + +fn proxy_pod_object_meta( + name: &str, + namespace: &str, + sandbox_id: &str, + role: &str, + owner_ref: serde_json::Value, +) -> serde_json::Value { + serde_json::json!({ + "name": name, + "namespace": namespace, + "labels": proxy_pod_labels(sandbox_id, role), + "annotations": { + "openshell.io/sandbox-id": sandbox_id + }, + "ownerReferences": [owner_ref] + }) +} + +fn proxy_pod_supervisor_env( + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, +) -> Vec { + let mut env = Vec::new(); + apply_required_env( + &mut env, + params.sandbox_id, + params.sandbox_name, + params.grpc_endpoint, + "", + false, + 
provider_spiffe_socket_path(params), + ); + if !params.client_tls_secret_name.is_empty() { + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_CA, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/ca.crt"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_CERT, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/tls.crt"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::TLS_KEY, + &format!("{SIDECAR_CLIENT_TLS_MOUNT_PATH}/tls.key"), + ); + } + copy_log_level_env(&mut env, template_environment, spec_environment); + upsert_env( + &mut env, + openshell_core::sandbox_env::SUPERVISOR_TOPOLOGY, + "proxy-pod", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE, + PROXY_POD_NETWORK_ENFORCEMENT_MODE, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::NETWORK_BINARY_IDENTITY, + "relaxed", + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::GATEWAY_FORWARD_ADDR, + PROXY_POD_GATEWAY_FORWARD_ADDR, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::PROXY_BIND_ADDR, + &format!("0.0.0.0:{PROXY_POD_PROXY_PORT}"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::PROXY_TLS_DIR, + SIDECAR_TLS_MOUNT_PATH, + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::PROXY_CA_CERT_PATH, + &format!("{PROXY_POD_CA_SECRET_MOUNT_PATH}/{PROXY_POD_CA_CERT_FILE}"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::PROXY_CA_KEY_PATH, + &format!("{PROXY_POD_CA_SECRET_MOUNT_PATH}/{PROXY_POD_CA_KEY_FILE}"), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::SANDBOX_UID, + ¶ms.sandbox_uid.to_string(), + ); + upsert_env( + &mut env, + openshell_core::sandbox_env::SANDBOX_GID, + ¶ms.sandbox_gid.to_string(), + ); + env +} + +fn proxy_pod_ca_secret( + names: &ProxyPodResourceNames, + params: &SandboxPodParams<'_>, + owner_ref: serde_json::Value, + cert_pem: &str, + key_pem: &str, +) -> Secret { + let mut string_data = serde_json::Map::new(); + string_data.insert( + 
PROXY_POD_CA_CERT_FILE.to_string(), + serde_json::json!(cert_pem), + ); + string_data.insert( + PROXY_POD_CA_KEY_FILE.to_string(), + serde_json::json!(key_pem), + ); + k8s_object(serde_json::json!({ + "apiVersion": "v1", + "kind": "Secret", + "metadata": { + "name": names.proxy_ca_secret, + "namespace": params.namespace, + "labels": proxy_pod_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR), + "ownerReferences": [owner_ref], + }, + "type": "Opaque", + "stringData": serde_json::Value::Object(string_data) + })) +} + +fn proxy_pod_supervisor_service( + names: &ProxyPodResourceNames, + params: &SandboxPodParams<'_>, + owner_ref: serde_json::Value, +) -> Service { + k8s_object(serde_json::json!({ + "apiVersion": "v1", + "kind": "Service", + "metadata": { + "name": names.service, + "namespace": params.namespace, + "labels": proxy_pod_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR), + "ownerReferences": [owner_ref], + }, + "spec": { + "clusterIP": "None", + "publishNotReadyAddresses": true, + "selector": proxy_pod_match_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR), + "ports": [ + { + "name": "http-proxy", + "port": PROXY_POD_PROXY_PORT, + "targetPort": PROXY_POD_PROXY_PORT, + "protocol": "TCP" + }, + { + "name": "gateway-forward", + "port": PROXY_POD_GATEWAY_FORWARD_PORT, + "targetPort": PROXY_POD_GATEWAY_FORWARD_PORT, + "protocol": "TCP" + } + ] + } + })) +} + +fn proxy_pod_supervisor_deployment( + names: &ProxyPodResourceNames, + template_environment: &std::collections::HashMap, + spec_environment: &std::collections::HashMap, + params: &SandboxPodParams<'_>, + owner_ref: serde_json::Value, +) -> Deployment { + let mut container = serde_json::json!({ + "name": SUPERVISOR_NETWORK_SIDECAR_NAME, + "image": params.supervisor_image, + "command": [ + SUPERVISOR_IMAGE_BINARY_PATH, + "--mode=network", + ], + "env": proxy_pod_supervisor_env(template_environment, spec_environment, params), + "ports": [ + {"name": "http-proxy", "containerPort": PROXY_POD_PROXY_PORT, 
"protocol": "TCP"}, + {"name": "gateway-fwd", "containerPort": PROXY_POD_GATEWAY_FORWARD_PORT, "protocol": "TCP"} + ], + "readinessProbe": { + "tcpSocket": {"port": PROXY_POD_PROXY_PORT}, + "periodSeconds": 2, + "failureThreshold": 30 + }, + "securityContext": { + "runAsUser": params.proxy_uid, + "runAsGroup": params.sandbox_gid, + "runAsNonRoot": true, + "allowPrivilegeEscalation": false, + "capabilities": { + "drop": ["ALL"] + } + }, + "volumeMounts": [ + { + "name": "openshell-sa-token", + "mountPath": "/var/run/secrets/openshell", + "readOnly": true + }, + { + "name": "openshell-proxy-pod-ca-source", + "mountPath": PROXY_POD_CA_SECRET_MOUNT_PATH, + "readOnly": true + }, + proxy_pod_ca_tls_volume_mount(), + ] + }); + if !params.supervisor_image_pull_policy.is_empty() { + container["imagePullPolicy"] = serde_json::json!(params.supervisor_image_pull_policy); + } + if !params.client_tls_secret_name.is_empty() { + container["volumeMounts"] + .as_array_mut() + .expect("volumeMounts is an array") + .push(serde_json::json!({ + "name": "openshell-client-tls", + "mountPath": SIDECAR_CLIENT_TLS_MOUNT_PATH, + "readOnly": true + })); + } + if params.provider_spiffe_enabled { + container["volumeMounts"] + .as_array_mut() + .expect("volumeMounts is an array") + .push(serde_json::json!({ + "name": SPIFFE_WORKLOAD_API_VOLUME_NAME, + "mountPath": spiffe_socket_mount_path(params.provider_spiffe_workload_api_socket_path), + "readOnly": true, + })); + } + if let Some(profile) = params.app_armor_profile { + container["securityContext"]["appArmorProfile"] = app_armor_profile_to_k8s(profile); + } + + let mut spec = serde_json::json!({ + "serviceAccountName": params.service_account_name, + "automountServiceAccountToken": false, + "securityContext": { + "fsGroup": params.sandbox_gid + }, + "containers": [container], + "volumes": [ + { + "name": "openshell-sa-token", + "projected": { + "sources": [{ + "serviceAccountToken": { + "audience": "openshell-gateway", + "expirationSeconds": 
params.sa_token_ttl_secs, + "path": "token" + } + }], + "defaultMode": 0o440 + } + }, + { + "name": "openshell-proxy-pod-ca-source", + "secret": { + "secretName": names.proxy_ca_secret, + "defaultMode": 0o440 + } + }, + { + "name": "openshell-proxy-pod-tls", + "emptyDir": {} + } + ] + }); + if !params.default_runtime_class_name.is_empty() { + spec["runtimeClassName"] = serde_json::json!(params.default_runtime_class_name); + } + if let Some(spec_obj) = spec.as_object_mut() { + apply_host_gateway_aliases(spec_obj, params.host_gateway_ip); } - - let target = container - .entry("resources".to_string()) - .or_insert_with(|| serde_json::json!({})); - apply_resource_quantity_map(target, "requests", &resources.requests); - apply_resource_quantity_map(target, "limits", &resources.limits); -} - -fn merge_string_map(target: &mut serde_json::Value, values: &BTreeMap) { - if !target.is_object() { - *target = serde_json::json!({}); + let image_pull_secrets = image_pull_secret_refs(params.image_pull_secrets); + if !image_pull_secrets.is_empty() { + spec["imagePullSecrets"] = serde_json::Value::Array(image_pull_secrets); } - let target = target - .as_object_mut() - .expect("target was converted to object"); - for (key, value) in values { - target - .entry(key.clone()) - .or_insert_with(|| serde_json::json!(value)); + if !params.client_tls_secret_name.is_empty() { + spec["volumes"] + .as_array_mut() + .expect("volumes is an array") + .push(serde_json::json!({ + "name": "openshell-client-tls", + "secret": { + "secretName": params.client_tls_secret_name, + "defaultMode": 0o440 + } + })); + } + if params.provider_spiffe_enabled { + spec["volumes"] + .as_array_mut() + .expect("volumes is an array") + .push(serde_json::json!({ + "name": SPIFFE_WORKLOAD_API_VOLUME_NAME, + "csi": { + "driver": "csi.spiffe.io", + "readOnly": true + } + })); } + + k8s_object(serde_json::json!({ + "apiVersion": "apps/v1", + "kind": "Deployment", + "metadata": proxy_pod_object_meta( + 
&names.supervisor_deployment, + params.namespace, + params.sandbox_id, + SANDBOX_ROLE_SUPERVISOR, + owner_ref + ), + "spec": { + "replicas": 1, + "selector": { + "matchLabels": proxy_pod_match_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR) + }, + "template": { + "metadata": { + "labels": proxy_pod_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR), + "annotations": { + "openshell.io/sandbox-id": params.sandbox_id + } + }, + "spec": spec + } + } + })) } -fn apply_resource_quantity_map( - target: &mut serde_json::Value, - section: &str, - values: &BTreeMap, -) { - if values.is_empty() { - return; - } - if !target.is_object() { - *target = serde_json::json!({}); - } - let target = target - .as_object_mut() - .expect("target was converted to object"); - let section_value = target - .entry(section.to_string()) - .or_insert_with(|| serde_json::json!({})); - merge_string_map(section_value, values); +fn proxy_pod_agent_egress_network_policy( + names: &ProxyPodResourceNames, + params: &SandboxPodParams<'_>, + owner_ref: serde_json::Value, +) -> NetworkPolicy { + k8s_object(serde_json::json!({ + "apiVersion": "networking.k8s.io/v1", + "kind": "NetworkPolicy", + "metadata": { + "name": names.agent_egress_network_policy, + "namespace": params.namespace, + "labels": proxy_pod_labels(params.sandbox_id, SANDBOX_ROLE_AGENT), + "ownerReferences": [owner_ref], + }, + "spec": { + "podSelector": { + "matchLabels": proxy_pod_match_labels(params.sandbox_id, SANDBOX_ROLE_AGENT) + }, + "policyTypes": ["Egress"], + "egress": [ + { + "to": [{ + "podSelector": { + "matchLabels": proxy_pod_match_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR) + } + }], + "ports": [ + {"protocol": "TCP", "port": PROXY_POD_PROXY_PORT}, + {"protocol": "TCP", "port": PROXY_POD_GATEWAY_FORWARD_PORT} + ] + }, + { + "to": [{ + "namespaceSelector": {"matchLabels": {"kubernetes.io/metadata.name": "kube-system"}}, + "podSelector": {"matchLabels": {"k8s-app": "kube-dns"}} + }], + "ports": [ + {"protocol": "UDP", 
"port": 53}, + {"protocol": "TCP", "port": 53} + ] + }, + { + "to": [{ + "namespaceSelector": {"matchLabels": {"kubernetes.io/metadata.name": "kube-system"}}, + "podSelector": {"matchLabels": {"k8s-app": "coredns"}} + }], + "ports": [ + {"protocol": "UDP", "port": 53}, + {"protocol": "TCP", "port": 53} + ] + } + ] + } + })) } -fn image_pull_secret_refs(secrets: &[String]) -> Vec { - secrets - .iter() - .map(|secret| secret.trim()) - .filter(|secret| !secret.is_empty()) - .map(|secret| serde_json::json!({ "name": secret })) - .collect() +fn proxy_pod_supervisor_ingress_network_policy( + names: &ProxyPodResourceNames, + params: &SandboxPodParams<'_>, + owner_ref: serde_json::Value, +) -> NetworkPolicy { + k8s_object(serde_json::json!({ + "apiVersion": "networking.k8s.io/v1", + "kind": "NetworkPolicy", + "metadata": { + "name": names.supervisor_ingress_network_policy, + "namespace": params.namespace, + "labels": proxy_pod_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR), + "ownerReferences": [owner_ref], + }, + "spec": { + "podSelector": { + "matchLabels": proxy_pod_match_labels(params.sandbox_id, SANDBOX_ROLE_SUPERVISOR) + }, + "policyTypes": ["Ingress"], + "ingress": [{ + "from": [{ + "podSelector": { + "matchLabels": proxy_pod_match_labels(params.sandbox_id, SANDBOX_ROLE_AGENT) + } + }], + "ports": [ + {"protocol": "TCP", "port": PROXY_POD_PROXY_PORT}, + {"protocol": "TCP", "port": PROXY_POD_GATEWAY_FORWARD_PORT} + ] + }] + } + })) } fn app_armor_profile_to_k8s(profile: &AppArmorProfile) -> serde_json::Value { @@ -3131,6 +4236,7 @@ mod tests { grpc_endpoint: "https://openshell-gateway.openshell.svc:8080", client_tls_secret_name: "openshell-client-tls", proxy_uid: 2200, + namespace: "default", sandbox_uid: 1500, sandbox_gid: 1500, ..SandboxPodParams::default() @@ -3418,15 +4524,272 @@ mod tests { let params = SandboxPodParams { supervisor_topology: SupervisorTopology::Sidecar, proxy_uid: 1500, + namespace: "default", sandbox_uid: 1500, 
..SandboxPodParams::default() }; - let err = validate_sidecar_proxy_identity(¶ms).unwrap_err(); + let err = validate_proxy_identity(¶ms).unwrap_err(); assert!(matches!(err, KubernetesDriverError::Precondition(_))); assert!(err.to_string().contains("proxy_uid")); } + #[test] + fn proxy_pod_topology_renders_process_agent_with_proxy_service() { + let params = SandboxPodParams { + supervisor_topology: SupervisorTopology::ProxyPod, + supervisor_sideload_method: SupervisorSideloadMethod::InitContainer, + supervisor_image: "supervisor-image:latest", + namespace: "agents", + sandbox_id: "sandbox-123", + sandbox_name: "example-sandbox", + grpc_endpoint: "https://openshell-gateway.openshell.svc:8080", + proxy_uid: 2200, + sandbox_uid: 1500, + sandbox_gid: 1500, + host_gateway_ip: "172.17.0.1", + ..SandboxPodParams::default() + }; + let pod_template = sandbox_template_to_k8s( + &SandboxTemplate { + image: "agent-image:latest".to_string(), + ..SandboxTemplate::default() + }, + false, + &std::collections::HashMap::new(), + false, + ¶ms, + ); + + let names = proxy_pod_resource_names("example-sandbox"); + let service_dns = proxy_pod_service_dns(&names.service, "agents"); + let agent = &pod_template["spec"]["containers"][0]; + + assert_eq!( + pod_template["metadata"]["labels"][LABEL_SANDBOX_ROLE], + SANDBOX_ROLE_AGENT + ); + assert_eq!( + agent["command"], + serde_json::json!([ + format!("{SUPERVISOR_MOUNT_PATH}/openshell-sandbox"), + "--mode=process" + ]) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::ENDPOINT), + Some(format!("https://{service_dns}:18080").as_str()) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::GATEWAY_TLS_SERVER_NAME), + Some("openshell-gateway.openshell.svc") + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::PROXY_URL), + Some(format!("http://{service_dns}:3128").as_str()) + ); + assert_eq!( + rendered_env(agent, openshell_core::sandbox_env::SUPERVISOR_READY_ADDR), + 
/// With `ProcessEnforcementMode::Full` in proxy-pod topology, the agent
/// container must keep the combined (root + capabilities) security context
/// and advertise `full` process enforcement through its environment.
#[test]
fn proxy_pod_topology_full_process_enforcement_keeps_combined_agent_permissions() {
    let params = SandboxPodParams {
        supervisor_topology: SupervisorTopology::ProxyPod,
        process_enforcement: ProcessEnforcementMode::Full,
        supervisor_sideload_method: SupervisorSideloadMethod::InitContainer,
        supervisor_image: "supervisor-image:latest",
        namespace: "agents",
        sandbox_name: "example-sandbox",
        grpc_endpoint: "https://openshell-gateway.openshell.svc:8080",
        sandbox_uid: 1500,
        sandbox_gid: 1500,
        proxy_uid: 2200,
        ..SandboxPodParams::default()
    };
    let template = SandboxTemplate::default();
    let no_env = std::collections::HashMap::new();

    let rendered = sandbox_template_to_k8s(&template, false, &no_env, false, &params);

    let agent_container = &rendered["spec"]["containers"][0];
    let security_context = &agent_container["securityContext"];

    // The agent runs as root and must not carry any of the restricted-mode fields.
    assert_eq!(security_context["runAsUser"], 0);
    for restricted_field in ["runAsGroup", "runAsNonRoot", "allowPrivilegeEscalation"] {
        assert!(security_context.get(restricted_field).is_none());
    }

    let expected_capabilities = serde_json::json!({
        "add": ["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]
    });
    assert_eq!(security_context["capabilities"], expected_capabilities);

    let enforcement_mode = rendered_env(
        agent_container,
        openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE,
    );
    assert_eq!(enforcement_mode, Some("full"));
}
/// A proxy-pod configuration whose proxy UID equals the sandbox UID must be
/// rejected as a precondition failure that names the topology.
#[test]
fn proxy_pod_topology_rejects_proxy_uid_matching_sandbox_uid() {
    // Both identities deliberately collide on UID 1500.
    let params = SandboxPodParams {
        supervisor_topology: SupervisorTopology::ProxyPod,
        proxy_uid: 1500,
        namespace: "default",
        sandbox_uid: 1500,
        ..SandboxPodParams::default()
    };

    let rejection = validate_proxy_identity(&params).unwrap_err();

    let is_precondition = matches!(rejection, KubernetesDriverError::Precondition(_));
    assert!(is_precondition);

    let rendered_message = rejection.to_string();
    assert!(rendered_message.contains("proxy-pod"));
}
} else { None @@ -354,9 +363,9 @@ pub async fn run_sandbox( None }; - let _gateway_forward = if network_enabled && sidecar_network_enforcement { + let _gateway_forward = if network_enabled && external_network_enforcement { let endpoint = openshell_endpoint_for_proxy.as_deref().ok_or_else(|| { - miette::miette!("sidecar network enforcement requires an OpenShell gateway endpoint") + miette::miette!("external network enforcement requires an OpenShell gateway endpoint") })?; Some(start_gateway_forward_from_env(endpoint).await?) } else { @@ -364,10 +373,10 @@ pub async fn run_sandbox( }; #[cfg(target_os = "linux")] - if network_enabled && sidecar_network_enforcement { + if network_enabled && external_network_enforcement { if !matches!(policy.network.mode, NetworkMode::Proxy) { return Err(miette::miette!( - "sidecar network enforcement requires proxy network mode" + "external network enforcement requires proxy network mode" )); } if let Some(path) = sidecar_ready_file.as_deref() { @@ -376,9 +385,9 @@ pub async fn run_sandbox( } #[cfg(not(target_os = "linux"))] - if network_enabled && sidecar_network_enforcement { + if network_enabled && external_network_enforcement { return Err(miette::miette!( - "sidecar network enforcement is only supported on Linux" + "external network enforcement is only supported on Linux" )); } @@ -539,7 +548,7 @@ pub async fn run_sandbox( .as_ref() .and_then(|n| n.ca_file_paths.clone()) .or_else(|| { - sidecar_network_enforcement + external_network_enforcement .then(sidecar_ca_file_paths) .flatten() }); @@ -620,6 +629,15 @@ fn sidecar_network_enforcement_enabled() -> bool { .is_ok_and(|value| value == SIDECAR_NETWORK_ENFORCEMENT_MODE) } +fn external_network_enforcement_enabled() -> bool { + std::env::var(openshell_core::sandbox_env::NETWORK_ENFORCEMENT_MODE).is_ok_and(|value| { + matches!( + value.as_str(), + SIDECAR_NETWORK_ENFORCEMENT_MODE | PROXY_POD_NETWORK_ENFORCEMENT_MODE + ) + }) +} + fn process_enforcement_mode() -> ProcessEnforcementMode 
{ match std::env::var(openshell_core::sandbox_env::PROCESS_ENFORCEMENT_MODE) .unwrap_or_else(|_| "full".to_string()) @@ -636,6 +654,12 @@ fn supervisor_ready_file() -> Option { .filter(|value| !value.is_empty()) } +fn supervisor_ready_addr() -> Option { + std::env::var(openshell_core::sandbox_env::SUPERVISOR_READY_ADDR) + .ok() + .filter(|value| !value.is_empty()) +} + fn entrypoint_pid_file() -> Option { std::env::var(openshell_core::sandbox_env::ENTRYPOINT_PID_FILE) .ok() @@ -684,6 +708,26 @@ async fn wait_for_supervisor_ready(path: &str) -> Result<()> { } } +async fn wait_for_supervisor_ready_addr(addr: &str) -> Result<()> { + let deadline = tokio::time::Instant::now() + Duration::from_secs(SIDECAR_READY_TIMEOUT_SECS); + loop { + match TcpStream::connect(addr).await { + Ok(_) => { + info!(addr, "Network supervisor TCP endpoint is ready"); + return Ok(()); + } + Err(err) if tokio::time::Instant::now() >= deadline => { + return Err(miette::miette!( + "timed out waiting for network supervisor TCP endpoint {addr}: {err}" + )); + } + Err(_) => { + tokio::time::sleep(Duration::from_millis(250)).await; + } + } + } +} + #[cfg(target_os = "linux")] fn write_supervisor_ready(path: &str) -> Result<()> { let ready_path = std::path::Path::new(path); diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 9a165c643..3303e5c63 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -172,8 +172,9 @@ struct Args { #[arg(long, default_value = DEFAULT_MODE)] mode: Mode, - /// UID that the long-running Kubernetes network sidecar will run as. - /// `--mode=network-init` installs nftables rules that exempt this UID. + /// UID that the long-running Kubernetes network proxy will run as. + /// In sidecar topology, `--mode=network-init` installs nftables rules + /// that exempt this UID. 
#[arg(long, env = "OPENSHELL_PROXY_UID", default_value_t = 1337)] proxy_uid: u32, @@ -313,18 +314,18 @@ fn copy_sidecar_client_tls_if_present( #[cfg(target_os = "linux")] fn run_network_init( - proxy_uid: u32, - proxy_gid: u32, + proxy_user_id: u32, + proxy_group_id: u32, sidecar_state_dir: &str, sidecar_tls_dir: &str, ) -> Result<()> { - if proxy_uid < openshell_policy::MIN_SANDBOX_UID { + if proxy_user_id < openshell_policy::MIN_SANDBOX_UID { return Err(miette::miette!( "--proxy-uid must be at least {}", openshell_policy::MIN_SANDBOX_UID )); } - if proxy_gid < openshell_policy::MIN_SANDBOX_UID { + if proxy_group_id < openshell_policy::MIN_SANDBOX_UID { return Err(miette::miette!( "--proxy-gid must be at least {}", openshell_policy::MIN_SANDBOX_UID @@ -333,15 +334,15 @@ fn run_network_init( let sidecar_state_dir = Path::new(sidecar_state_dir); let sidecar_tls_dir = Path::new(sidecar_tls_dir); - prepare_sidecar_directory(sidecar_state_dir, proxy_uid, proxy_gid, 0o775)?; - prepare_sidecar_directory(sidecar_tls_dir, proxy_uid, proxy_gid, 0o755)?; + prepare_sidecar_directory(sidecar_state_dir, proxy_user_id, proxy_group_id, 0o775)?; + prepare_sidecar_directory(sidecar_tls_dir, proxy_user_id, proxy_group_id, 0o755)?; copy_sidecar_client_tls_if_present( Path::new(CLIENT_TLS_DIR), sidecar_tls_dir, - proxy_uid, - proxy_gid, + proxy_user_id, + proxy_group_id, )?; - openshell_supervisor_process::netns::install_sidecar_bypass_rules(proxy_uid) + openshell_supervisor_process::netns::install_sidecar_bypass_rules(proxy_user_id) } #[cfg(not(target_os = "linux"))] @@ -385,10 +386,10 @@ fn main() -> Result<()> { let args = Args::parse(); if args.mode.network_init { - let proxy_gid = args.proxy_gid.unwrap_or(args.proxy_uid); + let proxy_group_id = args.proxy_gid.unwrap_or(args.proxy_uid); return run_network_init( args.proxy_uid, - proxy_gid, + proxy_group_id, &args.sidecar_state_dir, &args.sidecar_tls_dir, ); diff --git a/crates/openshell-server/src/auth/k8s_sa.rs 
b/crates/openshell-server/src/auth/k8s_sa.rs index eed0e5f08..f27b90067 100644 --- a/crates/openshell-server/src/auth/k8s_sa.rs +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -5,8 +5,9 @@ //! //! Path-scoped to `IssueSandboxToken`. Validates a projected SA token //! presented by a sandbox pod, reads the pod's `openshell.io/sandbox-id` -//! annotation, verifies the pod is controlled by the corresponding Sandbox CR, -//! and returns a [`Principal::Sandbox`] with +//! annotation, verifies the pod is controlled by the corresponding Sandbox CR +//! either directly or through a supervisor Deployment controller chain, and +//! returns a [`Principal::Sandbox`] with //! [`SandboxIdentitySource::K8sServiceAccount`]. The `IssueSandboxToken` handler //! then mints a gateway-signed JWT for that sandbox id; subsequent gRPC calls //! from the supervisor use the gateway-minted JWT validated by @@ -19,10 +20,11 @@ use super::authenticator::Authenticator; use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; use async_trait::async_trait; use k8s_openapi::api::{ + apps::v1::{Deployment, ReplicaSet}, authentication::v1::{TokenReview, TokenReviewSpec, TokenReviewStatus, UserInfo}, core::v1::Pod, }; -use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; use kube::Error as KubeError; use kube::api::{Api, ApiResource, PostParams}; use kube::core::{DynamicObject, gvk::GroupVersionKind}; @@ -45,7 +47,10 @@ const SANDBOX_API_VERSION_V1BETA1: &str = "v1beta1"; const SANDBOX_API_VERSION_V1ALPHA1: &str = "v1alpha1"; const SANDBOX_API_VERSION_FULL_V1BETA1: &str = "agents.x-k8s.io/v1beta1"; const SANDBOX_API_VERSION_FULL_V1ALPHA1: &str = "agents.x-k8s.io/v1alpha1"; +const APPS_API_VERSION_FULL_V1: &str = "apps/v1"; const SANDBOX_KIND: &str = "Sandbox"; +const REPLICA_SET_KIND: &str = "ReplicaSet"; +const DEPLOYMENT_KIND: &str = "Deployment"; const SANDBOX_ID_LABEL: &str = 
"openshell.ai/sandbox-id"; const POD_NAME_EXTRA: &str = "authentication.kubernetes.io/pod-name"; const POD_UID_EXTRA: &str = "authentication.kubernetes.io/pod-uid"; @@ -148,11 +153,21 @@ struct SandboxOwnerReference { uid: String, } +#[derive(Debug, Clone, PartialEq, Eq)] +struct ControllerOwnerReference { + api_version: String, + kind: String, + name: String, + uid: String, +} + /// Resolver backed by the apiserver's `TokenReview` API and `kube::Client` /// for the per-pod annotation lookup. pub struct LiveK8sResolver { token_reviews_api: Api, pods_api: Api, + replica_sets_api: Api, + deployments_api: Api, sandboxes_api_v1beta1: Api, sandboxes_api_v1alpha1: Api, expected_audience: String, @@ -169,6 +184,8 @@ impl LiveK8sResolver { ) -> Self { let token_reviews_api: Api = Api::all(client.clone()); let pods_api: Api = Api::namespaced(client.clone(), namespace); + let replica_sets_api: Api = Api::namespaced(client.clone(), namespace); + let deployments_api: Api = Api::namespaced(client.clone(), namespace); let sandbox_gvk_v1beta1 = GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION_V1BETA1, SANDBOX_KIND); let sandbox_resource_v1beta1 = ApiResource::from_gvk(&sandbox_gvk_v1beta1); @@ -185,6 +202,8 @@ impl LiveK8sResolver { Self { token_reviews_api, pods_api, + replica_sets_api, + deployments_api, sandboxes_api_v1beta1, sandboxes_api_v1alpha1, expected_audience, @@ -214,6 +233,129 @@ impl LiveK8sResolver { Ok(None) } + + async fn sandbox_owner_for_pod( + &self, + pod: &Pod, + pod_name: &str, + ) -> Result { + match direct_sandbox_owner_reference(pod) { + Ok(owner) => Ok(owner), + Err(err) => { + let Some(controller) = controller_owner_reference( + pod.metadata.owner_references.as_deref().unwrap_or_default(), + ) else { + return Err(err); + }; + if controller.api_version != APPS_API_VERSION_FULL_V1 + || controller.kind != REPLICA_SET_KIND + { + return Err(err); + } + self.sandbox_owner_for_replica_set_controller(&controller, pod_name) + .await + } + } + } + + 
async fn sandbox_owner_for_replica_set_controller( + &self, + replica_set_owner: &ControllerOwnerReference, + pod_name: &str, + ) -> Result { + let replica_set = self + .replica_sets_api + .get_opt(&replica_set_owner.name) + .await + .map_err(|e| { + warn!( + pod = %pod_name, + replica_set = %replica_set_owner.name, + error = %e, + "failed to fetch ReplicaSet for pod identity validation" + ); + Status::internal(format!("replicaset GET failed: {e}")) + })? + .ok_or_else(|| { + warn!( + pod = %pod_name, + replica_set = %replica_set_owner.name, + "pod controller ReplicaSet was not found" + ); + Status::permission_denied("pod controller ReplicaSet not found") + })?; + validate_object_uid( + replica_set.metadata.uid.as_deref().unwrap_or_default(), + &replica_set_owner.uid, + "pod controller ReplicaSet UID mismatch", + )?; + + let deployment_owner = controller_owner_reference( + replica_set + .metadata + .owner_references + .as_deref() + .unwrap_or_default(), + ) + .ok_or_else(|| { + warn!( + pod = %pod_name, + replica_set = %replica_set_owner.name, + "ReplicaSet has no controlling Deployment ownerReference" + ); + Status::permission_denied("ReplicaSet is not controlled by a Deployment") + })?; + if deployment_owner.api_version != APPS_API_VERSION_FULL_V1 + || deployment_owner.kind != DEPLOYMENT_KIND + { + warn!( + pod = %pod_name, + replica_set = %replica_set_owner.name, + owner_api_version = %deployment_owner.api_version, + owner_kind = %deployment_owner.kind, + "ReplicaSet controller is not an apps/v1 Deployment" + ); + return Err(Status::permission_denied( + "ReplicaSet is not controlled by a Deployment", + )); + } + + let deployment = self + .deployments_api + .get_opt(&deployment_owner.name) + .await + .map_err(|e| { + warn!( + pod = %pod_name, + deployment = %deployment_owner.name, + error = %e, + "failed to fetch Deployment for pod identity validation" + ); + Status::internal(format!("deployment GET failed: {e}")) + })? 
+ .ok_or_else(|| { + warn!( + pod = %pod_name, + deployment = %deployment_owner.name, + "ReplicaSet controller Deployment was not found" + ); + Status::permission_denied("ReplicaSet controller Deployment not found") + })?; + validate_object_uid( + deployment.metadata.uid.as_deref().unwrap_or_default(), + &deployment_owner.uid, + "ReplicaSet controller Deployment UID mismatch", + )?; + + sandbox_owner_reference_from_owner_refs( + deployment + .metadata + .owner_references + .as_deref() + .unwrap_or_default(), + "Deployment", + ) + } } #[async_trait] @@ -293,7 +435,7 @@ impl K8sIdentityResolver for LiveK8sResolver { let sandbox_id = pod_sandbox_id(&pod)?; - let owner = sandbox_owner_reference(&pod)?; + let owner = self.sandbox_owner_for_pod(&pod, &identity.pod_name).await?; let sandbox_cr = self.get_sandbox_cr_for_owner(&owner).await.map_err(|e| { warn!( pod = %identity.pod_name, @@ -406,8 +548,18 @@ fn pod_sandbox_id(pod: &Pod) -> Result { } #[allow(clippy::result_large_err)] -fn sandbox_owner_reference(pod: &Pod) -> Result { - let owner_refs = pod.metadata.owner_references.as_deref().unwrap_or_default(); +fn direct_sandbox_owner_reference(pod: &Pod) -> Result { + sandbox_owner_reference_from_owner_refs( + pod.metadata.owner_references.as_deref().unwrap_or_default(), + "pod", + ) +} + +#[allow(clippy::result_large_err)] +fn sandbox_owner_reference_from_owner_refs( + owner_refs: &[OwnerReference], + object_kind: &str, +) -> Result { let mut sandbox_refs = owner_refs .iter() .filter(|owner| is_supported_sandbox_owner_reference(owner)); @@ -424,27 +576,28 @@ fn sandbox_owner_reference(pod: &Pod) -> Result { SANDBOX_API_VERSION_FULL_V1BETA1, SANDBOX_API_VERSION_FULL_V1ALPHA1, ], - "pod Sandbox ownerReference uses unsupported apiVersion" + object_kind = %object_kind, + "Sandbox ownerReference uses unsupported apiVersion" ); } - return Err(Status::permission_denied( - "pod is not controlled by an OpenShell Sandbox", - )); + return Err(Status::permission_denied(format!( + 
"{object_kind} is not controlled by an OpenShell Sandbox" + ))); }; if sandbox_refs.next().is_some() { - return Err(Status::permission_denied( - "pod has multiple OpenShell Sandbox owners", - )); + return Err(Status::permission_denied(format!( + "{object_kind} has multiple OpenShell Sandbox owners" + ))); } if owner.controller != Some(true) { - return Err(Status::permission_denied( - "pod Sandbox ownerReference is not controlling", - )); + return Err(Status::permission_denied(format!( + "{object_kind} Sandbox ownerReference is not controlling" + ))); } if owner.name.is_empty() || owner.uid.is_empty() { - return Err(Status::permission_denied( - "pod Sandbox ownerReference is incomplete", - )); + return Err(Status::permission_denied(format!( + "{object_kind} Sandbox ownerReference is incomplete" + ))); } Ok(SandboxOwnerReference { api_version: owner.api_version.clone(), @@ -453,9 +606,32 @@ fn sandbox_owner_reference(pod: &Pod) -> Result { }) } -fn is_supported_sandbox_owner_reference( - owner: &k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference, -) -> bool { +fn controller_owner_reference(owner_refs: &[OwnerReference]) -> Option { + let owner = owner_refs + .iter() + .find(|owner| owner.controller == Some(true))?; + Some(ControllerOwnerReference { + api_version: owner.api_version.clone(), + kind: owner.kind.clone(), + name: owner.name.clone(), + uid: owner.uid.clone(), + }) +} + +#[allow(clippy::result_large_err)] +fn validate_object_uid(actual_uid: &str, expected_uid: &str, message: &str) -> Result<(), Status> { + if actual_uid != expected_uid { + warn!( + expected_uid = %expected_uid, + actual_uid = %actual_uid, + %message + ); + return Err(Status::permission_denied(message.to_string())); + } + Ok(()) +} + +fn is_supported_sandbox_owner_reference(owner: &OwnerReference) -> bool { owner.kind == SANDBOX_KIND && matches!( owner.api_version.as_str(), @@ -629,6 +805,17 @@ mod tests { } } + fn app_controller_owner(kind: &str, name: &str, uid: &str) -> 
OwnerReference { + OwnerReference { + api_version: APPS_API_VERSION_FULL_V1.to_string(), + block_owner_deletion: None, + controller: Some(true), + kind: kind.to_string(), + name: name.to_string(), + uid: uid.to_string(), + } + } + fn pod_with_owner_refs(owner_references: Vec) -> Pod { Pod { metadata: ObjectMeta { @@ -780,7 +967,7 @@ mod tests { fn sandbox_owner_reference_extracts_controlling_sandbox_owner() { let pod = pod_with_owner_refs(vec![sandbox_owner("sandbox-a", "cr-uid-a")]); - let owner = sandbox_owner_reference(&pod).expect("expected Sandbox owner"); + let owner = direct_sandbox_owner_reference(&pod).expect("expected Sandbox owner"); assert_eq!( owner, @@ -800,7 +987,7 @@ mod tests { "cr-uid-a", )]); - let owner = sandbox_owner_reference(&pod).expect("expected v1alpha1 Sandbox owner"); + let owner = direct_sandbox_owner_reference(&pod).expect("expected v1alpha1 Sandbox owner"); assert_eq!( owner, @@ -816,7 +1003,7 @@ mod tests { fn sandbox_owner_reference_rejects_missing_owner() { let pod = pod_with_owner_refs(vec![]); - let err = sandbox_owner_reference(&pod).expect_err("missing owner must fail"); + let err = direct_sandbox_owner_reference(&pod).expect_err("missing owner must fail"); assert_eq!(err.code(), tonic::Code::PermissionDenied); } @@ -829,8 +1016,8 @@ mod tests { "cr-uid-a", )]); - let err = - sandbox_owner_reference(&pod).expect_err("unsupported apiVersion must fail closed"); + let err = direct_sandbox_owner_reference(&pod) + .expect_err("unsupported apiVersion must fail closed"); assert_eq!(err.code(), tonic::Code::PermissionDenied); } @@ -841,7 +1028,7 @@ mod tests { owner.controller = Some(false); let pod = pod_with_owner_refs(vec![owner]); - let err = sandbox_owner_reference(&pod).expect_err("non-controller owner must fail"); + let err = direct_sandbox_owner_reference(&pod).expect_err("non-controller owner must fail"); assert_eq!(err.code(), tonic::Code::PermissionDenied); } @@ -853,11 +1040,50 @@ mod tests { sandbox_owner("sandbox-b", 
"cr-uid-b"), ]); - let err = sandbox_owner_reference(&pod).expect_err("multiple owners must fail"); + let err = direct_sandbox_owner_reference(&pod).expect_err("multiple owners must fail"); assert_eq!(err.code(), tonic::Code::PermissionDenied); } + #[test] + fn controller_owner_reference_extracts_controlling_apps_owner() { + let pod = pod_with_owner_refs(vec![app_controller_owner( + REPLICA_SET_KIND, + "supervisor-rs", + "rs-uid", + )]); + + let owner = controller_owner_reference(pod.metadata.owner_references.as_deref().unwrap()) + .expect("expected controller owner"); + + assert_eq!( + owner, + ControllerOwnerReference { + api_version: APPS_API_VERSION_FULL_V1.to_string(), + kind: REPLICA_SET_KIND.to_string(), + name: "supervisor-rs".to_string(), + uid: "rs-uid".to_string(), + } + ); + } + + #[test] + fn sandbox_owner_reference_from_deployment_requires_controlling_sandbox_owner() { + let deployment_owner_refs = vec![sandbox_owner("sandbox-a", "cr-uid-a")]; + + let owner = sandbox_owner_reference_from_owner_refs(&deployment_owner_refs, "Deployment") + .expect("expected Deployment Sandbox owner"); + + assert_eq!( + owner, + SandboxOwnerReference { + api_version: SANDBOX_API_VERSION_FULL_V1BETA1.to_string(), + name: "sandbox-a".to_string(), + uid: "cr-uid-a".to_string(), + } + ); + } + #[test] fn validate_sandbox_owner_reference_requires_matching_cr_uid_and_label() { let owner = SandboxOwnerReference { diff --git a/crates/openshell-supervisor-network/src/l7/tls.rs b/crates/openshell-supervisor-network/src/l7/tls.rs index 70e198f42..c211200a8 100644 --- a/crates/openshell-supervisor-network/src/l7/tls.rs +++ b/crates/openshell-supervisor-network/src/l7/tls.rs @@ -63,6 +63,28 @@ impl SandboxCa { }) } + /// Load an existing CA certificate and private key from PEM. 
+ pub fn from_pem(ca_cert_pem: &str, ca_key_pem: &str) -> Result { + let ca_key = KeyPair::from_pem(ca_key_pem).into_diagnostic()?; + let ca_cert = CertificateParams::from_ca_cert_pem(ca_cert_pem) + .into_diagnostic()? + .self_signed(&ca_key) + .into_diagnostic()?; + + Ok(Self { + ca_cert, + ca_key, + ca_cert_pem: ca_cert_pem.to_string(), + }) + } + + /// Load an existing CA certificate and private key from files. + pub fn from_files(cert_path: &Path, key_path: &Path) -> Result { + let ca_cert_pem = std::fs::read_to_string(cert_path).into_diagnostic()?; + let ca_key_pem = std::fs::read_to_string(key_path).into_diagnostic()?; + Self::from_pem(&ca_cert_pem, &ca_key_pem) + } + /// Returns the CA certificate in PEM format. pub fn cert_pem(&self) -> &str { &self.ca_cert_pem @@ -519,4 +541,18 @@ mod tests { "bundle should contain at least one cert", ); } + + #[test] + fn sandbox_ca_loads_from_pem() { + let ca = SandboxCa::generate().unwrap(); + let key_pem = ca.ca_key.serialize_pem(); + let loaded = SandboxCa::from_pem(ca.cert_pem(), &key_pem).unwrap(); + + assert_eq!(loaded.cert_pem(), ca.cert_pem()); + assert!( + CertCache::new(loaded) + .get_or_generate("example.com") + .is_ok() + ); + } } diff --git a/crates/openshell-supervisor-network/src/proxy.rs b/crates/openshell-supervisor-network/src/proxy.rs index c38ecbd3a..b0b112142 100644 --- a/crates/openshell-supervisor-network/src/proxy.rs +++ b/crates/openshell-supervisor-network/src/proxy.rs @@ -1470,25 +1470,25 @@ fn resolve_owner_identity( } #[cfg(target_os = "linux")] -fn collect_ancestor_identities(pid: u32, stop_pid: u32) -> Vec<(u32, PathBuf)> { +fn collect_ancestor_identities(start_pid: u32, stop_pid: u32) -> Vec<(u32, PathBuf)> { const MAX_DEPTH: usize = 64; let mut ancestors = Vec::new(); - let mut current = pid; + let mut current = start_pid; for _ in 0..MAX_DEPTH { - let ppid = match crate::procfs::read_ppid(current) { - Some(p) if p > 0 && p != current => p, + let parent_pid = match 
crate::procfs::read_ppid(current) { - Some(p) if p > 0 && p != current => p, + Some(parent) if parent > 0 && parent != current => parent, _ => break, }; - if let Ok(path) = crate::procfs::binary_path(ppid.cast_signed()) { - ancestors.push((ppid, path)); + if let Ok(path) = crate::procfs::binary_path(parent_pid.cast_signed()) { + ancestors.push((parent_pid, path)); } - if ppid == stop_pid || ppid == 1 { + if parent_pid == stop_pid || parent_pid == 1 { break; } - current = ppid; + current = parent_pid; } ancestors @@ -7985,8 +7985,10 @@ network_policies: /// was now bound to a *different* binary on disk than the one that was /// TOFU-cached. After the strip, `binary_path()` returns a path that /// stats fine, the cache rehashes the new bytes, and the hash mismatch - /// surfaces as a `Binary integrity violation` error — the contract this - /// PR is trying to establish. + /// either surfaces as a `Binary integrity violation` error or continues + /// verifying the original process image through `/proc/<pid>/exe`. The + /// stable contract is that the old `"Failed to stat ... (deleted)"` + /// failure mode does not leak out of identity resolution. /// /// Test shape (from the review comment on the initial PR): /// 1. Start a `TcpListener` in the test process. @@ -8002,14 +8004,14 @@ network_policies: /// executes from its in-memory image), but `/proc/<pid>/exe` will /// now readlink to `"<path> (deleted)"` OR the overwritten file, depending /// on whether the filesystem reused the inode. - /// 7. Call `resolve_process_identity` and assert: - /// - the error reason contains `"Binary integrity violation"` (the - /// cache detected the tampered on-disk bytes), and - /// - the error reason does NOT contain `"Failed to stat"` or - /// `"(deleted)"` (the old pre-strip failure mode). + /// 7. Call `resolve_process_identity` and assert the old pre-strip + /// failure mode does not return `"Failed to stat"` or a `"(deleted)"` + /// path.
Depending on kernel/filesystem behavior, the process image + /// may still verify against the deleted executable inode, or the + /// replacement bytes may surface as a binary-integrity violation. #[cfg(target_os = "linux")] #[test] - fn resolve_process_identity_surfaces_binary_integrity_violation_on_hot_swap() { + fn resolve_process_identity_handles_hot_swap_without_deleted_path_leak() { use crate::identity::BinaryIdentityCache; use std::io::Read; use std::net::TcpListener; @@ -8041,9 +8043,10 @@ network_policies: assert!(!v1_hash.is_empty()); // 4. Spawn the temp bash with a /dev/tcp one-liner that opens a real - // connection to the listener and sleeps to keep it open. The - // `read -t` blocks on stdin so the shell stays resident. - let script = format!("exec 3<>/dev/tcp/127.0.0.1/{listener_port}; sleep 30 <&3"); + // connection to the listener. The `read -t` built-in blocks on + // FD 3 so the shell stays resident without forking an external + // process that would inherit the socket. + let script = format!("exec 3<>/dev/tcp/127.0.0.1/{listener_port}; read -r -t 30 _ <&3"); let mut child = Command::new(&bash_v1) .arg("-c") .arg(&script) @@ -8086,9 +8089,8 @@ network_policies: let tampered_bytes = b"#!/bin/sh\n# tampered bash v2 from hotswap test\nexit 0\n"; std::fs::write(&bash_v1, tampered_bytes).expect("write replacement bytes"); - // 7. Resolve identity through the real helper and assert the - // contract: we want "Binary integrity violation", not - // "Failed to stat ... (deleted)". + // 7. Resolve identity through the real helper and assert the stable + // contract: never return "Failed to stat ... (deleted)". 
let test_pid = std::process::id(); let result = resolve_process_identity(test_pid, peer_port, &cache); @@ -8098,10 +8100,17 @@ network_policies: let _ = child.wait(); match result { - Ok(_) => panic!( - "resolve_process_identity unexpectedly succeeded after hot-swap; \ - the cache should have detected the tampered on-disk bytes" - ), + Ok(identity) => { + assert_eq!( + identity.bin_hash, v1_hash, + "successful hot-swap resolution should still verify the original process image" + ); + assert!( + !identity.bin_path.to_string_lossy().contains("(deleted)"), + "resolved binary path still tainted: {}", + identity.bin_path.display() + ); + } Err(err) => { assert!( err.reason.contains("Binary integrity violation"), diff --git a/crates/openshell-supervisor-network/src/run.rs b/crates/openshell-supervisor-network/src/run.rs index 8e17758bd..5a8123415 100644 --- a/crates/openshell-supervisor-network/src/run.rs +++ b/crates/openshell-supervisor-network/src/run.rs @@ -54,6 +54,38 @@ pub struct Networking { pub policy_local_ctx: Arc, } +fn sandbox_ca_for_proxy() -> Result { + let cert_path = std::env::var(openshell_core::sandbox_env::PROXY_CA_CERT_PATH).ok(); + let key_path = std::env::var(openshell_core::sandbox_env::PROXY_CA_KEY_PATH).ok(); + match (cert_path, key_path) { + (Some(cert_path), Some(key_path)) => SandboxCa::from_files( + std::path::Path::new(&cert_path), + std::path::Path::new(&key_path), + ), + (None, None) => SandboxCa::generate(), + _ => Err(miette::miette!( + "{} and {} must be set together", + openshell_core::sandbox_env::PROXY_CA_CERT_PATH, + openshell_core::sandbox_env::PROXY_CA_KEY_PATH + )), + } +} + +fn explicit_proxy_bind_addr() -> Result> { + let Some(value) = std::env::var(openshell_core::sandbox_env::PROXY_BIND_ADDR) + .ok() + .filter(|value| !value.trim().is_empty()) + else { + return Ok(None); + }; + value.parse::().map(Some).map_err(|err| { + miette::miette!( + "invalid {} value {value:?}: {err}", + openshell_core::sandbox_env::PROXY_BIND_ADDR + 
) + }) +} + /// Set up the networking stack: ephemeral CA + TLS state, proxy server, /// and the SSH-side proxy URL / netns FD. /// @@ -196,10 +228,10 @@ pub async fn run_networking( // the proxy, so it's owned here. let identity_cache = opa_engine.map(|_| Arc::new(BinaryIdentityCache::new())); - // Generate ephemeral CA and TLS state for HTTPS L7 inspection. - // The CA cert is written to disk so sandbox processes can trust it. + // Generate or load a CA and TLS state for HTTPS L7 inspection. The CA cert + // is written to disk so sandbox processes can trust it. let (tls_state, ca_file_paths) = if matches!(policy.network.mode, NetworkMode::Proxy) { - match SandboxCa::generate() { + match sandbox_ca_for_proxy() { Ok(ca) => { let tls_dir = std::env::var(openshell_core::sandbox_env::PROXY_TLS_DIR) .unwrap_or_else(|_| "/etc/openshell-tls".to_string()); @@ -219,7 +251,7 @@ pub async fn run_networking( .severity(SeverityId::Informational) .status(StatusId::Success) .state(StateId::Enabled, "enabled") - .message("TLS termination enabled: ephemeral CA generated") + .message("TLS termination enabled") .build() ); (Some(state), Some(paths)) @@ -246,7 +278,7 @@ pub async fn run_networking( .status(StatusId::Failure) .state(StateId::Disabled, "disabled") .message(format!( - "Failed to generate ephemeral CA, TLS termination disabled: {e}" + "Failed to initialize proxy CA, TLS termination disabled: {e}" )) .build() ); @@ -275,9 +307,11 @@ pub async fn run_networking( // originating inside the namespace can reach the proxy. Otherwise the // proxy falls back to the policy-declared http_addr (loopback in // tests, etc.). 
- let bind_addr = proxy_bind_ip.map(|ip| { - let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); - SocketAddr::new(ip, port) + let bind_addr = explicit_proxy_bind_addr()?.or_else(|| { + proxy_bind_ip.map(|ip| { + let port = proxy_policy.http_addr.map_or(3128, |addr| addr.port()); + SocketAddr::new(ip, port) + }) }); // Build inference context for local routing of intercepted inference calls. diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index 3c4be356f..dc6a396eb 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -42,6 +42,7 @@ seccompiler = "0.5" tempfile = "3" [dev-dependencies] +temp-env = "0.3" tempfile = "3" [lints] diff --git a/crates/openshell-supervisor-process/src/netns/mod.rs b/crates/openshell-supervisor-process/src/netns/mod.rs index 86a5406ad..d3d9063a0 100644 --- a/crates/openshell-supervisor-process/src/netns/mod.rs +++ b/crates/openshell-supervisor-process/src/netns/mod.rs @@ -470,7 +470,7 @@ pub fn create_netns_for_proxy( /// Install pod-network bypass enforcement for Kubernetes sidecar topology. /// /// This runs in the current network namespace, not in a per-workload netns. -/// The rules allow loopback and the sidecar proxy UID, then reject direct +/// The rules allow loopback and the proxy UID, then reject direct /// TCP/UDP egress from other UIDs so traffic must use the sidecar's local /// proxy. 
/// diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index bd8be04c8..127a657c2 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -55,6 +55,10 @@ const SUPERVISOR_ONLY_ENV_VARS: &[&str] = &[ openshell_core::sandbox_env::TLS_CERT, openshell_core::sandbox_env::TLS_KEY, openshell_core::sandbox_env::PROVIDER_SPIFFE_WORKLOAD_API_SOCKET, + openshell_core::sandbox_env::PROXY_URL, + openshell_core::sandbox_env::PROXY_BIND_ADDR, + openshell_core::sandbox_env::PROXY_CA_CERT_PATH, + openshell_core::sandbox_env::PROXY_CA_KEY_PATH, ]; pub fn is_supervisor_only_env_var(key: &str) -> bool { @@ -76,6 +80,35 @@ fn inject_provider_env(cmd: &mut Command, provider_env: &HashMap } } +fn configured_proxy_url( + policy: &SandboxPolicy, + netns_proxy_enabled: bool, +) -> Result> { + if !matches!(policy.network.mode, NetworkMode::Proxy) { + return Ok(None); + } + + if let Ok(proxy_url) = std::env::var(openshell_core::sandbox_env::PROXY_URL) { + let trimmed = proxy_url.trim(); + if !trimmed.is_empty() { + return Ok(Some(trimmed.to_string())); + } + } + + let proxy = policy.network.proxy.as_ref().ok_or_else(|| { + miette::miette!("Network mode is set to proxy but no proxy configuration was provided") + })?; + + if netns_proxy_enabled { + let port = proxy.http_addr.map_or(3128, |addr| addr.port()); + return Ok(Some(format!("http://10.200.0.1:{port}"))); + } + + Ok(proxy + .http_addr + .map(|http_addr| format!("http://{http_addr}"))) +} + #[cfg(unix)] pub fn harden_child_process() -> Result<()> { use rustix::process::{Resource, Rlimit, setrlimit}; @@ -545,27 +578,11 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { - let proxy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!( - "Network mode is set to proxy but no proxy configuration was provided" - ) - })?; - // When using network 
namespace, set proxy URL to the veth host IP - if netns_fd.is_some() { - // The proxy is on 10.200.0.1:3128 (or configured port) - let port = proxy.http_addr.map_or(3128, |addr| addr.port()); - let proxy_url = format!("http://10.200.0.1:{port}"); - // Both uppercase and lowercase variants: curl/wget use uppercase, - // gRPC C-core (libgrpc) checks lowercase http_proxy/https_proxy. - for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } - } else if let Some(http_addr) = proxy.http_addr { - let proxy_url = format!("http://{http_addr}"); - for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } + if let Some(proxy_url) = configured_proxy_url(policy, netns_fd.is_some())? { + // Both uppercase and lowercase variants: curl/wget use uppercase, + // gRPC C-core (libgrpc) checks lowercase http_proxy/https_proxy. + for (key, value) in child_env::proxy_env_vars(&proxy_url) { + cmd.env(key, value); } } @@ -700,17 +717,9 @@ impl ProcessHandle { cmd.current_dir(dir); } - if matches!(policy.network.mode, NetworkMode::Proxy) { - let proxy = policy.network.proxy.as_ref().ok_or_else(|| { - miette::miette!( - "Network mode is set to proxy but no proxy configuration was provided" - ) - })?; - if let Some(http_addr) = proxy.http_addr { - let proxy_url = format!("http://{http_addr}"); - for (key, value) in child_env::proxy_env_vars(&proxy_url) { - cmd.env(key, value); - } + if let Some(proxy_url) = configured_proxy_url(policy, false)? 
{ + for (key, value) in child_env::proxy_env_vars(&proxy_url) { + cmd.env(key, value); } } diff --git a/crates/openshell-supervisor-process/src/run.rs b/crates/openshell-supervisor-process/src/run.rs index bd3fea91f..3b322413a 100644 --- a/crates/openshell-supervisor-process/src/run.rs +++ b/crates/openshell-supervisor-process/src/run.rs @@ -413,6 +413,13 @@ fn ssh_proxy_url_for_policy( return None; } + if let Ok(proxy_url) = std::env::var(openshell_core::sandbox_env::PROXY_URL) { + let trimmed = proxy_url.trim(); + if !trimmed.is_empty() { + return Some(trimmed.to_string()); + } + } + let proxy = policy.network.proxy.as_ref()?; if let Some(host) = netns_proxy_host { let port = proxy.http_addr.map_or(3128, |addr| addr.port()); @@ -486,6 +493,8 @@ mod tests { FilesystemPolicy, LandlockPolicy, NetworkMode, NetworkPolicy, ProcessPolicy, ProxyPolicy, }; + static PROXY_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + fn policy(mode: NetworkMode, http_addr: Option) -> SandboxPolicy { SandboxPolicy { version: 1, @@ -501,30 +510,56 @@ mod tests { } } + fn with_proxy_url(proxy_url: Option<&str>, test: F) -> T + where + F: FnOnce() -> T, + { + let _guard = PROXY_ENV_LOCK.lock().expect("proxy env lock poisoned"); + temp_env::with_var(openshell_core::sandbox_env::PROXY_URL, proxy_url, test) + } + #[test] fn ssh_proxy_url_uses_policy_addr_without_netns() { - let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 3128).into())); + with_proxy_url(None, || { + let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 3128).into())); - assert_eq!( - ssh_proxy_url_for_policy(&policy, None).as_deref(), - Some("http://127.0.0.1:3128") - ); + assert_eq!( + ssh_proxy_url_for_policy(&policy, None).as_deref(), + Some("http://127.0.0.1:3128") + ); + }); } #[test] fn ssh_proxy_url_prefers_netns_host_with_policy_port() { - let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 8080).into())); + with_proxy_url(None, || { + let policy = policy(NetworkMode::Proxy, 
Some(([127, 0, 0, 1], 8080).into())); - assert_eq!( - ssh_proxy_url_for_policy(&policy, Some([10, 200, 0, 1].into())).as_deref(), - Some("http://10.200.0.1:8080") - ); + assert_eq!( + ssh_proxy_url_for_policy(&policy, Some([10, 200, 0, 1].into())).as_deref(), + Some("http://10.200.0.1:8080") + ); + }); } #[test] fn ssh_proxy_url_skips_non_proxy_mode() { - let policy = policy(NetworkMode::Allow, Some(([127, 0, 0, 1], 3128).into())); + with_proxy_url(None, || { + let policy = policy(NetworkMode::Allow, Some(([127, 0, 0, 1], 3128).into())); + + assert_eq!(ssh_proxy_url_for_policy(&policy, None), None); + }); + } + + #[test] + fn ssh_proxy_url_prefers_env_override() { + with_proxy_url(Some("http://openshell-supervisor.default.svc:3128"), || { + let policy = policy(NetworkMode::Proxy, Some(([127, 0, 0, 1], 8080).into())); - assert_eq!(ssh_proxy_url_for_policy(&policy, None), None); + assert_eq!( + ssh_proxy_url_for_policy(&policy, Some([10, 200, 0, 1].into())).as_deref(), + Some("http://openshell-supervisor.default.svc:3128") + ); + }); } } diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index 0ed095915..d22e46ca3 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -238,9 +238,9 @@ add `ci/values-spire.yaml` to the OpenShell release values files. | supervisor.image.repository | string | `"ghcr.io/nvidia/openshell/supervisor"` | Supervisor image repository. | | supervisor.image.tag | string | `""` | Supervisor image tag. Defaults to the chart appVersion when empty. | | supervisor.processEnforcement | string | `"network-only"` | Process/filesystem controls applied by the agent process supervisor in non-combined topologies. "network-only" keeps the low-permission agent shape; "full" grants combined-mode process/filesystem controls. | -| supervisor.proxyUid | int | `1337` | UID for the long-running network sidecar in sidecar topology. The network init container installs nftables rules that exempt this UID. 
| +| supervisor.proxyUid | int | `1337` | UID for the long-running network sidecar or proxy supervisor pod. In sidecar topology, the network init container installs nftables rules that exempt this UID. | | supervisor.sideloadMethod | string | `""` | How the supervisor binary is delivered into sandbox pods. Empty (default) = auto-detect from cluster version: K8s >= v1.35 -> "image-volume" (ImageVolume enabled by default; GA in v1.36) K8s < v1.35 -> "init-container" (copies via init container + emptyDir) On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, set this to "image-volume" explicitly. | -| supervisor.topology | string | `"combined"` | Supervisor pod topology for Kubernetes sandboxes. "combined" runs the current single supervisor container in the agent pod. "sidecar" runs network enforcement in a dedicated sidecar and the process supervisor as a low-capability wrapper in the agent container. | +| supervisor.topology | string | `"combined"` | Supervisor pod topology for Kubernetes sandboxes. "combined" runs the current single supervisor container in the agent pod. "sidecar" runs network enforcement in a dedicated sidecar and the process supervisor as a low-capability wrapper in the agent container. "proxy-pod" runs network enforcement in a separate supervisor Deployment and restricts the agent pod to that supervisor through NetworkPolicy. | | tolerations | list | `[]` | Tolerations for the gateway pod. | | workload.allowMultiReplicaStatefulSet | bool | `false` | Allow replicaCount > 1 while rendering a StatefulSet. Prefer workload.kind=deployment for external database-backed multi-replica gateways; this override exists for operators who explicitly require StatefulSet identity or storage semantics. | | workload.kind | string | `"statefulset"` | Gateway workload controller kind. Use `statefulset` for the default SQLite database, or `deployment` when server.externalDbSecret points at an external database. 
| diff --git a/deploy/helm/openshell/ci/values-proxy-pod.yaml b/deploy/helm/openshell/ci/values-proxy-pod.yaml new file mode 100644 index 000000000..b7cb533fd --- /dev/null +++ b/deploy/helm/openshell/ci/values-proxy-pod.yaml @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# CI/dev overlay for exercising the Kubernetes proxy-pod topology. +# +# This topology relies on Kubernetes NetworkPolicy enforcement: the agent pod is +# isolated to its paired supervisor pod plus DNS. The local k3s/k3d workflow +# must therefore run with the k3s network policy controller enabled, or with a +# custom policy-enforcing CNI installed before deploying this profile. +# +# Merge after values.yaml and ci/values-skaffold.yaml: +# helm install ... -f values.yaml -f ci/values-skaffold.yaml -f ci/values-proxy-pod.yaml +# +# Or set: +# OPENSHELL_E2E_KUBE_EXTRA_VALUES=deploy/helm/openshell/ci/values-proxy-pod.yaml +# before running `mise run e2e:kubernetes`. +supervisor: + topology: proxy-pod diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index 76a9e7a5b..cf99be69a 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -121,6 +121,11 @@ deploy: #- ci/values-spire.yaml # To exercise the Kubernetes supervisor sidecar topology: #- ci/values-sidecar.yaml + # To exercise proxy-pod topology, use the proxy-pod Skaffold profile + # against a cluster with NetworkPolicy enforcement enabled. Stock k3s + # includes its embedded network policy controller; if you replace the + # CNI, install a policy-enforcing CNI before deploying this profile. 
+ #- ci/values-proxy-pod.yaml # To test multi-replica external PostgreSQL behavior: #- ci/values-high-availability.yaml setValueTemplates: @@ -134,3 +139,8 @@ profiles: - op: add path: /deploy/helm/releases/0/valuesFiles/- value: ci/values-sidecar.yaml + - name: proxy-pod + patches: + - op: add + path: /deploy/helm/releases/0/valuesFiles/- + value: ci/values-proxy-pod.yaml diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 9637c3328..e065e1f75 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -115,7 +115,7 @@ data: supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} supervisor_topology = {{ .Values.supervisor.topology | default "combined" | quote }} process_enforcement = {{ .Values.supervisor.processEnforcement | default "network-only" | quote }} - proxy_uid = {{ .Values.supervisor.proxyUid | default 1337 }} + proxy_uid = {{ .Values.supervisor.proxyUid | default 1337 }} sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} {{- if .Values.server.providerTokenGrants.spiffe.enabled }} provider_spiffe_workload_api_socket_path = {{ .Values.server.providerTokenGrants.spiffe.workloadApiSocketPath | quote }} diff --git a/deploy/helm/openshell/templates/role.yaml b/deploy/helm/openshell/templates/role.yaml index 5ecc4428a..76d07e992 100644 --- a/deploy/helm/openshell/templates/role.yaml +++ b/deploy/helm/openshell/templates/role.yaml @@ -35,10 +35,54 @@ rules: # returned pod name and UID to the pod's `openshell.io/sandbox-id` # annotation. patch is intentionally NOT granted — the annotation is set # once at pod create and must remain immutable for the lifetime of the - # sandbox. + # sandbox. 
create/delete/list/watch are intentionally not granted; the Agent + # Sandbox controller creates agent pods, and proxy-pod supervisors are + # managed through per-sandbox Deployments. - apiGroups: - "" resources: - pods verbs: - get + # Proxy-pod topology creates one supervisor Deployment, one supervisor + # Service, and one CA Secret per sandbox. All are owner-referenced to the + # Sandbox CR for garbage collection. The gateway also reads the generated + # ReplicaSet during K8s ServiceAccount bootstrap to verify the supervisor + # pod's Pod -> ReplicaSet -> Deployment -> Sandbox owner chain. + - apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - replicasets + verbs: + - get + - apiGroups: + - "" + resources: + - services + - secrets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - create + - delete + - get + - list + - watch diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml index 509eb4279..1619b8ffb 100644 --- a/deploy/helm/openshell/tests/gateway_config_test.yaml +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -92,6 +92,15 @@ tests: path: data["gateway.toml"] pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"sidecar"' + - it: renders proxy-pod supervisor topology under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + set: + supervisor.topology: proxy-pod + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"proxy-pod"' + - it: renders process enforcement under [openshell.drivers.kubernetes] template: templates/gateway-config.yaml set: diff --git a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml 
b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml index ee89fce53..d2a3f27dd 100644 --- a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml +++ b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml @@ -57,6 +57,49 @@ tests: path: metadata.namespace value: other-ns + - it: grants only pod get for sandbox token bootstrap + template: templates/role.yaml + asserts: + - contains: + path: rules + content: + apiGroups: + - "" + resources: + - pods + verbs: + - get + + - it: grants sandbox RBAC for proxy-pod supervisor Deployments + template: templates/role.yaml + asserts: + - contains: + path: rules + content: + apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - watch + + - it: grants ReplicaSet get for proxy-pod supervisor token bootstrap + template: templates/role.yaml + asserts: + - contains: + path: rules + content: + apiGroups: + - apps + resources: + - replicasets + verbs: + - get + - it: uses explicit sandboxNamespace for sandbox RoleBinding template: templates/rolebinding.yaml set: diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index c670a97b8..2f1c16c40 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -48,13 +48,16 @@ supervisor: # "combined" runs the current single supervisor container in the agent pod. # "sidecar" runs network enforcement in a dedicated sidecar and the process # supervisor as a low-capability wrapper in the agent container. + # "proxy-pod" runs network enforcement in a separate supervisor Deployment and + # restricts the agent pod to that supervisor through NetworkPolicy. topology: "combined" # -- Process/filesystem controls applied by the agent process supervisor in # non-combined topologies. "network-only" keeps the low-permission agent # shape; "full" grants combined-mode process/filesystem controls. 
processEnforcement: "network-only" - # -- UID for the long-running network sidecar in sidecar topology. The - # network init container installs nftables rules that exempt this UID. + # -- UID for the long-running network sidecar or proxy supervisor pod. In sidecar + # topology, the network init container installs nftables rules that exempt + # this UID. proxyUid: 1337 # -- Image pull secrets attached to gateway and helper pods. diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index cc886c168..d9fd86c74 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -161,7 +161,8 @@ The most commonly changed values are: | `pkiInitJob.serverDnsNames` / `certManager.serverDnsNames` | Additional gateway server DNS SANs. Wildcard SANs also enable sandbox service URLs under that domain. | | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect based on cluster version: clusters running Kubernetes 1.35 or later use `image-volume` (ImageVolume GA in 1.36); older clusters use `init-container`. Set explicitly to `image-volume` on Kubernetes 1.33 or 1.34 with the ImageVolume feature gate enabled, or to `init-container` to force the legacy path on any version. | | `supervisor.topology` | Sandbox pod topology. Refer to [Topology](/kubernetes/topology). | -| `supervisor.proxyUid` | Non-root UID for the long-running network sidecar when `supervisor.topology=sidecar`. The UID must not match the sandbox UID. | +| `supervisor.processEnforcement` | Process/filesystem controls for non-combined topologies. Leave as `network-only` for the low-permission agent shape, or set `full` to keep combined-mode process/filesystem guards with added agent-container permissions. | +| `supervisor.proxyUid` | Non-root UID for the long-running network sidecar or proxy supervisor pod. The UID must not match the sandbox UID. 
| Use a values file for repeatable deployments: @@ -215,6 +216,10 @@ The namespaced Role covers sandbox lifecycle and identity: | `agents.x-k8s.io` | `sandboxes`, `sandboxes/status` | create, delete, get, list, patch, update, watch | | `""` | `events` | get, list, watch | | `""` | `pods` | get | +| `apps` | `deployments` | create, delete, get, list, watch | +| `apps` | `replicasets` | get | +| `""` | `services`, `secrets` | create, delete, get, list, watch | +| `networking.k8s.io` | `networkpolicies` | create, delete, get, list, watch | The ClusterRole grants node inspection and token validation: @@ -245,7 +250,7 @@ The gateway exposes `/healthz` for process liveness and `/readyz` for dependency ## Next Steps -- To choose between combined and sidecar sandbox pods, refer to [Topology](/kubernetes/topology). +- To choose between combined, sidecar, and proxy-pod sandbox topology, refer to [Topology](/kubernetes/topology). - To enable automatic certificate rotation with cert-manager, refer to [Managing Certificates](/kubernetes/managing-certificates). - To expose the gateway externally without port-forwarding, refer to [Ingress](/kubernetes/ingress). - To configure OIDC or reverse-proxy authentication, refer to [Access Control](/kubernetes/access-control). diff --git a/docs/kubernetes/topology.mdx b/docs/kubernetes/topology.mdx index 5bf942e35..4f0e195d4 100644 --- a/docs/kubernetes/topology.mdx +++ b/docs/kubernetes/topology.mdx @@ -3,14 +3,15 @@ # SPDX-License-Identifier: Apache-2.0 title: "Kubernetes Sandbox Topology" sidebar-title: "Topology" -description: "Choose between combined and sidecar supervisor topology for Kubernetes sandbox pods." +description: "Choose between combined, sidecar, and proxy-pod topology for Kubernetes sandbox pods." keywords: "Generative AI, Cybersecurity, Kubernetes, Sandboxing, Sidecar, Network Policy, RuntimeClass" position: 2 --- -Kubernetes sandbox pods can run the OpenShell supervisor in `combined` or -`sidecar` topology. 
Choose the topology based on which controls you need inside -the pod and how much privilege your cluster allows on the agent container. +Kubernetes sandbox pods can run the OpenShell supervisor in `combined`, +`sidecar`, or `proxy-pod` topology. Choose the topology based on which controls +you need inside the pod, how much privilege your cluster allows on the agent +container, and whether the cluster enforces Kubernetes NetworkPolicies. ## Choose a Topology @@ -22,6 +23,7 @@ lower-privilege agent container. |---|---|---| | `combined` | You need OpenShell network, filesystem, and process controls in the sandbox workload. | The agent container carries the Linux capabilities the supervisor needs. | | `sidecar` | You need the agent container to run as non-root without added Linux capabilities, and network policy is the primary control. | Defaults to network-only process supervision unless you opt in to `processEnforcement=full`. | +| `proxy-pod` | You need network enforcement to run outside the agent pod and your cluster enforces Kubernetes NetworkPolicies. | Requires a NetworkPolicy-enforcing CNI or controller; defaults to network-only process supervision unless you opt in to `processEnforcement=full`. | ## Privilege Model @@ -33,6 +35,9 @@ The long-running container permissions differ by topology: | `sidecar` | Agent container, process-only supervisor (`network-only`) | `sandbox_uid:sandbox_gid` | `false` | Drops `ALL` | Agent and workload run without added Linux capabilities. | | `sidecar` | Agent container, process-only supervisor (`full`) | Root supervisor | Not explicitly disabled by the driver | Adds combined-mode capabilities | Agent keeps combined-mode process/filesystem guards. | | `sidecar` | Network supervisor sidecar | `proxyUid:sandbox_gid` | `false` | Drops `ALL` | Long-running proxy sidecar is also non-root without added capabilities. 
| +| `proxy-pod` | Agent pod container, process-only supervisor (`network-only`) | `sandbox_uid:sandbox_gid` | `false` | Drops `ALL` | Agent and workload run without added Linux capabilities in their own pod. | +| `proxy-pod` | Agent pod container, process-only supervisor (`full`) | Root supervisor | Not explicitly disabled by the driver | Adds combined-mode capabilities | Agent keeps combined-mode process/filesystem guards. | +| `proxy-pod` | Supervisor pod container, network proxy only | `proxyUid:sandbox_gid` | `false` | Drops `ALL` | Long-running proxy runs outside the agent pod without added capabilities. | Short-lived setup containers still have the permissions needed to prepare the pod: @@ -41,6 +46,8 @@ pod: |---|---|---|---|---|---| | `combined` | Supervisor install init container | `0` | Not set | Not set | Copies the supervisor binary into the agent container volume. | | `sidecar` | Network init container | `0` | `false` | Drops `ALL`; adds `NET_ADMIN`, `NET_RAW`, `CHOWN`, and `FOWNER` | Installs pod-local nftables rules and prepares shared sidecar state. | +| `proxy-pod` | Supervisor install init container | `0` | Not set | Not set | Copies the supervisor binary into the agent pod volume. | +| `proxy-pod` | Proxy CA install init container | `0:sandbox_gid` | `false` | Drops `ALL` | Copies proxy CA material into the agent pod TLS volume. | ## Combined Topology @@ -144,12 +151,77 @@ dropping, or process/binary identity checks unless you opt in to `supervisor.processEnforcement=full`. +## Proxy-Pod Topology + +Proxy-pod topology moves network enforcement and gateway forwarding into a +separate supervisor Deployment with one pod. The agent pod runs the process +supervisor and reaches the supervisor through a per-sandbox headless Service. + +```mermaid +flowchart TB + Sandbox["agents.x-k8s.io Sandbox"] + + subgraph Namespace["Sandbox namespace"] + subgraph AgentPod["Agent pod"] + ProcessSupervisor["process supervisor<br/>network-only by default"] + Workload["Agent workload"] + end + + SupervisorDeployment["Supervisor Deployment<br/>1 replica"] + subgraph SupervisorPod["Supervisor pod"] + NetworkProxy["network supervisor proxy<br/>proxyUid"] + end + + Service["Headless Service"] + ProxyCA["Proxy CA Secret"] + AgentEgressPolicy["NetworkPolicy<br/>agent egress to supervisor + DNS"] + SupervisorIngressPolicy["NetworkPolicy<br/>supervisor ingress from paired agent"] + end + + Gateway["OpenShell Gateway"] + External["External services"] + + Sandbox --> AgentPod + Sandbox --> SupervisorDeployment + SupervisorDeployment --> SupervisorPod + ProcessSupervisor --> Workload + AgentPod -->|"egress allowed by NetworkPolicy"| Service + Service --> NetworkProxy + NetworkProxy -->|"gateway forwarding"| Gateway + NetworkProxy -->|"policy-enforced egress"| External + ProxyCA -. mounted .- AgentPod + ProxyCA -. mounted .- SupervisorPod + AgentEgressPolicy -. selects .- AgentPod + SupervisorIngressPolicy -. selects .- SupervisorPod +``` + +OpenShell creates these per-sandbox resources: + +- Agent pod labeled `openshell.ai/sandbox-role=agent`. +- Supervisor Deployment with one pod labeled `openshell.ai/sandbox-role=supervisor`. +- Headless Service for the supervisor pod. +- Proxy CA Secret shared through mounts. +- NetworkPolicy that limits agent egress to the supervisor pod and DNS. +- NetworkPolicy that accepts supervisor ingress only from the paired agent pod. + +The supervisor Deployment has a controlling `Sandbox` ownerReference so +Kubernetes garbage collection removes it when the sandbox is deleted. The +Deployment recreates the supervisor pod if the pod is deleted independently. + + +Proxy-pod topology requires NetworkPolicy enforcement to work as OpenShell +expects. The target cluster must have a policy-enforcing CNI or equivalent +NetworkPolicy controller before deploying this topology. Without enforcement, +the agent pod is not forced through its paired supervisor proxy, so the +agent-to-supervisor isolation policy is only declarative. + + ## Credential Exposure -Sidecar topology uses pod `fsGroup` and group-readable projected credentials so -the non-root process supervisor can authenticate to the gateway. This includes -the projected ServiceAccount token used for sandbox token bootstrap and the -sandbox client TLS secret. 
+Sidecar and proxy-pod topologies use pod `fsGroup` and group-readable projected +credentials so the non-root process supervisor can authenticate to the gateway. +This includes the projected ServiceAccount token used for sandbox token +bootstrap and the sandbox client TLS secret. Treat the agent container as trusted with respect to those in-pod gateway credentials. Use `combined` topology when that credential exposure is not @@ -157,14 +229,18 @@ acceptable for your deployment. ## RuntimeClass Isolation -Sidecar topology pairs well with runtime classes such as gVisor or Kata -Containers when the cluster supports them. A sandboxed runtime strengthens the -container boundary while OpenShell focuses on network policy enforcement from -the sidecar. +RuntimeClass isolation can add a stronger container boundary, but support +depends on the topology and runtime: + +- `proxy-pod` has been tested with Kata Containers and gVisor and is functional + when the cluster enforces NetworkPolicies. +- `sidecar` is experimental with Kata Containers and is known to fail with + gVisor because sidecar mode depends on pod-local network rule setup. -Runtime classes do not re-enable the OpenShell filesystem and process controls -that sidecar mode relaxes. Use them as an additional workload boundary, not as a -replacement for the combined topology's full supervisor controls. +Runtime classes do not re-enable OpenShell filesystem and process controls when +sidecar and proxy-pod modes use the default `network-only` process +enforcement. Use RuntimeClass isolation as an additional workload boundary, not +as a replacement for combined topology or `processEnforcement=full`. 
You can set a default runtime class in the Kubernetes driver configuration or override it per sandbox with driver config: @@ -175,9 +251,10 @@ openshell sandbox create \ -- claude ``` -## Enable Sidecar Mode +## Enable Alternate Topologies -For direct gateway TOML configuration, set the Kubernetes driver fields: +For direct gateway TOML configuration, set the Kubernetes driver fields for +sidecar mode: ```toml [openshell.drivers.kubernetes] @@ -186,14 +263,27 @@ process_enforcement = "network-only" proxy_uid = 1337 ``` -`proxy_uid` must be a non-root UID and must not match the sandbox UID. -The network init container exempts this UID from proxy redirection so the -sidecar can reach the gateway. -Set `process_enforcement="full"` only when you want the agent process supervisor -to keep combined-mode process/filesystem guards and accept the added +Set `supervisor_topology="proxy-pod"` to use proxy-pod mode: + +```toml +[openshell.drivers.kubernetes] +supervisor_topology = "proxy-pod" +process_enforcement = "network-only" +proxy_uid = 1337 +``` + +`proxy_uid` must be a non-root UID and must not match the sandbox UID. In +sidecar mode, the network init container exempts this UID from proxy +redirection so the sidecar can reach the gateway. In proxy-pod mode, the same +value is used as the non-root UID for the proxy supervisor pod created by the +Deployment. + +Set `process_enforcement="full"` only when you want the agent process +supervisor to keep combined-mode process/filesystem guards and accept the added agent-container permissions. 
-When the Helm chart renders `gateway.toml`, set the equivalent chart values: +When the Helm chart renders `gateway.toml`, set the equivalent chart values for +sidecar mode: ```yaml supervisor: @@ -202,6 +292,15 @@ supervisor: proxyUid: 1337 ``` +Set `supervisor.topology=proxy-pod` to use proxy-pod mode: + +```yaml +supervisor: + topology: proxy-pod + processEnforcement: network-only + proxyUid: 1337 +``` + Leave `supervisor_topology` unset, or set it to `combined`, to keep the original single-container supervisor path. For Helm installs, leave `supervisor.topology` unset or set it to `combined`. diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index fd125231e..717b049b4 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -180,13 +180,16 @@ supervisor_sideload_method = "image-volume" # "combined" runs the existing single supervisor container with full process, # filesystem, and network enforcement in the agent container. "sidecar" moves # pod-level network enforcement and gateway forwarding into a network sidecar. +# "proxy-pod" moves network enforcement and gateway forwarding into a separate +# supervisor Deployment and uses NetworkPolicy to force agent egress through it. supervisor_topology = "combined" # Process/filesystem controls for non-combined topologies. "network-only" # keeps the low-permission agent shape; "full" grants combined-mode # process/filesystem controls to the agent process supervisor. process_enforcement = "network-only" -# UID used by the long-running network sidecar. In sidecar topology the -# network init container installs nftables rules that exempt this UID. +# UID used by the long-running network sidecar or proxy supervisor pod. In +# sidecar topology, the network init container installs nftables rules that +# exempt this UID. 
proxy_uid = 1337 grpc_endpoint = "https://openshell-gateway.agents.svc:8080" ssh_socket_path = "/run/openshell/ssh.sock" diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 3dc305154..5b80e2fa8 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -304,9 +304,9 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA | `supervisor_image` | `supervisor.image.repository` / `supervisor.image.tag` | Set the supervisor image that provides the `openshell-sandbox` binary. | | `supervisor_image_pull_policy` | `supervisor.image.pullPolicy` | Set the Kubernetes image pull policy for the supervisor image. | | `supervisor_sideload_method` | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect from cluster version. Set to `image-volume` to mount the supervisor OCI image directly as a volume (requires Kubernetes 1.33+ with the ImageVolume feature gate; GA in 1.36), or `init-container` to copy it through an init container on older clusters. | -| `supervisor_topology` | `supervisor.topology` | Set `combined` for the default single supervisor path, or `sidecar` to move pod-level network enforcement and gateway forwarding into a dedicated sidecar. | +| `supervisor_topology` | `supervisor.topology` | Set `combined` for the default single supervisor path, `sidecar` to move pod-level network enforcement and gateway forwarding into a dedicated sidecar, or `proxy-pod` to run network enforcement in a separate supervisor Deployment with NetworkPolicy isolation. | | `process_enforcement` | `supervisor.processEnforcement` | Process/filesystem controls for non-combined topologies. `network-only` keeps the low-permission agent shape. `full` grants combined-mode process/filesystem controls to the agent process supervisor. 
| -| `proxy_uid` | `supervisor.proxyUid` | UID used by the long-running network sidecar in `sidecar` topology. The network init container exempts this UID from proxy redirection. | +| `proxy_uid` | `supervisor.proxyUid` | UID used by the long-running network sidecar or proxy supervisor pod. In `sidecar` topology, the network init container exempts this UID from proxy redirection. | | `app_armor_profile` | `server.appArmorProfile` | Set the sandbox agent container's AppArmor profile. Helm defaults this to `Unconfined` so AppArmor-enabled nodes do not block supervisor network namespace setup. Set the Helm value to an empty string to omit the field, or use `RuntimeDefault` or `Localhost/` for operator-managed profiles. | | `workspace_default_storage_size` | `server.workspaceDefaultStorageSize` | Set the default workspace PVC size for new sandboxes. | | `sa_token_ttl_secs` | `server.sandboxJwt.k8sSaTokenTtlSecs` | Set the projected ServiceAccount token TTL used for the bootstrap token exchange. | @@ -314,13 +314,16 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA In `combined` topology, the agent container carries the Linux capabilities needed by the supervisor for network namespace setup, Landlock filesystem policy, process privilege changes, and network policy enforcement. In `sidecar` -topology, the agent container runs as the resolved sandbox UID/GID with no added -Linux capabilities. A root init container performs the nftables setup, and the -long-running sidecar runs non-root with no added Linux capabilities. Sidecar -mode keeps gateway session and SSH behavior, but the process supervisor runs in -`network-only` mode by default: filesystem policy, process privilege dropping, -and process/binary identity checks are not applied by the process container. 
-Set `process_enforcement = "full"` only when you want those combined-mode +and `proxy-pod` topology, the agent container runs as the resolved sandbox +UID/GID with no added Linux capabilities. Sidecar mode uses a root init +container for nftables setup and a long-running non-root sidecar. Proxy-pod mode +creates a separate non-root supervisor Deployment with one pod, a headless +Service, a proxy CA Secret, and per-sandbox NetworkPolicies. The Deployment +recreates the supervisor pod if it is deleted. Both modes keep gateway session +and SSH behavior, but the process supervisor runs in `network-only` mode by +default: filesystem policy, process privilege dropping, and process/binary +identity checks are not applied by the process container. Set +`process_enforcement = "full"` only when you want those combined-mode process/filesystem guards and accept the added agent-container permissions. Sidecar mode uses pod `fsGroup` so the non-root process supervisor can read the diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index 8ce1989da..8a84fc6ad 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -20,6 +20,12 @@ # files, relative to the repository root or absolute, to layer additional chart # configuration on top of ci/values-skaffold.yaml. # +# Proxy-pod topology: +# Use OPENSHELL_E2E_KUBE_EXTRA_VALUES=deploy/helm/openshell/ci/values-proxy-pod.yaml +# or `mise run e2e:kubernetes:proxy-pod`. The target cluster must enforce +# Kubernetes NetworkPolicies; the ephemeral k3d/k3s path keeps k3s's embedded +# network policy controller enabled. +# # Image source: # - Ephemeral k3d mode builds local `openshell/{gateway,supervisor}:${IMAGE_TAG}` # images by default, imports them into k3d, then installs the chart. 
This @@ -80,6 +86,7 @@ EXTERNAL_PG_FIXTURE_SERVICE="openshell-e2e-postgres" EXTERNAL_PG_FIXTURE_USER="openshell" EXTERNAL_PG_FIXTURE_PASSWORD="openshell-e2e-postgres" EXTERNAL_PG_FIXTURE_DATABASE="openshell" +PROXY_POD_E2E=0 # Isolate CLI/SDK gateway metadata from the developer's real config. export XDG_CONFIG_HOME="${WORKDIR}/config" @@ -611,6 +618,9 @@ if [ -n "${OPENSHELL_E2E_KUBE_EXTRA_VALUES:-}" ]; then IFS=':' read -r -a extra_values_files <<< "${OPENSHELL_E2E_KUBE_EXTRA_VALUES}" for values_file in "${extra_values_files[@]}"; do [ -n "${values_file}" ] || continue + if [[ "${values_file}" == *"values-proxy-pod.yaml" ]]; then + PROXY_POD_E2E=1 + fi if [[ "${values_file}" != /* ]]; then values_file="${ROOT}/${values_file}" fi @@ -618,6 +628,11 @@ if [ -n "${OPENSHELL_E2E_KUBE_EXTRA_VALUES:-}" ]; then done fi +if [ "${PROXY_POD_E2E}" = "1" ]; then + echo "Proxy-pod e2e profile enabled; target cluster must enforce Kubernetes NetworkPolicies." + echo "Ephemeral k3d/k3s mode uses k3s's embedded NetworkPolicy controller unless the cluster is customized externally." 
+fi + if [ "${OPENSHELL_E2E_KUBE_DB_SCENARIOS:-0}" = "1" ]; then # --- Multi-scenario mode: test all database backends --- DB_PASSED=0 diff --git a/tasks/helm.toml b/tasks/helm.toml index 433f04f32..a26c7ecae 100644 --- a/tasks/helm.toml +++ b/tasks/helm.toml @@ -60,6 +60,11 @@ description = "Run skaffold dev with the Kubernetes supervisor sidecar topology" dir = "deploy/helm/openshell" run = "skaffold dev -p sidecar" +["helm:skaffold:dev:proxy-pod"] +description = "Run skaffold dev with proxy-pod topology; requires NetworkPolicy enforcement in the target cluster" +dir = "deploy/helm/openshell" +run = "skaffold dev -p proxy-pod" + ["helm:skaffold:run"] description = "Run skaffold run for deploy/helm/openshell (one-shot deploy)" dir = "deploy/helm/openshell" @@ -70,6 +75,11 @@ description = "Run skaffold run with the Kubernetes supervisor sidecar topology" dir = "deploy/helm/openshell" run = "skaffold run -p sidecar" +["helm:skaffold:run:proxy-pod"] +description = "Run skaffold run with proxy-pod topology; requires NetworkPolicy enforcement in the target cluster" +dir = "deploy/helm/openshell" +run = "skaffold run -p proxy-pod" + ["helm:skaffold:delete"] description = "Run skaffold delete for deploy/helm/openshell" dir = "deploy/helm/openshell" @@ -80,6 +90,11 @@ description = "Run skaffold delete for the Kubernetes supervisor sidecar topolog dir = "deploy/helm/openshell" run = "skaffold delete -p sidecar" +["helm:skaffold:delete:proxy-pod"] +description = "Run skaffold delete for the Kubernetes proxy-pod topology" +dir = "deploy/helm/openshell" +run = "skaffold delete -p proxy-pod" + ["helm:skaffold:diagnose"] description = "Run skaffold diagnose for deploy/helm/openshell" dir = "deploy/helm/openshell" diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index f9ac186f5..7cbc98429 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -69,6 +69,10 @@ Environment: macOS uses k3d from mise (Docker required). 
Linux can use this flow only when k3d is installed explicitly; otherwise use kind or an existing cluster context. Pair with: mise run helm:skaffold:dev + +The proxy-pod Skaffold profile relies on Kubernetes NetworkPolicy enforcement. +This helper leaves k3s's embedded network policy controller enabled; if you +replace the CNI, install a policy-enforcing CNI before using that profile. EOF } diff --git a/tasks/test.toml b/tasks/test.toml index c08fcc5a0..b8165fd82 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -119,6 +119,11 @@ description = "Run Kubernetes e2e with the supervisor sidecar topology overlay" env = { OPENSHELL_E2E_KUBE_EXTRA_VALUES = "deploy/helm/openshell/ci/values-sidecar.yaml" } run = "e2e/rust/e2e-kubernetes.sh" +["e2e:kubernetes:proxy-pod"] +description = "Run Kubernetes e2e with the proxy-pod topology overlay; requires NetworkPolicy enforcement in the target cluster" +env = { OPENSHELL_E2E_KUBE_EXTRA_VALUES = "deploy/helm/openshell/ci/values-proxy-pod.yaml" } +run = "e2e/rust/e2e-kubernetes.sh" + ["e2e:kubernetes:db"] description = "Run Kubernetes e2e with all database backend scenarios (SQLite and external PostgreSQL with existingSecret)" env = { OPENSHELL_E2E_KUBE_DB_SCENARIOS = "1" }