diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 4c1153b08..d1a5a5814 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -52,6 +52,34 @@ impl FromStr for SupervisorSideloadMethod { } } +/// How the supervisor is arranged inside Kubernetes sandbox pods. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum SupervisorTopology { + /// Run networking and process supervision in the agent container. + #[default] + Combined, +} + +impl std::fmt::Display for SupervisorTopology { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Combined => f.write_str("combined"), + } + } +} + +impl FromStr for SupervisorTopology { + type Err = String; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "combined" => Ok(Self::Combined), + other => Err(format!("unknown supervisor topology '{other}'")), + } + } +} + /// Kubernetes `AppArmor` profile requested for the sandbox agent container. #[derive(Debug, Clone, PartialEq, Eq)] pub enum AppArmorProfile { @@ -176,6 +204,8 @@ pub struct KubernetesComputeConfig { pub supervisor_image_pull_policy: String, /// How the supervisor binary is delivered into sandbox pods. pub supervisor_sideload_method: SupervisorSideloadMethod, + /// How the supervisor is arranged for Kubernetes sandbox pods. 
+ pub supervisor_topology: SupervisorTopology, pub grpc_endpoint: String, pub ssh_socket_path: String, pub client_tls_secret_name: String, @@ -236,6 +266,7 @@ impl Default for KubernetesComputeConfig { supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), supervisor_image_pull_policy: String::new(), supervisor_sideload_method: SupervisorSideloadMethod::default(), + supervisor_topology: SupervisorTopology::default(), grpc_endpoint: String::new(), ssh_socket_path: "/run/openshell/ssh.sock".to_string(), client_tls_secret_name: String::new(), @@ -333,6 +364,31 @@ mod tests { ); } + #[test] + fn default_supervisor_topology_is_combined() { + let cfg = KubernetesComputeConfig::default(); + assert_eq!(cfg.supervisor_topology, SupervisorTopology::Combined); + assert_eq!(cfg.supervisor_topology.to_string(), "combined"); + } + + #[test] + fn serde_override_supervisor_topology_combined() { + let json = serde_json::json!({ + "supervisor_topology": "combined" + }); + let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap(); + assert_eq!(cfg.supervisor_topology, SupervisorTopology::Combined); + } + + #[test] + fn serde_rejects_invalid_supervisor_topology() { + let json = serde_json::json!({ + "supervisor_topology": "unsupported" + }); + let err = serde_json::from_value::<KubernetesComputeConfig>(json).unwrap_err(); + assert!(err.to_string().contains("unknown variant")); + } + #[test] fn serde_override_workspace_storage_size() { let json = serde_json::json!({ diff --git a/crates/openshell-driver-kubernetes/src/lib.rs b/crates/openshell-driver-kubernetes/src/lib.rs index 22b0a8703..953ed4abd 100644 --- a/crates/openshell-driver-kubernetes/src/lib.rs +++ b/crates/openshell-driver-kubernetes/src/lib.rs @@ -7,7 +7,7 @@ pub mod grpc; pub use config::{ AppArmorProfile, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_WORKSPACE_STORAGE_SIZE, - KubernetesComputeConfig, SupervisorSideloadMethod, + KubernetesComputeConfig, SupervisorSideloadMethod, SupervisorTopology, }; pub use 
driver::{KubernetesComputeDriver, KubernetesDriverError}; pub use grpc::ComputeDriverService; diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index f7eeeba42..d755613d6 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -11,7 +11,7 @@ use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_kubernetes::{ AppArmorProfile, ComputeDriverService, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, - KubernetesComputeConfig, KubernetesComputeDriver, SupervisorSideloadMethod, + KubernetesComputeConfig, KubernetesComputeDriver, SupervisorSideloadMethod, SupervisorTopology, }; #[derive(Parser, Debug)] @@ -80,6 +80,13 @@ struct Args { )] supervisor_sideload_method: SupervisorSideloadMethod, + #[arg( + long, + env = "OPENSHELL_SUPERVISOR_TOPOLOGY", + default_value = "combined" + )] + supervisor_topology: SupervisorTopology, + #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] enable_user_namespaces: bool, @@ -117,6 +124,7 @@ async fn main() -> Result<()> { .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()), supervisor_image_pull_policy: args.supervisor_image_pull_policy.unwrap_or_default(), supervisor_sideload_method: args.supervisor_sideload_method, + supervisor_topology: args.supervisor_topology, grpc_endpoint: args.grpc_endpoint.unwrap_or_default(), ssh_socket_path: args.sandbox_ssh_socket_path, client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(), diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index e6d539592..36802e205 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -237,6 +237,7 @@ add `ci/values-spire.yaml` to the OpenShell release values files. | supervisor.image.repository | string | `"ghcr.io/nvidia/openshell/supervisor"` | Supervisor image repository. 
| | supervisor.image.tag | string | `""` | Supervisor image tag. Defaults to the chart appVersion when empty. | | supervisor.sideloadMethod | string | `""` | How the supervisor binary is delivered into sandbox pods. Empty (default) = auto-detect from cluster version: K8s >= v1.35 -> "image-volume" (ImageVolume enabled by default; GA in v1.36) K8s < v1.35 -> "init-container" (copies via init container + emptyDir) On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, set this to "image-volume" explicitly. | +| supervisor.topology | string | `"combined"` | Supervisor pod topology for Kubernetes sandboxes. "combined" runs networking and process supervision in the agent container. | | tolerations | list | `[]` | Tolerations for the gateway pod. | | workload.allowMultiReplicaStatefulSet | bool | `false` | Allow replicaCount > 1 while rendering a StatefulSet. Prefer workload.kind=deployment for external database-backed multi-replica gateways; this override exists for operators who explicitly require StatefulSet identity or storage semantics. | | workload.kind | string | `"statefulset"` | Gateway workload controller kind. Use `statefulset` for the default SQLite database, or `deployment` when server.externalDbSecret points at an external database. | diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 7037be88f..0e75a311f 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -113,6 +113,7 @@ data: grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} service_account_name = {{ include "openshell.sandboxServiceAccountName" . | quote }} supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . 
| quote }} + supervisor_topology = {{ .Values.supervisor.topology | default "combined" | quote }} sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} {{- if .Values.server.providerTokenGrants.spiffe.enabled }} provider_spiffe_workload_api_socket_path = {{ .Values.server.providerTokenGrants.spiffe.workloadApiSocketPath | quote }} diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml index c2708a20f..50051e003 100644 --- a/deploy/helm/openshell/tests/gateway_config_test.yaml +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -83,6 +83,22 @@ tests: path: data["gateway.toml"] pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?service_account_name\s*=\s*"openshell-sandbox"' + - it: renders combined supervisor topology by default under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"combined"' + + - it: renders explicit combined supervisor topology under [openshell.drivers.kubernetes] + template: templates/gateway-config.yaml + set: + supervisor.topology: combined + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"combined"' + - it: renders sandbox image pull secrets under [openshell.drivers.kubernetes] template: templates/gateway-config.yaml set: diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index d7ff8b257..d769ed4bf 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -44,6 +44,9 @@ supervisor: # On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, # set this to "image-volume" explicitly. sideloadMethod: "" + # -- Supervisor pod topology for Kubernetes sandboxes. 
+ # "combined" runs networking and process supervision in the agent container. + topology: "combined" # -- Image pull secrets attached to gateway and helper pods. imagePullSecrets: [] diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index 5ab786519..f6051b123 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -160,6 +160,7 @@ The most commonly changed values are: | `server.enableLoopbackServiceHttp` | Enable local plaintext HTTP for loopback sandbox service URLs. Defaults to `true`. | | `pkiInitJob.serverDnsNames` / `certManager.serverDnsNames` | Additional gateway server DNS SANs. Wildcard SANs also enable sandbox service URLs under that domain. | | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect based on cluster version: clusters running Kubernetes 1.35 or later use `image-volume` (ImageVolume GA in 1.36); older clusters use `init-container`. Set explicitly to `image-volume` on Kubernetes 1.33 or 1.34 with the ImageVolume feature gate enabled, or to `init-container` to force the legacy path on any version. | +| `supervisor.topology` | Sandbox pod topology. Refer to [Topology](/kubernetes/topology). | Use a values file for repeatable deployments: @@ -243,6 +244,7 @@ The gateway exposes `/healthz` for process liveness and `/readyz` for dependency ## Next Steps +- To review Kubernetes sandbox topology, refer to [Topology](/kubernetes/topology). - To enable automatic certificate rotation with cert-manager, refer to [Managing Certificates](/kubernetes/managing-certificates). - To expose the gateway externally without port-forwarding, refer to [Ingress](/kubernetes/ingress). - To configure OIDC or reverse-proxy authentication, refer to [Access Control](/kubernetes/access-control). 
diff --git a/docs/kubernetes/topology.mdx b/docs/kubernetes/topology.mdx new file mode 100644 index 000000000..f3f69cf6e --- /dev/null +++ b/docs/kubernetes/topology.mdx @@ -0,0 +1,102 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Kubernetes Sandbox Topology" +sidebar-title: "Topology" +description: "Review the default combined supervisor topology for Kubernetes sandbox pods." +keywords: "Generative AI, Cybersecurity, Kubernetes, Sandboxing, RuntimeClass" +position: 2 +--- + +Kubernetes sandbox pods run the OpenShell supervisor in `combined` topology by +default. Combined topology keeps network, filesystem, and process controls in +the agent pod so the supervisor can enforce the complete OpenShell sandbox +contract before launching the workload. + +## Choose a Topology + +The default `combined` topology preserves the full OpenShell enforcement model. +Use it when you need OpenShell to apply all sandbox controls inside the workload +pod and your cluster policy permits the required Linux capabilities. + +| Topology | Use when | Main tradeoff | +|---|---|---| +| `combined` | You need OpenShell network, filesystem, and process controls in the sandbox workload. | The agent container carries the Linux capabilities the supervisor needs. | + +Additional Kubernetes sandbox topologies are still being designed. Until they +are documented as supported configuration values, `combined` is the only +supported value for `supervisor.topology`. 
+ +## Privilege Model + +The long-running container permissions for `combined` topology are: + +| Topology | Pod or container | UID/GID | Privilege escalation | Capabilities | Result | +|---|---|---|---|---|---| +| `combined` | Agent container, which also runs the supervisor | Not forced by topology | Not explicitly disabled by the driver | Adds `SYS_ADMIN`, `NET_ADMIN`, `SYS_PTRACE`, and `SYSLOG`; adds `SETUID`, `SETGID`, and `DAC_READ_SEARCH` when user namespaces are enabled | Full supervisor controls run in the agent container. | + +Short-lived setup containers still have the permissions needed to prepare the +pod: + +| Topology | Setup container | UID/GID | Privilege escalation | Capabilities | Purpose | +|---|---|---|---|---|---| +| `combined` | Supervisor install init container | `0` | Not set | Not set | Copies the supervisor binary into the agent container volume. Present only when the `init-container` sideload method is in use. | + +## Combined Topology + +Combined topology is the original Kubernetes mode and remains the default. The +agent container starts the OpenShell supervisor, and the supervisor launches the +workload after applying sandbox setup. + +Combined topology keeps these controls in one supervisor path: + +- Network endpoint and L7 policy enforcement. +- Filesystem policy enforcement. +- Process and binary identity checks. +- Privilege drop into the sandbox user. +- Gateway relay, SSH sessions, exec, and file sync. + +Because the supervisor performs network namespace setup and process/filesystem +controls from the agent container, Kubernetes grants that container elevated +Linux capabilities. Use this mode when you need the complete OpenShell sandbox +contract and your cluster policy permits those capabilities. + +## RuntimeClass Isolation + +RuntimeClass isolation can add a stronger container boundary for the sandbox +workload when the cluster supports it. Runtime classes do not replace the +combined topology's supervisor controls; they add another isolation boundary +around the same supervised workload. 
+ +You can set a default runtime class in the Kubernetes driver configuration or +override it per sandbox with driver config: + +```shell +openshell sandbox create \ + --driver-config-json '{"kubernetes":{"pod":{"runtime_class_name":"kata-containers"}}}' \ + -- claude +``` + +## Configure Combined Mode + +For direct gateway TOML configuration, leave `supervisor_topology` unset, or +set it to `combined`, to use the default single-container supervisor path: + +```toml +[openshell.drivers.kubernetes] +supervisor_topology = "combined" +``` + +When the Helm chart renders `gateway.toml`, leave `supervisor.topology` unset, +or set it to `combined`, to produce the same driver configuration: + +```yaml +supervisor: + topology: combined +``` + +## Next Steps + +- To install OpenShell on Kubernetes, refer to [Setup](/kubernetes/setup). +- To configure gateway authentication, refer to [Access Control](/kubernetes/access-control). +- To review the driver fields, refer to [Gateway Configuration File](/reference/gateway-config). diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx index 2aaa6e7b0..9fa3a45fc 100644 --- a/docs/reference/gateway-config.mdx +++ b/docs/reference/gateway-config.mdx @@ -177,6 +177,9 @@ supervisor_image_pull_policy = "IfNotPresent" # Use the image volume on Kubernetes >= 1.35 (GA in 1.36); switch to "init-container" # on older clusters or where the ImageVolume feature gate is off. supervisor_sideload_method = "image-volume" +# "combined" runs networking and process supervision in the sandbox agent +# container and preserves the existing Kubernetes sandbox behavior. +supervisor_topology = "combined" grpc_endpoint = "https://openshell-gateway.agents.svc:8080" ssh_socket_path = "/run/openshell/ssh.sock" client_tls_secret_name = "openshell-client-tls"