Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions crates/openshell-driver-kubernetes/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,34 @@ impl FromStr for SupervisorSideloadMethod {
}
}

/// How the supervisor is arranged inside Kubernetes sandbox pods.
///
/// Serialized and deserialized through serde with kebab-case variant names,
/// so `Combined` appears as `"combined"` in configuration. `Combined` is
/// currently the only variant and is also the `Default` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum SupervisorTopology {
/// Run networking and process supervision in the agent container.
///
/// This is the default topology.
#[default]
Combined,
}

impl std::fmt::Display for SupervisorTopology {
    /// Writes the lowercase configuration name of the topology (`combined`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Resolve the name first so every variant shares a single write call.
        let name = match *self {
            Self::Combined => "combined",
        };
        f.write_str(name)
    }
}

impl FromStr for SupervisorTopology {
    type Err = String;

    /// Parses the exact, case-sensitive topology name used in configuration.
    ///
    /// # Errors
    ///
    /// Returns a message naming the rejected input when `s` is not a known
    /// topology.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let known = match s {
            "combined" => Some(Self::Combined),
            _ => None,
        };
        known.ok_or_else(|| format!("unknown supervisor topology '{s}'"))
    }
}

/// Kubernetes `AppArmor` profile requested for the sandbox agent container.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AppArmorProfile {
Expand Down Expand Up @@ -176,6 +204,8 @@ pub struct KubernetesComputeConfig {
pub supervisor_image_pull_policy: String,
/// How the supervisor binary is delivered into sandbox pods.
pub supervisor_sideload_method: SupervisorSideloadMethod,
/// How the supervisor is arranged for Kubernetes sandbox pods.
pub supervisor_topology: SupervisorTopology,
pub grpc_endpoint: String,
pub ssh_socket_path: String,
pub client_tls_secret_name: String,
Expand Down Expand Up @@ -236,6 +266,7 @@ impl Default for KubernetesComputeConfig {
supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(),
supervisor_image_pull_policy: String::new(),
supervisor_sideload_method: SupervisorSideloadMethod::default(),
supervisor_topology: SupervisorTopology::default(),
grpc_endpoint: String::new(),
ssh_socket_path: "/run/openshell/ssh.sock".to_string(),
client_tls_secret_name: String::new(),
Expand Down Expand Up @@ -333,6 +364,31 @@ mod tests {
);
}

/// The default driver config selects the combined topology, which displays as
/// `combined`.
#[test]
fn default_supervisor_topology_is_combined() {
    let topology = KubernetesComputeConfig::default().supervisor_topology;
    assert_eq!(topology, SupervisorTopology::Combined);
    assert_eq!(topology.to_string(), "combined");
}

/// Deserializes a config from JSON that sets only `supervisor_topology` and
/// checks that the string `"combined"` maps to the `Combined` variant.
#[test]
fn serde_override_supervisor_topology_combined() {
    let parsed = serde_json::from_value::<KubernetesComputeConfig>(serde_json::json!({
        "supervisor_topology": "combined"
    }))
    .unwrap();
    assert_eq!(parsed.supervisor_topology, SupervisorTopology::Combined);
}

/// An unrecognized topology string must fail deserialization with serde's
/// "unknown variant" error.
#[test]
fn serde_rejects_invalid_supervisor_topology() {
    let outcome = serde_json::from_value::<KubernetesComputeConfig>(serde_json::json!({
        "supervisor_topology": "unsupported"
    }));
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("unknown variant"));
}

#[test]
fn serde_override_workspace_storage_size() {
let json = serde_json::json!({
Expand Down
2 changes: 1 addition & 1 deletion crates/openshell-driver-kubernetes/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pub mod grpc;

pub use config::{
AppArmorProfile, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME, DEFAULT_WORKSPACE_STORAGE_SIZE,
KubernetesComputeConfig, SupervisorSideloadMethod,
KubernetesComputeConfig, SupervisorSideloadMethod, SupervisorTopology,
};
pub use driver::{KubernetesComputeDriver, KubernetesDriverError};
pub use grpc::ComputeDriverService;
10 changes: 9 additions & 1 deletion crates/openshell-driver-kubernetes/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use openshell_core::VERSION;
use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer;
use openshell_driver_kubernetes::{
AppArmorProfile, ComputeDriverService, DEFAULT_SANDBOX_SERVICE_ACCOUNT_NAME,
KubernetesComputeConfig, KubernetesComputeDriver, SupervisorSideloadMethod,
KubernetesComputeConfig, KubernetesComputeDriver, SupervisorSideloadMethod, SupervisorTopology,
};

#[derive(Parser, Debug)]
Expand Down Expand Up @@ -80,6 +80,13 @@ struct Args {
)]
supervisor_sideload_method: SupervisorSideloadMethod,

#[arg(
long,
env = "OPENSHELL_SUPERVISOR_TOPOLOGY",
default_value = "combined"
)]
supervisor_topology: SupervisorTopology,

#[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")]
enable_user_namespaces: bool,

Expand Down Expand Up @@ -117,6 +124,7 @@ async fn main() -> Result<()> {
.unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()),
supervisor_image_pull_policy: args.supervisor_image_pull_policy.unwrap_or_default(),
supervisor_sideload_method: args.supervisor_sideload_method,
supervisor_topology: args.supervisor_topology,
grpc_endpoint: args.grpc_endpoint.unwrap_or_default(),
ssh_socket_path: args.sandbox_ssh_socket_path,
client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(),
Expand Down
1 change: 1 addition & 0 deletions deploy/helm/openshell/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ add `ci/values-spire.yaml` to the OpenShell release values files.
| supervisor.image.repository | string | `"ghcr.io/nvidia/openshell/supervisor"` | Supervisor image repository. |
| supervisor.image.tag | string | `""` | Supervisor image tag. Defaults to the chart appVersion when empty. |
| supervisor.sideloadMethod | string | `""` | How the supervisor binary is delivered into sandbox pods. Empty (default) = auto-detect from cluster version: K8s >= v1.35 -> "image-volume" (ImageVolume enabled by default; GA in v1.36) K8s < v1.35 -> "init-container" (copies via init container + emptyDir) On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled, set this to "image-volume" explicitly. |
| supervisor.topology | string | `"combined"` | Supervisor pod topology for Kubernetes sandboxes. "combined" runs networking and process supervision in the agent container. |
| tolerations | list | `[]` | Tolerations for the gateway pod. |
| workload.allowMultiReplicaStatefulSet | bool | `false` | Allow replicaCount > 1 while rendering a StatefulSet. Prefer workload.kind=deployment for external database-backed multi-replica gateways; this override exists for operators who explicitly require StatefulSet identity or storage semantics. |
| workload.kind | string | `"statefulset"` | Gateway workload controller kind. Use `statefulset` for the default SQLite database, or `deployment` when server.externalDbSecret points at an external database. |
Expand Down
1 change: 1 addition & 0 deletions deploy/helm/openshell/templates/gateway-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ data:
grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }}
service_account_name = {{ include "openshell.sandboxServiceAccountName" . | quote }}
supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }}
supervisor_topology = {{ .Values.supervisor.topology | default "combined" | quote }}
sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }}
{{- if .Values.server.providerTokenGrants.spiffe.enabled }}
provider_spiffe_workload_api_socket_path = {{ .Values.server.providerTokenGrants.spiffe.workloadApiSocketPath | quote }}
Expand Down
16 changes: 16 additions & 0 deletions deploy/helm/openshell/tests/gateway_config_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,22 @@ tests:
path: data["gateway.toml"]
pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?service_account_name\s*=\s*"openshell-sandbox"'

- it: renders combined supervisor topology by default under [openshell.drivers.kubernetes]
template: templates/gateway-config.yaml
asserts:
- matchRegex:
path: data["gateway.toml"]
pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"combined"'

- it: renders explicit combined supervisor topology under [openshell.drivers.kubernetes]
template: templates/gateway-config.yaml
set:
supervisor.topology: combined
asserts:
- matchRegex:
path: data["gateway.toml"]
pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?supervisor_topology\s*=\s*"combined"'

- it: renders sandbox image pull secrets under [openshell.drivers.kubernetes]
template: templates/gateway-config.yaml
set:
Expand Down
3 changes: 3 additions & 0 deletions deploy/helm/openshell/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ supervisor:
# On K8s v1.33-v1.34 with the ImageVolume feature gate manually enabled,
# set this to "image-volume" explicitly.
sideloadMethod: ""
# -- Supervisor pod topology for Kubernetes sandboxes.
# "combined" runs networking and process supervision in the agent container.
topology: "combined"

# -- Image pull secrets attached to gateway and helper pods.
imagePullSecrets: []
Expand Down
2 changes: 2 additions & 0 deletions docs/kubernetes/setup.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ The most commonly changed values are:
| `server.enableLoopbackServiceHttp` | Enable local plaintext HTTP for loopback sandbox service URLs. Defaults to `true`. |
| `pkiInitJob.serverDnsNames` / `certManager.serverDnsNames` | Additional gateway server DNS SANs. Wildcard SANs also enable sandbox service URLs under that domain. |
| `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect based on cluster version: clusters running Kubernetes 1.35 or later use `image-volume` (ImageVolume GA in 1.36); older clusters use `init-container`. Set explicitly to `image-volume` on Kubernetes 1.33 or 1.34 with the ImageVolume feature gate enabled, or to `init-container` to force the legacy path on any version. |
| `supervisor.topology` | Sandbox pod topology. Refer to [Topology](/kubernetes/topology). |

Use a values file for repeatable deployments:

Expand Down Expand Up @@ -243,6 +244,7 @@ The gateway exposes `/healthz` for process liveness and `/readyz` for dependency

## Next Steps

- To review Kubernetes sandbox topology, refer to [Topology](/kubernetes/topology).
- To enable automatic certificate rotation with cert-manager, refer to [Managing Certificates](/kubernetes/managing-certificates).
- To expose the gateway externally without port-forwarding, refer to [Ingress](/kubernetes/ingress).
- To configure OIDC or reverse-proxy authentication, refer to [Access Control](/kubernetes/access-control).
Expand Down
102 changes: 102 additions & 0 deletions docs/kubernetes/topology.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: "Kubernetes Sandbox Topology"
sidebar-title: "Topology"
description: "Review the default combined supervisor topology for Kubernetes sandbox pods."
keywords: "Generative AI, Cybersecurity, Kubernetes, Sandboxing, RuntimeClass"
position: 2
---

Kubernetes sandbox pods run the OpenShell supervisor in `combined` topology by
default. Combined topology keeps network, filesystem, and process controls in
the agent container so the supervisor can enforce the complete OpenShell sandbox
contract before launching the workload.

## Choose a Topology

The default `combined` topology preserves the full OpenShell enforcement model.
Use it when you need OpenShell to apply all sandbox controls inside the workload
pod and your cluster policy permits the required Linux capabilities.

| Topology | Use when | Main tradeoff |
|---|---|---|
| `combined` | You need OpenShell network, filesystem, and process controls in the sandbox workload. | The agent container carries the Linux capabilities the supervisor needs. |

Additional Kubernetes sandbox topologies are still being designed. Until they
are documented as supported configuration values, `combined` is the only
supported value for `supervisor.topology`.

## Privilege Model

The long-running container permissions for `combined` topology are:

| Topology | Pod or container | UID/GID | Privilege escalation | Capabilities | Result |
|---|---|---|---|---|---|
| `combined` | Agent container, which also runs the supervisor | Not forced by topology | Not explicitly disabled by the driver | Adds `SYS_ADMIN`, `NET_ADMIN`, `SYS_PTRACE`, and `SYSLOG`; adds `SETUID`, `SETGID`, and `DAC_READ_SEARCH` when user namespaces are enabled | Full supervisor controls run in the agent container. |

Short-lived setup containers still have the permissions needed to prepare the
pod:

| Topology | Setup container | UID/GID | Privilege escalation | Capabilities | Purpose |
|---|---|---|---|---|---|
| `combined` | Supervisor install init container | `0` | Not set | Not set | Copies the supervisor binary into the agent container volume. |

## Combined Topology

Combined topology is the original Kubernetes mode and remains the default. The
agent container starts the OpenShell supervisor, and the supervisor launches the
workload after applying sandbox setup.

Combined topology keeps these controls in one supervisor path:

- Network endpoint and L7 policy enforcement.
- Filesystem policy enforcement.
- Process and binary identity checks.
- Privilege drop into the sandbox user.
- Gateway relay, SSH sessions, exec, and file sync.

Because the supervisor performs network namespace setup and process/filesystem
controls from the agent container, Kubernetes grants that container elevated
Linux capabilities. Use this mode when you need the complete OpenShell sandbox
contract and your cluster policy permits those capabilities.

## RuntimeClass Isolation

RuntimeClass isolation can add a stronger container boundary for the sandbox
workload when the cluster supports it. Runtime classes do not replace the
combined topology's supervisor controls; they add another isolation boundary
around the same supervised workload.

You can set a default runtime class in the Kubernetes driver configuration or
override it per sandbox by passing driver configuration on the command line:

```shell
openshell sandbox create \
--driver-config-json '{"kubernetes":{"pod":{"runtime_class_name":"kata-containers"}}}' \
-- claude
```

## Configure Combined Mode

For direct gateway TOML configuration, leave `supervisor_topology` unset, or
set it to `combined`, to use the default single-container supervisor path:

```toml
[openshell.drivers.kubernetes]
supervisor_topology = "combined"
```

When the Helm chart renders `gateway.toml`, leave `supervisor.topology` unset,
or set it to `combined`, to produce the same driver configuration:

```yaml
supervisor:
topology: combined
```

## Next Steps

- To install OpenShell on Kubernetes, refer to [Setup](/kubernetes/setup).
- To configure gateway authentication, refer to [Access Control](/kubernetes/access-control).
- To review the driver fields, refer to [Gateway Configuration File](/reference/gateway-config).
3 changes: 3 additions & 0 deletions docs/reference/gateway-config.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,9 @@ supervisor_image_pull_policy = "IfNotPresent"
# Use the image volume on Kubernetes >= 1.35 (GA in 1.36); switch to "init-container"
# on older clusters or where the ImageVolume feature gate is off.
supervisor_sideload_method = "image-volume"
# "combined" runs networking and process supervision in the sandbox agent
# container and preserves the existing Kubernetes sandbox behavior.
supervisor_topology = "combined"
grpc_endpoint = "https://openshell-gateway.agents.svc:8080"
ssh_socket_path = "/run/openshell/ssh.sock"
client_tls_secret_name = "openshell-client-tls"
Expand Down
Loading