From 3de55a3e94449790428731f5714b736b3f37a42a Mon Sep 17 00:00:00 2001 From: riddhibhagwat-db Date: Thu, 2 Jul 2026 18:50:03 +0000 Subject: [PATCH] experimental/air: drop max_retries from the ai_runtime_task payload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the ai_runtime_task path, retries are driven by the AI Runtime service, not the Jobs task max_retries field — setting it had no effect on execution retries (a run with max_retries=0 still retried). Stop sending max_retries / retry_on_timeout, matching the Python CLI's native path. The max_retries YAML field and its validation remain in the schema; it is simply not consumed on this submission path. Co-authored-by: Isaac --- experimental/air/cmd/runsubmit.go | 10 ++++------ experimental/air/cmd/runsubmit_test.go | 22 ++++------------------ 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/experimental/air/cmd/runsubmit.go b/experimental/air/cmd/runsubmit.go index b32ebd622c..3cebda33f6 100644 --- a/experimental/air/cmd/runsubmit.go +++ b/experimental/air/cmd/runsubmit.go @@ -66,13 +66,15 @@ type jobEnvironment struct { } // submitTask is the single task air submits: a native ai_runtime_task. +// +// max_retries / retry_on_timeout are intentionally omitted: on the ai_runtime_task +// path retries are driven by the AI Runtime service, not the Jobs task fields, so +// setting them here has no effect (matches the Python CLI's native path). type submitTask struct { TaskKey string `json:"task_key"` RunIf string `json:"run_if"` AiRuntimeTask aiRuntimeTask `json:"ai_runtime_task"` EnvironmentKey string `json:"environment_key"` - MaxRetries int `json:"max_retries"` - RetryOnTimeout bool `json:"retry_on_timeout,omitempty"` } // jobsSubmitRun is the Jobs runs/submit payload. 
@@ -123,11 +125,7 @@ func buildSubmitPayload(cfg *runConfig, commandPath, dlImage string) jobsSubmitR RunIf: "ALL_SUCCESS", AiRuntimeTask: task, EnvironmentKey: aiRuntimeEnvironmentKey, - MaxRetries: cfg.maxRetries(), } - // max_retries 0 (no retries) is sent explicitly; retry_on_timeout only - // applies when retries are allowed. - st.RetryOnTimeout = st.MaxRetries > 0 return jobsSubmitRun{ RunName: cfg.ExperimentName, diff --git a/experimental/air/cmd/runsubmit_test.go b/experimental/air/cmd/runsubmit_test.go index bfd92dcd58..f302c342fb 100644 --- a/experimental/air/cmd/runsubmit_test.go +++ b/experimental/air/cmd/runsubmit_test.go @@ -30,7 +30,6 @@ func TestBuildSubmitPayload(t *testing.T) { ExperimentName: "exp", Command: new("python train.py"), Compute: &computeConfig{AcceleratorType: "GPU_8xH100", NumAccelerators: 16}, - MaxRetries: new(2), TimeoutMinutes: new(30), MLflowRunName: new("run-v2"), MLflowExperimentDirectory: new("/Workspace/Users/me/exp"), @@ -49,8 +48,6 @@ func TestBuildSubmitPayload(t *testing.T) { assert.Equal(t, "exp", task.TaskKey) assert.Equal(t, "ALL_SUCCESS", task.RunIf) assert.Equal(t, aiRuntimeEnvironmentKey, task.EnvironmentKey) - assert.Equal(t, 2, task.MaxRetries) - assert.True(t, task.RetryOnTimeout) at := task.AiRuntimeTask assert.Equal(t, "exp", at.Experiment) @@ -59,24 +56,13 @@ func TestBuildSubmitPayload(t *testing.T) { require.Len(t, at.Deployments, 1) assert.Equal(t, "/d/command.sh", at.Deployments[0].CommandPath) assert.Equal(t, aiRuntimeCompute{AcceleratorType: "GPU_8xH100", AcceleratorCount: 16}, at.Deployments[0].Compute) -} - -func TestBuildSubmitPayload_NoRetries(t *testing.T) { - cfg := &runConfig{ - ExperimentName: "exp", - Command: new("x"), - Compute: &computeConfig{AcceleratorType: "GPU_1xH100", NumAccelerators: 1}, - MaxRetries: new(0), - } - - task := buildSubmitPayload(cfg, "/d/command.sh", "4").Tasks[0] - assert.Equal(t, 0, task.MaxRetries) - assert.False(t, task.RetryOnTimeout) - // max_retries: 0 must be 
sent, not omitted, so the server honors "no retries". + // max_retries / retry_on_timeout are not sent: the ai_runtime_task path does + // not honor them (retries are driven by the AI Runtime service). b, err := json.Marshal(task) require.NoError(t, err) - assert.Contains(t, string(b), `"max_retries":0`) + assert.NotContains(t, string(b), "max_retries") + assert.NotContains(t, string(b), "retry_on_timeout") } func TestSubmitToken(t *testing.T) {