diff --git a/apps/webapp/app/presenters/v3/ApiBatchResultsPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiBatchResultsPresenter.server.ts index b3dd39637da..cd3c2c9e33a 100644 --- a/apps/webapp/app/presenters/v3/ApiBatchResultsPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ApiBatchResultsPresenter.server.ts @@ -10,19 +10,22 @@ export class ApiBatchResultsPresenter extends BasePresenter { env: AuthenticatedEnvironment ): Promise { return this.traceWithEnv("call", env, async (span) => { - const batchRun = await this._prisma.batchTaskRun.findFirst({ - where: { - friendlyId, - runtimeEnvironmentId: env.id, - }, - include: { - items: { - select: { - taskRunId: true, + // Route through the store so a NEW-resident batch resolves under the run-ops split (the + // router probes NEW→LEGACY and drops this client hint) instead of 404ing on a control-plane read. + const batchRun = await runStore.findBatchTaskRunByFriendlyId( + friendlyId, + env.id, + { + include: { + items: { + select: { + taskRunId: true, + }, }, }, }, - }); + this._prisma + ); if (!batchRun) { return undefined; diff --git a/apps/webapp/test/presenters/ApiBatchResultsPresenter.split.test.ts b/apps/webapp/test/presenters/ApiBatchResultsPresenter.split.test.ts new file mode 100644 index 00000000000..4a75cf281d9 --- /dev/null +++ b/apps/webapp/test/presenters/ApiBatchResultsPresenter.split.test.ts @@ -0,0 +1,97 @@ +// Run-ops split resolution LOCK for ApiBatchResultsPresenter. +// +// GET /api/v1/batches/:id/results constructs the presenter BARE (no injected client), so it must +// resolve a batch that lives in the NEW run-ops DB on its own. The presenter routes the batch-row +// lookup through the `runStore` singleton, whose split router probes NEW→LEGACY. This drives a +// NEW-resident (ksuid) batch through a REAL two-physical-DB split router and asserts the bare +// presenter finds it. Fails before the fix (the presenter read the control-plane DB directly and +// 404'd on a NEW-resident batch). 
+ +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore, RoutingRunStore } from "@internal/run-store"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import type { Organization, PrismaClient, Project } from "@trigger.dev/database"; +import { generateKsuidId } from "@trigger.dev/core/v3/isomorphic"; +import { expect, vi } from "vitest"; +import { ApiBatchResultsPresenter } from "~/presenters/v3/ApiBatchResultsPresenter.server"; +import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; + +// The split router built over the two testcontainer DBs; injected in place of the db.server-backed +// singleton the presenter imports. Populated per-test before the presenter is constructed. +let testRunStore: RoutingRunStore; + +// Presenter reads the batch row via `runStore`; child-run reads also go through it. Neutralize the +// real db.server singleton (no env DB) and the runStore singleton (use the split router below). +// The getter defers to `testRunStore` so each test can set its own router before constructing. 
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); +vi.mock("~/v3/runStore.server", () => ({ + get runStore() { + return testRunStore; + }, +})); + +vi.setConfig({ testTimeout: 60_000 }); + +function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const legacyStore = new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); + const newStore = new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); + return new RoutingRunStore({ new: newStore, legacy: legacyStore }); +} + +function authEnv(environmentId: string): AuthenticatedEnvironment { + return { + id: environmentId, + type: "DEVELOPMENT", + project: { id: "proj_split" } as Project, + organization: { id: "org_split" } as Organization, + orgMember: null, + } as unknown as AuthenticatedEnvironment; +} + +heteroRunOpsPostgresTest( + "a bare ApiBatchResultsPresenter resolves a NEW-resident (ksuid) batch under the split", + async ({ prisma14, prisma17 }) => { + testRunStore = makeSplitRouter(prisma14, prisma17); + + const environmentId = "env_split_res"; + // ksuid internal id → classifies to the NEW store, seeded in the NEW (prisma17) DB. The + // friendlyId probe fans out NEW→LEGACY regardless of id shape, so the NEW seed is what matters. + const batchInternalId = generateKsuidId(); + const batchFriendlyId = `batch_${generateKsuidId()}`; + + await prisma17.batchTaskRun.create({ + data: { + id: batchInternalId, + friendlyId: batchFriendlyId, + runtimeEnvironmentId: environmentId, + }, + }); + + // Bare construction — exactly how the results route builds it. + const presenter = new ApiBatchResultsPresenter(); + const result = await presenter.call(batchFriendlyId, authEnv(environmentId)); + + // Before the fix this 404s (undefined) because a control-plane read misses the NEW-resident batch. 
+    expect(result).toEqual({ id: batchFriendlyId, items: [] });
+  }
+);
+
+heteroRunOpsPostgresTest(
+  "a bare ApiBatchResultsPresenter still returns undefined for a genuinely missing batch",
+  async ({ prisma14, prisma17 }) => {
+    testRunStore = makeSplitRouter(prisma14, prisma17);
+
+    const presenter = new ApiBatchResultsPresenter();
+    const result = await presenter.call("batch_does_not_exist", authEnv("env_none"));
+
+    expect(result).toBeUndefined();
+  }
+);
diff --git a/apps/webapp/test/updateMetadataStoreRoutingHetero.test.ts b/apps/webapp/test/updateMetadataStoreRoutingHetero.test.ts
new file mode 100644
index 00000000000..eded6bccf99
--- /dev/null
+++ b/apps/webapp/test/updateMetadataStoreRoutingHetero.test.ts
@@ -0,0 +1,478 @@
+import { heteroPostgresTest } from "@internal/testcontainers";
+import { PostgresRunStore } from "@internal/run-store";
+import type { ReadClient, RunStore } from "@internal/run-store";
+import type { Prisma, PrismaClient } from "@trigger.dev/database";
+import { parsePacket } from "@trigger.dev/core/v3";
+import { generateKsuidId } from "@trigger.dev/core/v3/isomorphic";
+import { setTimeout } from "timers/promises";
+import { describe, expect, vi } from "vitest";
+import { UpdateMetadataService } from "~/services/metadata/updateMetadata.server";
+
+vi.setConfig({ testTimeout: 60_000 });
+
+/**
+ * A test-only RunStore that routes residency-bearing operations to one of two
+ * inner PostgresRunStore instances (NEW = PG17, LEGACY = PG14) purely by run-id
+ * classification — NOT by whatever client the service forwards as `tx`.
+ *
+ * This is the load-bearing design point: the UpdateMetadataService forwards
+ * `this._prisma` as the tx/client to every findRun/updateMetadata call.
To prove + * STORE residency routing (and not the forwarded prisma), this wrapper IGNORES + * the forwarded client for residency-bearing calls and resolves to its own inner + * store by id length, then calls the inner store WITHOUT forwarding the outer tx + * (passes undefined), so the inner PostgresRunStore uses its own prisma17/prisma14. + * + * Classification contract (length-disjoint): 27-char id => KSUID => NEW store; + * 25-char cuid => LEGACY store. + */ +class RoutingRunStore implements RunStore { + readonly #newStore: PostgresRunStore; + readonly #legacyStore: PostgresRunStore; + + constructor(newStore: PostgresRunStore, legacyStore: PostgresRunStore) { + this.#newStore = newStore; + this.#legacyStore = legacyStore; + } + + // Resolve by run-id length: 27 => NEW (KSUID), otherwise LEGACY (25-char cuid). + #resolveById(runId: string): PostgresRunStore { + return runId.length === 27 ? this.#newStore : this.#legacyStore; + } + + // Extract a classifiable run id from a `where`. Prefers `where.id`; if only a + // friendlyId is present we cannot classify by length, so the caller falls back + // to read-through (try NEW, then LEGACY). + #idFromWhere(where: Prisma.TaskRunWhereInput): string | undefined { + const id = (where as { id?: unknown }).id; + return typeof id === "string" ? id : undefined; + } + + // ---- Reads (residency routing; drop forwarded client) ---- + + async findRun( + where: Prisma.TaskRunWhereInput, + argsOrClient?: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude } | ReadClient, + _client?: ReadClient + ): Promise { + const id = this.#idFromWhere(where); + if (id !== undefined) { + // Classifiable by id length — route to the owning store, dropping the + // forwarded client so the inner store uses its OWN prisma. + return (this.#resolveById(id).findRun as any)(where, argsOrClient); + } + // Not classifiable (friendlyId-only / other) — read-through: NEW then LEGACY. 
+ const fromNew = await (this.#newStore.findRun as any)(where, argsOrClient); + if (fromNew) { + return fromNew; + } + return (this.#legacyStore.findRun as any)(where, argsOrClient); + } + + async findRunOrThrow( + where: Prisma.TaskRunWhereInput, + argsOrClient?: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude } | ReadClient, + _client?: ReadClient + ): Promise { + const id = this.#idFromWhere(where); + if (id !== undefined) { + return (this.#resolveById(id).findRunOrThrow as any)(where, argsOrClient); + } + const fromNew = await (this.#newStore.findRun as any)(where, argsOrClient); + if (fromNew) { + return fromNew; + } + return (this.#legacyStore.findRunOrThrow as any)(where, argsOrClient); + } + + async findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args?: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude } + ): Promise { + const id = this.#idFromWhere(where); + if (id !== undefined) { + return (this.#resolveById(id).findRunOnPrimary as any)(where, args); + } + const fromNew = await (this.#newStore.findRunOnPrimary as any)(where, args); + if (fromNew) { + return fromNew; + } + return (this.#legacyStore.findRunOnPrimary as any)(where, args); + } + + async findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args?: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude } + ): Promise { + const id = this.#idFromWhere(where); + if (id !== undefined) { + return (this.#resolveById(id).findRunOrThrowOnPrimary as any)(where, args); + } + const fromNew = await (this.#newStore.findRunOnPrimary as any)(where, args); + if (fromNew) { + return fromNew; + } + return (this.#legacyStore.findRunOrThrowOnPrimary as any)(where, args); + } + + async findRuns( + args: { where: Prisma.TaskRunWhereInput }, + _client?: ReadClient + ): Promise { + const id = this.#idFromWhere(args.where); + if (id !== undefined) { + return (this.#resolveById(id).findRuns as any)(args); + } + // Read-through across both stores, NEW first. 
+ const fromNew = (await (this.#newStore.findRuns as any)(args)) as unknown[]; + const fromLegacy = (await (this.#legacyStore.findRuns as any)(args)) as unknown[]; + return [...fromNew, ...fromLegacy]; + } + + // ---- Field touches (residency routing; drop forwarded tx) ---- + + async updateMetadata( + runId: string, + data: Parameters[1], + options: Parameters[2], + _tx?: unknown + ): Promise<{ count: number }> { + // Route by run id, dropping the forwarded tx so the inner store writes to + // its OWN prisma — this is what proves the CAS targets the owning store. + return this.#resolveById(runId).updateMetadata(runId, data, options); + } + + // ---- Everything else: delegate by run id to satisfy the RunStore interface; + // not exercised by these tests. ---- + + createRun(params: any, _tx?: unknown): any { + return this.#resolveById(params.data.id).createRun(params); + } + createCancelledRun(params: any, _tx?: unknown): any { + return this.#resolveById(params.data.id).createCancelledRun(params); + } + createFailedRun(params: any, _tx?: unknown): any { + return this.#resolveById(params.data.id).createFailedRun(params); + } + startAttempt(runId: string, data: any, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).startAttempt as any)(runId, data, args); + } + completeAttemptSuccess(runId: string, data: any, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).completeAttemptSuccess as any)(runId, data, args); + } + recordRetryOutcome(runId: string, data: any, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).recordRetryOutcome as any)(runId, data, args); + } + requeueRun(runId: string, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).requeueRun as any)(runId, args); + } + recordBulkActionMembership(runId: string, bulkActionId: string, _tx?: unknown): any { + return this.#resolveById(runId).recordBulkActionMembership(runId, bulkActionId); + } + cancelRun(runId: string, data: any, args: any, _tx?: 
unknown): any { + return (this.#resolveById(runId).cancelRun as any)(runId, data, args); + } + failRunPermanently(runId: string, data: any, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).failRunPermanently as any)(runId, data, args); + } + expireRun(runId: string, data: any, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).expireRun as any)(runId, data, args); + } + expireRunsBatch(runIds: string[], data: any, _tx?: unknown): any { + return this.#resolveById(runIds[0] ?? "").expireRunsBatch(runIds, data); + } + lockRunToWorker(runId: string, data: any, _tx?: unknown): any { + return this.#resolveById(runId).lockRunToWorker(runId, data); + } + parkPendingVersion(runId: string, data: any, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).parkPendingVersion as any)(runId, data, args); + } + promotePendingVersionRuns(runId: string, _tx?: unknown): any { + return this.#resolveById(runId).promotePendingVersionRuns(runId); + } + suspendForCheckpoint(runId: string, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).suspendForCheckpoint as any)(runId, args); + } + resumeFromCheckpoint(runId: string, args: any, _tx?: unknown): any { + return (this.#resolveById(runId).resumeFromCheckpoint as any)(runId, args); + } + rescheduleRun(runId: string, data: any, _tx?: unknown): any { + return this.#resolveById(runId).rescheduleRun(runId, data); + } + enqueueDelayedRun(runId: string, data: any, _tx?: unknown): any { + return this.#resolveById(runId).enqueueDelayedRun(runId, data); + } + rewriteDebouncedRun(runId: string, data: any, _tx?: unknown): any { + return this.#resolveById(runId).rewriteDebouncedRun(runId, data); + } + clearIdempotencyKey(params: any, _tx?: unknown): any { + const runId = params?.byId?.runId ?? 
""; + return this.#resolveById(runId).clearIdempotencyKey(params); + } + pushTags(runId: string, tags: string[], where: any, _tx?: unknown): any { + return this.#resolveById(runId).pushTags(runId, tags, where); + } + pushRealtimeStream(runId: string, streamId: string, _tx?: unknown): any { + return this.#resolveById(runId).pushRealtimeStream(runId, streamId); + } +} + +function buildRoutingStore(prisma17: PrismaClient, prisma14: PrismaClient) { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + return new RoutingRunStore(newStore, legacyStore); +} + +// 25-char cuid-format id (starts with 'c'), length-disjoint from the 27-char KSUID. +function generateLegacyCuid() { + const suffix = Array.from( + { length: 24 }, + () => "0123456789abcdefghijklmnopqrstuvwxyz"[Math.floor(Math.random() * 36)] + ).join(""); + return `c${suffix}`; +} + +async function seedOrgProjectEnv(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `test-${suffix}`, slug: `test-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `test-${suffix}`, + slug: `test-${suffix}`, + organizationId: organization.id, + externalRef: `test-${suffix}`, + }, + }); + const runtimeEnvironment = await prisma.runtimeEnvironment.create({ + data: { + slug: `test-${suffix}`, + type: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + apiKey: `test-${suffix}`, + pkApiKey: `test-${suffix}`, + shortcode: `test-${suffix}`, + }, + }); + return { organization, project, runtimeEnvironment }; +} + +describe("UpdateMetadataService store routing (hetero)", () => { + heteroPostgresTest( + "routes read+CAS to the owning (NEW/PG17) store for a KSUID run", + async ({ prisma17, prisma14 }) => { + const runId = generateKsuidId(); + expect(runId.length).toBe(27); + + const { project, 
organization, runtimeEnvironment } = await seedOrgProjectEnv( + prisma17, + "new" + ); + + const seeded = await prisma17.taskRun.create({ + data: { + id: runId, + friendlyId: `run_${runId}`, + taskIdentifier: "my-task", + payload: JSON.stringify({ foo: "bar" }), + traceId: "1234", + spanId: "1234", + queue: "test", + runtimeEnvironmentId: runtimeEnvironment.id, + projectId: project.id, + organizationId: organization.id, + environmentType: "DEVELOPMENT", + engine: "V2", + }, + }); + + const service = new UpdateMetadataService({ + // prisma is set to one of the clients only to satisfy the required option; + // the routing store deliberately does NOT honor it for residency. + prisma: prisma17, + runStore: buildRoutingStore(prisma17, prisma14), + flushIntervalMs: 100, + flushEnabled: true, + flushLoggingEnabled: true, + maximumSize: 1024 * 1024 * 1, + logLevel: "error", + }); + + const result = await service.call(runId, { + operations: [{ type: "set", key: "foo", value: "bar" }], + }); + + expect(result?.metadata).toEqual({ foo: "bar" }); + + // The owning store (PG17) has the update with version incremented by exactly 1. + const newRow = await prisma17.taskRun.findFirst({ where: { id: runId } }); + expect(newRow).not.toBeNull(); + const newMetadata = await parsePacket({ + data: newRow?.metadata ?? undefined, + dataType: newRow?.metadataType ?? "application/json", + }); + expect(newMetadata).toEqual({ foo: "bar" }); + // CAS incremented the version by exactly 1. + expect(newRow?.metadataVersion).toBe(seeded.metadataVersion + 1); + + // The LEGACY store (PG14) never saw this id — no cross-DB leakage. 
+ const legacyRow = await prisma14.taskRun.findFirst({ where: { id: runId } }); + expect(legacyRow).toBeNull(); + + service.stopFlushing(); + } + ); + + heteroPostgresTest( + "preserves CAS under concurrent writers on a NEW-DB (PG17) run", + async ({ prisma17, prisma14 }) => { + const runId = generateKsuidId(); + expect(runId.length).toBe(27); + + const { project, organization, runtimeEnvironment } = await seedOrgProjectEnv( + prisma17, + "cas" + ); + + const seeded = await prisma17.taskRun.create({ + data: { + id: runId, + friendlyId: `run_${runId}`, + taskIdentifier: "my-task", + payload: JSON.stringify({ foo: "bar" }), + traceId: "1234", + spanId: "1234", + queue: "test", + runtimeEnvironmentId: runtimeEnvironment.id, + projectId: project.id, + organizationId: organization.id, + environmentType: "DEVELOPMENT", + engine: "V2", + }, + }); + + let onAfterReadCallCount = 0; + + const service = new UpdateMetadataService({ + prisma: prisma17, + runStore: buildRoutingStore(prisma17, prisma14), + flushIntervalMs: 100, + flushEnabled: true, + flushLoggingEnabled: true, + maximumSize: 1024 * 1024 * 1, + logLevel: "error", + onAfterRead: async (rId) => { + onAfterReadCallCount++; + // Simulate a concurrent writer landing between the service's read and CAS, + // for the first 3 reads — forcing CAS count===0 and a retry each time. + // The concurrent writes go straight to PG17 (the owning DB). + if (onAfterReadCallCount <= 3) { + await prisma17.taskRun.updateMany({ + where: { id: rId }, + data: { + metadata: JSON.stringify({ concurrent: `update${onAfterReadCallCount}` }), + metadataVersion: { increment: 1 }, + }, + }); + } + }, + }); + + const result = await service.call(runId, { + operations: [{ type: "set", key: "immediate", value: "value1" }], + }); + + // Initial read + 3 retries. + expect(onAfterReadCallCount).toBe(4); + + // No lost update: the final state reflects BOTH the last concurrent write and + // the service's operation. 
+ expect(result?.metadata).toEqual({ concurrent: "update3", immediate: "value1" }); + + // Let the buffered (post-retry) operation flush to the owning store. + await setTimeout(1000); + + const newRow = await prisma17.taskRun.findFirst({ where: { id: runId } }); + const metadata = await parsePacket({ + data: newRow?.metadata ?? undefined, + dataType: newRow?.metadataType ?? "application/json", + }); + expect(metadata).toEqual({ concurrent: "update3", immediate: "value1" }); + + // 3 concurrent increments + 1 successful service CAS, relative to the seed. + expect(newRow?.metadataVersion).toBe(seeded.metadataVersion + 4); + + // LEGACY store untouched. + const legacyRow = await prisma14.taskRun.findFirst({ where: { id: runId } }); + expect(legacyRow).toBeNull(); + + service.stopFlushing(); + } + ); + + heteroPostgresTest( + "routes read-through + CAS to the LEGACY (PG14) store for a cuid run without spanning DBs", + async ({ prisma17, prisma14 }) => { + const runId = generateLegacyCuid(); + expect(runId.length).toBe(25); + + const { project, organization, runtimeEnvironment } = await seedOrgProjectEnv( + prisma14, + "legacy" + ); + + const seeded = await prisma14.taskRun.create({ + data: { + id: runId, + friendlyId: `run_${runId}`, + taskIdentifier: "my-task", + payload: JSON.stringify({ foo: "bar" }), + traceId: "1234", + spanId: "1234", + queue: "test", + runtimeEnvironmentId: runtimeEnvironment.id, + projectId: project.id, + organizationId: organization.id, + environmentType: "DEVELOPMENT", + engine: "V2", + }, + }); + + const service = new UpdateMetadataService({ + prisma: prisma17, + runStore: buildRoutingStore(prisma17, prisma14), + flushIntervalMs: 100, + flushEnabled: true, + flushLoggingEnabled: true, + maximumSize: 1024 * 1024 * 1, + logLevel: "error", + }); + + // Call WITHOUT an environment arg, so the `where` is just `{ id: runId }` and + // the router classifies by id length (25 => LEGACY). 
+ const result = await service.call(runId, { + operations: [{ type: "set", key: "x", value: 1 }], + }); + + expect(result?.metadata).toEqual({ x: 1 }); + + // The owning LEGACY store (PG14) got the update. + const legacyRow = await prisma14.taskRun.findFirst({ where: { id: runId } }); + expect(legacyRow).not.toBeNull(); + const legacyMetadata = await parsePacket({ + data: legacyRow?.metadata ?? undefined, + dataType: legacyRow?.metadataType ?? "application/json", + }); + expect(legacyMetadata).toEqual({ x: 1 }); + // CAS incremented the version by exactly 1. + expect(legacyRow?.metadataVersion).toBe(seeded.metadataVersion + 1); + + // The NEW store (PG17) never saw a write for this id — read-through resolved to + // LEGACY and the CAS targeted the SAME store. + const newRow = await prisma17.taskRun.findFirst({ where: { id: runId } }); + expect(newRow).toBeNull(); + + service.stopFlushing(); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/controlPlaneResolver.ts b/internal-packages/run-engine/src/engine/controlPlaneResolver.ts new file mode 100644 index 00000000000..fe5416a94e8 --- /dev/null +++ b/internal-packages/run-engine/src/engine/controlPlaneResolver.ts @@ -0,0 +1,348 @@ +import { + type BackgroundWorker, + type BackgroundWorkerTask, + type PrismaClient, + Prisma, + RuntimeEnvironmentType, + type TaskQueue, + type WorkerDeployment, +} from "@trigger.dev/database"; +import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic"; +import type { AuthenticatedEnvironment } from "@trigger.dev/core/v3/auth/environment"; + +/** + * Read-side analogue of the `runStore` seam. + * + * Each of the 5 run-rooted reads that threaded control-plane data through a single + * Prisma `include` (env/project/org, the worker version + its tasks/queues) was an + * in-DB join today but a broken cross-provider join once the run-ops DB splits. 
This + * resolver lets the consumer read the run-ops scalars via `runStore` and resolve the + * control-plane half here, so no cross-DB join is required. + * + * The default `PassthroughControlPlaneResolver` runs the SAME in-DB joins as before + * against a single client, so single-DB / self-host behaviour is byte-identical. The + * webapp injects an adapter over its cached cross-DB resolver. + */ + +/** + * The control-plane half of an environment, carrying BOTH the flat scalars some + * consumers read AND the nested shape required by `MinimalAuthenticatedEnvironment`, + * so a `ResolvedEngineEnv` is a structural supertype of it and can be passed directly + * to `enqueueSystem.enqueueRun({ env })`. `concurrencyLimitBurstFactor` stays a + * `Prisma.Decimal` (do NOT coerce); `maximumConcurrencyLimit` is non-null per schema. + */ +export type ResolvedEngineEnv = { + id: string; + type: RuntimeEnvironmentType; + archivedAt: Date | null; + maximumConcurrencyLimit: number; + concurrencyLimitBurstFactor: Prisma.Decimal; + projectId: string; + organizationId: string; + project: { id: string }; + organization: { id: string }; +}; + +/** + * The richer control-plane env primitive: the slim, structural + * `AuthenticatedEnvironment` (slug/branchName/project/organization/…) PLUS the + * `git` JSON column that the runAttemptSystem reads via `safeParseGitMeta`. + * `AuthenticatedEnvironment` does not carry `git`, so the intersection adds it. + */ +export type ResolvedAuthenticatedEnv = AuthenticatedEnvironment & { git: Prisma.JsonValue | null }; + +/** Identical to dequeue's `WorkerDeploymentWithWorkerTasks`. 
*/ +export type ResolvedWorkerVersion = { + worker: BackgroundWorker; + tasks: BackgroundWorkerTask[]; + queues: TaskQueue[]; + deployment: WorkerDeployment | null; +}; + +export interface ControlPlaneResolver { + resolveEnv(environmentId: string): Promise; + resolveAuthenticatedEnv(environmentId: string): Promise; + resolveWorkerVersion(args: { + environmentId: string; + type: RuntimeEnvironmentType; + workerId?: string; + }): Promise; + assertEnvExists(environmentId: string): Promise; +} + +export class PassthroughControlPlaneResolver implements ControlPlaneResolver { + readonly #prisma: PrismaClient; + + constructor(opts: { prisma: PrismaClient }) { + // Reads go through the primary client to stay snapshot-consistent with the run row the + // consumers read on `prisma`/tx, so passthrough is byte-identical to the prior in-DB include. + this.#prisma = opts.prisma; + } + + async resolveEnv(environmentId: string): Promise { + const env = await this.#prisma.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + select: { + id: true, + type: true, + archivedAt: true, + maximumConcurrencyLimit: true, + concurrencyLimitBurstFactor: true, + projectId: true, + project: { select: { id: true, organizationId: true } }, + organization: { select: { id: true } }, + }, + }); + + if (!env) { + return null; + } + + return { + id: env.id, + type: env.type, + archivedAt: env.archivedAt, + maximumConcurrencyLimit: env.maximumConcurrencyLimit, + concurrencyLimitBurstFactor: env.concurrencyLimitBurstFactor, + projectId: env.projectId, + organizationId: env.project.organizationId, + project: { id: env.project.id }, + organization: { id: env.organization.id }, + }; + } + + async resolveAuthenticatedEnv(environmentId: string): Promise { + const env = await this.#prisma.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + include: { + project: true, + organization: true, + orgMember: { + select: { + userId: true, + user: { select: { id: true, displayName: true, name: 
true } }, + }, + }, + }, + }); + + if (!env) { + return null; + } + + return { + id: env.id, + slug: env.slug, + type: env.type, + apiKey: env.apiKey, + organizationId: env.organizationId, + projectId: env.projectId, + orgMemberId: env.orgMemberId, + parentEnvironmentId: env.parentEnvironmentId, + branchName: env.branchName, + archivedAt: env.archivedAt, + paused: env.paused, + shortcode: env.shortcode, + maximumConcurrencyLimit: env.maximumConcurrencyLimit, + // Coerce Prisma's Decimal to a plain number — mirrors toAuthenticated(). + concurrencyLimitBurstFactor: env.concurrencyLimitBurstFactor.toNumber(), + builtInEnvironmentVariableOverrides: env.builtInEnvironmentVariableOverrides, + createdAt: env.createdAt, + updatedAt: env.updatedAt, + project: { + id: env.project.id, + slug: env.project.slug, + name: env.project.name, + externalRef: env.project.externalRef, + engine: env.project.engine, + deletedAt: env.project.deletedAt, + defaultWorkerGroupId: env.project.defaultWorkerGroupId, + organizationId: env.project.organizationId, + builderProjectId: env.project.builderProjectId, + }, + organization: { + id: env.organization.id, + slug: env.organization.slug, + title: env.organization.title, + streamBasinName: env.organization.streamBasinName, + maximumConcurrencyLimit: env.organization.maximumConcurrencyLimit, + runsEnabled: env.organization.runsEnabled, + maximumDevQueueSize: env.organization.maximumDevQueueSize, + maximumDeployedQueueSize: env.organization.maximumDeployedQueueSize, + featureFlags: env.organization.featureFlags, + apiRateLimiterConfig: env.organization.apiRateLimiterConfig, + batchRateLimitConfig: env.organization.batchRateLimitConfig, + batchQueueConcurrencyConfig: env.organization.batchQueueConcurrencyConfig, + }, + orgMember: env.orgMember, + parentEnvironment: null, + git: env.git, + }; + } + + async assertEnvExists(environmentId: string): Promise { + const env = await this.#prisma.runtimeEnvironment.findFirst({ + where: { id: environmentId 
}, + select: { id: true }, + }); + + if (!env) { + throw new Error(`Environment not found: ${environmentId}`); + } + } + + async resolveWorkerVersion(args: { + environmentId: string; + type: RuntimeEnvironmentType; + workerId?: string; + }): Promise { + const { environmentId, type, workerId } = args; + + if (type === "DEVELOPMENT") { + return workerId ? this.#getWorkerById(workerId) : this.#getMostRecentWorker(environmentId); + } + + return workerId + ? this.#getWorkerDeploymentFromWorker(workerId) + : this.#getManagedWorkerFromCurrentlyPromotedDeployment(environmentId); + } + + async #getWorkerDeploymentFromWorker(workerId: string): Promise { + const worker = await this.#prisma.backgroundWorker.findFirst({ + where: { + id: workerId, + }, + include: { + deployment: true, + tasks: true, + queues: true, + }, + }); + + if (!worker) { + return null; + } + + return { + worker, + tasks: worker.tasks, + queues: worker.queues, + deployment: worker.deployment, + }; + } + + async #getMostRecentWorker(environmentId: string): Promise { + const worker = await this.#prisma.backgroundWorker.findFirst({ + where: { + runtimeEnvironmentId: environmentId, + }, + include: { + tasks: true, + queues: true, + }, + orderBy: { + id: "desc", + }, + }); + + if (!worker) { + return null; + } + + return { worker, tasks: worker.tasks, queues: worker.queues, deployment: null }; + } + + async #getWorkerById(workerId: string): Promise { + const worker = await this.#prisma.backgroundWorker.findFirst({ + where: { + id: workerId, + }, + include: { + deployment: true, + tasks: true, + queues: true, + }, + orderBy: { + id: "desc", + }, + }); + + if (!worker) { + return null; + } + + return { + worker, + tasks: worker.tasks, + queues: worker.queues, + deployment: worker.deployment, + }; + } + + async #getManagedWorkerFromCurrentlyPromotedDeployment( + environmentId: string + ): Promise { + const promotion = await this.#prisma.workerDeploymentPromotion.findFirst({ + where: { + environmentId, + label: 
CURRENT_DEPLOYMENT_LABEL, + }, + include: { + deployment: { + include: { + worker: { + include: { + tasks: true, + queues: true, + }, + }, + }, + }, + }, + }); + + if (!promotion || !promotion.deployment.worker) { + return null; + } + + if (promotion.deployment.type === "MANAGED") { + // This is a run engine v2 deployment, so return it + return { + worker: promotion.deployment.worker, + tasks: promotion.deployment.worker.tasks, + queues: promotion.deployment.worker.queues, + deployment: promotion.deployment, + }; + } + + // We need to get the latest run engine v2 deployment + const latestV2Deployment = await this.#prisma.workerDeployment.findFirst({ + where: { + environmentId, + type: "MANAGED", + }, + orderBy: { + id: "desc", + }, + include: { + worker: { + include: { + tasks: true, + queues: true, + }, + }, + }, + }); + + if (!latestV2Deployment?.worker) { + return null; + } + + return { + worker: latestV2Deployment.worker, + tasks: latestV2Deployment.worker.tasks, + queues: latestV2Deployment.worker.queues, + deployment: latestV2Deployment, + }; + } +} diff --git a/internal-packages/run-engine/src/engine/errors.ts b/internal-packages/run-engine/src/engine/errors.ts index 2bb05a304c9..68c146d8e5f 100644 --- a/internal-packages/run-engine/src/engine/errors.ts +++ b/internal-packages/run-engine/src/engine/errors.ts @@ -111,3 +111,20 @@ export class ExecutionSnapshotNotFoundError extends Error { this.name = "ExecutionSnapshotNotFoundError"; } } + +export class UnclassifiableWaitpointId extends Error { + readonly waitpointId: string; + readonly waitpointIdLength: number; + readonly cause?: unknown; + constructor(waitpointId: string, options?: { cause?: unknown }) { + super( + `Unclassifiable waitpointId for completion: length ${waitpointId.length} matches neither cuid nor ksuid — waitpointId=${JSON.stringify( + waitpointId + )}` + ); + this.name = "UnclassifiableWaitpointId"; + this.waitpointId = waitpointId; + this.waitpointIdLength = waitpointId.length; + 
this.cause = options?.cause; + } +} diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 106f5947fe3..40bc954b485 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -74,6 +74,7 @@ import { RunAttemptSystem } from "./systems/runAttemptSystem.js"; import { NoopPendingVersionRunIdLookup } from "./services/pendingVersionLookup.js"; import { SystemResources } from "./systems/systems.js"; import { PostgresRunStore, RunStore } from "@internal/run-store"; +import { ControlPlaneResolver, PassthroughControlPlaneResolver } from "./controlPlaneResolver.js"; import { TtlSystem } from "./systems/ttlSystem.js"; import { WaitpointSystem } from "./systems/waitpointSystem.js"; import { @@ -105,6 +106,7 @@ export class RunEngine { prisma: PrismaClient; readOnlyPrisma: PrismaReplicaClient; runStore: RunStore; + controlPlaneResolver: ControlPlaneResolver; runQueue: RunQueue; eventBus: EventBus = new EventEmitter(); executionSnapshotSystem: ExecutionSnapshotSystem; @@ -126,10 +128,17 @@ export class RunEngine { this.logger = options.logger ?? new Logger("RunEngine", this.options.logLevel ?? "info"); this.prisma = options.prisma; this.readOnlyPrisma = options.readOnlyPrisma ?? this.prisma; - this.runStore = new PostgresRunStore({ - prisma: this.prisma, - readOnlyPrisma: this.readOnlyPrisma, - }); + this.runStore = + options.store ?? + new PostgresRunStore({ + prisma: this.prisma, + readOnlyPrisma: this.readOnlyPrisma, + }); + this.controlPlaneResolver = + options.controlPlaneResolver ?? 
+ new PassthroughControlPlaneResolver({ + prisma: this.prisma, + }); this.runLockRedis = createRedisClient( { ...options.runLock.redis, @@ -321,6 +330,7 @@ export class RunEngine { prisma: this.prisma, readOnlyPrisma: this.readOnlyPrisma, runStore: this.runStore, + controlPlaneResolver: this.controlPlaneResolver, worker: this.worker, eventBus: this.eventBus, logger: this.logger, @@ -425,7 +435,6 @@ export class RunEngine { }); // Initialize BatchQueue for DRR-based batch processing (if configured) - // Only start consumers if consumerDisabled is not set or is false const startBatchQueueConsumers = options.batchQueue?.consumerEnabled ?? true; this.batchQueue = new BatchQueue({ @@ -632,7 +641,11 @@ export class RunEngine { const id = RunId.fromFriendlyId(snapshot.friendlyId); const error: TaskRunError = { type: "STRING_ERROR", raw: cancelReason }; + // App-level replacement for the dropped TaskRun env/project Cascade FKs. + await this.controlPlaneResolver.assertEnvExists(snapshot.environment.id); + try { + // Forward the bare caller tx so the routing store picks the owning DB by id. const taskRun = await this.runStore.createCancelledRun( { data: { @@ -711,7 +724,7 @@ export class RunEngine { organizationId: snapshot.environment.organization.id, }, }, - prisma + tx ); if (emitRunCancelledEvent) { @@ -837,7 +850,6 @@ export class RunEngine { "trigger", async (span) => { // Handle debounce before creating a new run - // Store claimId if we successfully claimed the debounce key let debounceClaimId: string | undefined; if (debounce) { @@ -898,7 +910,8 @@ export class RunEngine { batch, workerId, runnerId, - tx: prisma, + // No tx: the block edge is a routed, run-co-resident write, not part of the + // control-plane trigger tx. Threading it pinned the edge write to the wrong DB. 
}); } @@ -922,10 +935,14 @@ export class RunEngine { // Apply defaultMaxTtl: use as default when no TTL is provided, clamp when larger const resolvedTtl = this.#resolveMaxTtl(ttl); - //create run let taskRun: TaskRun & { associatedWaitpoint: Waitpoint | null }; const taskRunId = RunId.fromFriendlyId(friendlyId); + + // App-level replacement for the dropped TaskRun env/project Cascade FKs. + await this.controlPlaneResolver.assertEnvExists(environment.id); + try { + // Forward the bare caller tx so the routing store picks the owning DB by id. taskRun = await this.runStore.createRun( { data: { @@ -1026,7 +1043,7 @@ export class RunEngine { }) : undefined, }, - prisma + tx ); } catch (error) { if (error instanceof Prisma.PrismaClientKnownRequestError) { @@ -1079,7 +1096,6 @@ export class RunEngine { waitpoints: taskRun.associatedWaitpoint.id, projectId: taskRun.associatedWaitpoint.projectId, batch, - tx: prisma, }); } else { // Single triggerAndWait: acquire the parent run lock to safely transition @@ -1092,7 +1108,6 @@ export class RunEngine { batch, workerId, runnerId, - tx: prisma, }); } } @@ -1258,6 +1273,9 @@ export class RunEngine { async (span) => { const taskRunId = RunId.fromFriendlyId(friendlyId); + // App-level replacement for the dropped TaskRun env/project Cascade FKs. + await this.controlPlaneResolver.assertEnvExists(environment.id); + // Build associated waitpoint data if parent is waiting for this run const waitpointData = resumeParentOnCompletion && parentTaskRunId @@ -1267,7 +1285,6 @@ export class RunEngine { }) : undefined; - // Create the run in terminal SYSTEM_FAILURE status. // No execution snapshot is needed: this run never gets dequeued, executed, // or heartbeated, so nothing will call getLatestExecutionSnapshot on it. 
const taskRun = await this.runStore.createFailedRun( @@ -1302,7 +1319,7 @@ export class RunEngine { }, associatedWaitpoint: waitpointData, }, - this.prisma + undefined ); span.setAttribute("runId", taskRun.id); @@ -1623,6 +1640,7 @@ export class RunEngine { * If you pass an `idempotencyKey`, the waitpoint will be created only if it doesn't already exist. */ async createDateTimeWaitpoint({ + runId, projectId, environmentId, completedAfter, @@ -1630,6 +1648,8 @@ export class RunEngine { idempotencyKeyExpiresAt, tx, }: { + /** The run that will block on this waitpoint. Co-locates the waitpoint with the run's DB. */ + runId?: string; projectId: string; environmentId: string; completedAfter: Date; @@ -1638,6 +1658,7 @@ export class RunEngine { tx?: PrismaClientOrTransaction; }) { return this.waitpointSystem.createDateTimeWaitpoint({ + runId, projectId, environmentId, completedAfter, @@ -1651,6 +1672,7 @@ export class RunEngine { * If you pass an `idempotencyKey` and it already exists, it will return the existing waitpoint. */ async createManualWaitpoint({ + runId, environmentId, projectId, idempotencyKey, @@ -1658,6 +1680,8 @@ export class RunEngine { timeout, tags, }: { + /** The run that will block on this waitpoint. Co-locates the waitpoint with the run's DB. */ + runId?: string; environmentId: string; projectId: string; idempotencyKey?: string; @@ -1666,6 +1690,7 @@ export class RunEngine { tags?: string[]; }): Promise<{ waitpoint: Waitpoint; isCached: boolean }> { return this.waitpointSystem.createManualWaitpoint({ + runId, environmentId, projectId, idempotencyKey, @@ -1693,20 +1718,21 @@ export class RunEngine { organizationId: string; tx?: PrismaClientOrTransaction; }): Promise { - const prisma = tx ?? 
this.prisma; - try { - const waitpoint = await prisma.waitpoint.create({ - data: { - ...WaitpointId.generate(), - type: "BATCH", - idempotencyKey: batchId, - userProvidedIdempotencyKey: false, - completedByBatchId: batchId, - environmentId, - projectId, + const waitpoint = await this.runStore.createWaitpoint( + { + data: { + ...WaitpointId.generate(), + type: "BATCH", + idempotencyKey: batchId, + userProvidedIdempotencyKey: false, + completedByBatchId: batchId, + environmentId, + projectId, + }, }, - }); + tx + ); await this.blockRunWithWaitpoint({ runId, @@ -1714,7 +1740,7 @@ export class RunEngine { projectId, organizationId, batch: { id: batchId }, - tx: prisma, + // No tx: the block edge routes to the run's owning DB, not the control-plane tx. }); return waitpoint; @@ -1829,21 +1855,24 @@ export class RunEngine { projectId: string; waitpointId: string; }): Promise { - const waitpoint = await this.prisma.waitpoint.findFirst({ - where: { id: waitpointId }, - include: { - blockingTaskRuns: { - select: { - taskRun: { - select: { - id: true, - friendlyId: true, + const waitpoint = await this.runStore.findWaitpoint( + { + where: { id: waitpointId }, + include: { + blockingTaskRuns: { + select: { + taskRun: { + select: { + id: true, + friendlyId: true, + }, }, }, }, }, }, - }); + this.prisma + ); if (!waitpoint) return null; if (waitpoint.environmentId !== environmentId) return null; @@ -1904,6 +1933,14 @@ export class RunEngine { isError: boolean; }; }): Promise { + // Consult the cross-seam guard FIRST so an unclassifiable id fails loudly + // here (never a silent local apply). Do NOT branch on decision.store: store routing is + // installed below, as the first statement of waitpointSystem.completeWaitpoint; + // we delegate unconditionally and inherit it. No-op when unset. 
+ const guard = this.options.crossSeamGuard; + if (guard) { + await guard({ waitpointId: id, routeKind: "RESUME_TOKEN" }); + } return this.waitpointSystem.completeWaitpoint({ id, output }); } @@ -2022,7 +2059,7 @@ export class RunEngine { }): Promise { const prisma = tx ?? this.prisma; try { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); + const snapshot = await getLatestExecutionSnapshot(prisma, runId, this.runStore); return executionDataFromSnapshot(snapshot); } catch (e) { this.logger.error("Failed to getRunExecutionData", { @@ -2052,7 +2089,7 @@ export class RunEngine { const prisma = tx ?? (useReplica ? this.readOnlyPrisma : this.prisma); const query = async (client: PrismaClientOrTransaction) => { - const snapshots = await getExecutionSnapshotsSince(client, runId, snapshotId); + const snapshots = await getExecutionSnapshotsSince(client, runId, snapshotId, this.runStore); return snapshots.map(executionDataFromSnapshot); }; @@ -2156,10 +2193,8 @@ export class RunEngine { async quit() { try { - // stop the worker queue observer loop this.workerQueueObserverAbortController?.abort(); - //stop the run queue await this.runQueue.quit(); await this.worker.stop(); await this.ttlWorker.stop(); @@ -2168,13 +2203,11 @@ export class RunEngine { // This is just a failsafe await this.runLockRedis.quit(); - // Close the batch queue and its Redis connections await this.batchQueue.close(); - // Close the debounce system Redis connection await this.debounceSystem.quit(); } catch (error) { - // And should always throw + // Best-effort shutdown; ignore quit/close errors. } } @@ -2222,7 +2255,7 @@ export class RunEngine { } async #repairRun(runId: string, dryRun: boolean) { - const snapshot = await getLatestExecutionSnapshot(this.prisma, runId); + const snapshot = await getLatestExecutionSnapshot(this.prisma, runId, this.runStore); if ( snapshot.executionStatus === "QUEUED" || @@ -2387,7 +2420,7 @@ export class RunEngine { }) { const prisma = tx ?? 
this.prisma; return await this.runLock.lock("handleStalledSnapshot", [runId], async () => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.runStore); if (latestSnapshot.id !== snapshotId) { this.logger.log( "RunEngine.#handleStalledSnapshot() no longer the latest snapshot, stopping the heartbeat.", @@ -2423,19 +2456,7 @@ export class RunEngine { }); //the run didn't start executing, we need to requeue it - const run = await this.runStore.findRun( - { id: runId }, - { - include: { - runtimeEnvironment: { - include: { - organization: true, - }, - }, - }, - }, - prisma - ); + const run = await this.runStore.findRun({ id: runId }, prisma); if (!run) { this.logger.error( @@ -2669,7 +2690,7 @@ export class RunEngine { executionStatus: string; }) { return await this.runLock.lock("handleRepairSnapshot", [runId], async () => { - const latestSnapshot = await getLatestExecutionSnapshot(this.prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(this.prisma, runId, this.runStore); if (latestSnapshot.id !== snapshotId) { this.logger.log( @@ -2832,7 +2853,6 @@ export class RunEngine { }, }); - // Log the finished runs for (const run of runs) { this.logger.info("Concurrency sweeper callback found finished run", { runId: run.id, diff --git a/internal-packages/run-engine/src/engine/systems/batchSystem.test.ts b/internal-packages/run-engine/src/engine/systems/batchSystem.test.ts new file mode 100644 index 00000000000..39a0f965e13 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/batchSystem.test.ts @@ -0,0 +1,712 @@ +import { + assertNonNullable, + containerTestWithIsolatedRedis as containerTest, +} from "@internal/testcontainers"; +import { PostgresRunStore, RoutingRunStore, type RunStore } from "@internal/run-store"; +import { trace } from "@internal/tracing"; +import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; +import 
type { Prisma, PrismaClient } from "@trigger.dev/database"; +import { setTimeout } from "node:timers/promises"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any, store?: RunStore) { + return { + prisma, + ...(store ? { store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 20, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** + * A real PostgresRunStore subclass that counts the four batch-completion reads/writes + * #tryCompleteBatch routes through. super.* runs the genuine implementation, so the routing is + * observed over real containers without ever mocking prisma or the store. + */ +class CountingPostgresRunStore extends PostgresRunStore { + public batchReads = 0; + public memberRunReads = 0; + public batchUpdates = 0; + public waitpointReads = 0; + // Captures the `where` of the most recent findRuns call so a test can prove the member-run read + // is scoped by BOTH batchId AND runtimeEnvironmentId (matching the index), not batchId alone. 
+ public lastFindRunsWhere: any = undefined; + + override async findBatchTaskRunById( + id: string, + args?: any, + client?: any + ): ReturnType { + this.batchReads++; + return super.findBatchTaskRunById(id, args, client); + } + + override async findRuns(args: any, client?: any): Promise { + this.memberRunReads++; + this.lastFindRunsWhere = args?.where; + return super.findRuns(args, client); + } + + override async updateBatchTaskRun( + args: { + where: Prisma.BatchTaskRunWhereUniqueInput; + data: Prisma.BatchTaskRunUpdateInput; + select: S; + }, + tx?: any + ): Promise> { + this.batchUpdates++; + return super.updateBatchTaskRun(args, tx); + } + + override async findWaitpoint( + args: any, + client?: any + ): Promise | null> { + this.waitpointReads++; + return super.findWaitpoint(args, client); + } +} + +/** + * Drives a batchTriggerAndWait batch to all-children-complete, returning the environment, batch + runs needed + * to assert completion. Mirrors the batchTriggerAndWait.test.ts preamble; the parent is blocked on + * the batch waitpoint and both children are run to completion. 
+ */ +async function driveBatchToAllChildrenComplete( + engine: RunEngine, + prisma: PrismaClient, + friendlyPrefix: string +) { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const parentTask = "parent-task"; + const childTask = "child-task"; + await setupBackgroundWorker(engine, environment, [parentTask, childTask]); + + const batch = await prisma.batchTaskRun.create({ + data: { + friendlyId: generateFriendlyId("batch"), + runtimeEnvironmentId: environment.id, + runCount: 2, + processingJobsCount: 2, + }, + }); + + const parentRun = await engine.trigger( + { + number: 1, + friendlyId: generateFriendlyId("run"), + environment, + taskIdentifier: parentTask, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${parentTask}`, + isTest: false, + tags: [], + }, + prisma + ); + + await setTimeout(500); + await engine.dequeueFromWorkerQueue({ consumerId: "test_consumer", workerQueue: "main" }); + + const initialExecutionData = await engine.getRunExecutionData({ runId: parentRun.id }); + assertNonNullable(initialExecutionData); + await engine.startRunAttempt({ + runId: parentRun.id, + snapshotId: initialExecutionData.snapshot.id, + }); + + await engine.blockRunWithCreatedBatch({ + runId: parentRun.id, + batchId: batch.id, + environmentId: environment.id, + projectId: environment.projectId, + organizationId: environment.organizationId, + }); + + const child1 = await engine.trigger( + { + number: 1, + friendlyId: generateFriendlyId("run"), + environment, + taskIdentifier: childTask, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12346", + spanId: "s12346", + workerQueue: "main", + queue: `task/${childTask}`, + isTest: false, + tags: [], + resumeParentOnCompletion: true, + parentTaskRunId: parentRun.id, + batch: { id: batch.id, index: 0 }, + }, + prisma + ); + + const child2 
= await engine.trigger( + { + number: 2, + friendlyId: generateFriendlyId("run"), + environment, + taskIdentifier: childTask, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12347", + spanId: "s12347", + workerQueue: "main", + queue: `task/${childTask}`, + isTest: false, + tags: [], + resumeParentOnCompletion: true, + parentTaskRunId: parentRun.id, + batch: { id: batch.id, index: 1 }, + }, + prisma + ); + + for (const child of [child1, child2]) { + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_consumer", + workerQueue: "main", + }); + const match = dequeued.find((d) => d.run.id === child.id) ?? dequeued[0]; + assertNonNullable(match); + const attempt = await engine.startRunAttempt({ + runId: match.run.id, + snapshotId: match.snapshot.id, + }); + await engine.completeRunAttempt({ + runId: attempt.run.id, + snapshotId: attempt.snapshot.id, + completion: { + id: attempt.run.id, + ok: true, + output: '{"foo":"bar"}', + outputType: "application/json", + }, + }); + } + + await setTimeout(500); + + return { environment, batch, parentRun, child1, child2 }; +} + +describe("RunEngine #tryCompleteBatch store routing", () => { + // Batch completion reads/writes route through the store. + containerTest( + "batch completion reads/writes route through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const { batch } = await driveBatchToAllChildrenComplete(engine, prisma, "run_batch_d"); + + // The batch completes through #tryCompleteBatch (driven by the debounced worker on the last + // child completing). All four reads/writes must have routed through the injected store. 
+ await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + expect(countingStore.batchReads).toBeGreaterThan(0); + expect(countingStore.memberRunReads).toBeGreaterThan(0); + expect(countingStore.batchUpdates).toBeGreaterThan(0); + expect(countingStore.waitpointReads).toBeGreaterThan(0); + + const completedBatch = await prisma.batchTaskRun.findFirst({ where: { id: batch.id } }); + expect(completedBatch?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // The batch waitpoint completion goes through the guarded completeWaitpoint, unblocking + // the parent run. + containerTest( + "waitpoint completion goes through the guarded completeWaitpoint (parent resumes)", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const { batch, parentRun } = await driveBatchToAllChildrenComplete( + engine, + prisma, + "run_batch_e" + ); + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + await setTimeout(1_000); + + const batchWaitpoint = await prisma.waitpoint.findFirst({ + where: { completedByBatchId: batch.id }, + }); + assertNonNullable(batchWaitpoint); + expect(batchWaitpoint.status).toBe("COMPLETED"); + + // the parent is no longer blocked on the batch waitpoint + const remainingParentWaitpoints = await prisma.taskRunWaitpoint.findMany({ + where: { taskRunId: parentRun.id }, + }); + expect(remainingParentWaitpoints.length).toBe(0); + + const parentExecution = await engine.getRunExecutionData({ runId: parentRun.id }); + assertNonNullable(parentExecution); + expect(parentExecution.snapshot.executionStatus).not.toBe("EXECUTING_WITH_WAITPOINTS"); + } finally { + await engine.quit(); + } + } + ); + + // The member-run read is driven by batchId only and does not rely on the + // BatchTaskRun.runtimeEnvironmentId FK. A second batch (distinct batchId) must not leak members + // into the first batch's batchId-scoped read. 
+ containerTest( + "member-run read does not rely on the runtimeEnvironmentId FK (no cross-batch leakage)", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const { batch, environment } = await driveBatchToAllChildrenComplete( + engine, + prisma, + "run_batch_f" + ); + + const otherBatch = await prisma.batchTaskRun.create({ + data: { + friendlyId: generateFriendlyId("batch"), + runtimeEnvironmentId: environment.id, + }, + }); + + // The first batch's batchId-scoped read finds exactly its two members and never the second + // batch's (zero) members — proving batchId alone correctly scopes without the FK predicate. + const membersForFirstBatch = await prisma.taskRun.findMany({ + where: { batchId: batch.id }, + select: { id: true, batchId: true }, + }); + expect(membersForFirstBatch.length).toBe(2); + for (const member of membersForFirstBatch) { + expect(member.batchId).toBe(batch.id); + } + const membersForOtherBatch = await prisma.taskRun.findMany({ + where: { batchId: otherBatch.id }, + }); + expect(membersForOtherBatch.length).toBe(0); + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + const completedBatch = await prisma.batchTaskRun.findFirst({ where: { id: batch.id } }); + expect(completedBatch?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB binds one client (passthrough) — proven by behavior, not store.prisma === prisma. 
+ containerTest( + "single-DB binds one client (passthrough) — batch complete round-trips on the one client", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const { batch } = await driveBatchToAllChildrenComplete(engine, prisma, "run_batch_g"); + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + await setTimeout(1_000); + + const completedBatch = await prisma.batchTaskRun.findFirst({ where: { id: batch.id } }); + expect(completedBatch?.status).toBe("COMPLETED"); + + const batchWaitpoint = await prisma.waitpoint.findFirst({ + where: { completedByBatchId: batch.id }, + }); + assertNonNullable(batchWaitpoint); + expect(batchWaitpoint.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // Residency invariant: inject a DISTINCT, POISONED replica — a JS Proxy over real prisma that + // throws if taskRun/waitpoint/batchTaskRun reads are issued through it. Not a DB mock: a guard + // client proving which client was used. If any routed read defaulted to readOnlyPrisma instead of + // this.$.prisma it would throw; completing cleanly proves the reads use the primary. + containerTest( + "routed batch-completion reads use the primary, never the replica", + async ({ prisma, redisOptions }) => { + // The replica is legitimately read by other systems (e.g. runAttemptSystem) while driving the + // batch. We only want to assert residency for the #tryCompleteBatch reads, so the poison is + // armed just before performCompleteBatch and stays delegating until then. + let armed = false; + + const poisonModel = (real: any) => + new Proxy(real, { + get(target, prop, receiver) { + if (armed && (prop === "findMany" || prop === "findFirst")) { + return () => { + throw new Error("replica read in #tryCompleteBatch — residency regression"); + }; + } + const value = Reflect.get(target, prop, receiver); + return typeof value === "function" ? 
value.bind(target) : value; + }, + }); + + const poisonedReplica = new Proxy(prisma, { + get(target, prop, receiver) { + if (prop === "taskRun" || prop === "waitpoint" || prop === "batchTaskRun") { + return poisonModel((target as any)[prop]); + } + const value = Reflect.get(target, prop, receiver); + return typeof value === "function" ? value.bind(target) : value; + }, + }) as unknown as PrismaClient; + + const countingStore = new CountingPostgresRunStore({ + prisma, + readOnlyPrisma: poisonedReplica, + }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const { batch } = await driveBatchToAllChildrenComplete( + engine, + prisma, + "run_batch_residency" + ); + + // The debounced background tryCompleteBatch (200ms) has already fired by the time the drive + // helper returns (it ends with a 500ms sleep) and is not re-scheduled, so no background job + // races the explicit call below. Re-open the batch so the explicit armed call genuinely + // re-walks ALL FOUR routed reads/writes under the poison — otherwise it short-circuits at + // the `status === "COMPLETED"` guard after only the batch read (vacuous). + await prisma.batchTaskRun.update({ + where: { id: batch.id }, + data: { status: "PENDING" }, + }); + + const beforeBatchReads = countingStore.batchReads; + const beforeMemberRunReads = countingStore.memberRunReads; + const beforeBatchUpdates = countingStore.batchUpdates; + const beforeWaitpointReads = countingStore.waitpointReads; + + // Must not throw: every routed read resolved to the primary, never the poisoned replica. + armed = true; + try { + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + } finally { + armed = false; + } + + // Non-vacuity: every routed read/write actually executed under the armed poison (i.e. the + // explicit call did the full walk, not a short-circuit after the batch read alone). 
+ expect(countingStore.batchReads).toBeGreaterThan(beforeBatchReads); + expect(countingStore.memberRunReads).toBeGreaterThan(beforeMemberRunReads); + expect(countingStore.batchUpdates).toBeGreaterThan(beforeBatchUpdates); + expect(countingStore.waitpointReads).toBeGreaterThan(beforeWaitpointReads); + + const completedBatch = await prisma.batchTaskRun.findFirst({ where: { id: batch.id } }); + expect(completedBatch?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // The env-scoped member-run read is + // findRuns({ where: { batchId, runtimeEnvironmentId } }, this.$.prisma). Assert the where the store + // actually received carries BOTH predicates so the index-scoping isn't silently dropped. + containerTest( + "member-run read is scoped by both batchId and runtimeEnvironmentId", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const { batch, environment } = await driveBatchToAllChildrenComplete( + engine, + prisma, + "run_batch_envscope" + ); + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + expect(countingStore.lastFindRunsWhere?.batchId).toBe(batch.id); + expect(countingStore.lastFindRunsWhere?.runtimeEnvironmentId).toBe(environment.id); + } finally { + await engine.quit(); + } + } + ); + + // Batch not found returns at the `if (!batch)` guard, before any member read. 
+ containerTest( + "batch not found returns early without reading members", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + await engine.batchSystem.performCompleteBatch({ batchId: "batch_nonexistent_xyz" }); + expect(countingStore.memberRunReads).toBe(0); + } finally { + await engine.quit(); + } + } + ); + + // An already-COMPLETED batch returns at the `status === "COMPLETED"` guard. Because + // performCompleteBatch is debounce/retry-driven and can fire twice, a second call must be a no-op: + // no further batch update and no further waitpoint read. + containerTest( + "already-COMPLETED batch returns early (idempotent re-run)", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const { batch } = await driveBatchToAllChildrenComplete( + engine, + prisma, + "run_batch_idempotent" + ); + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + const updatesAfterFirst = countingStore.batchUpdates; + const waitpointReadsAfterFirst = countingStore.waitpointReads; + + // Second call: must short-circuit at the COMPLETED guard. + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + expect(countingStore.batchUpdates).toBe(updatesAfterFirst); + expect(countingStore.waitpointReads).toBe(waitpointReadsAfterFirst); + } finally { + await engine.quit(); + } + } + ); + + // Not-all-runs-processed returns at `processedRunCount < runCount`, before the member + // read. A v1 batch with runCount 2 but processingJobsCount 1 (and no members) must stay + // non-COMPLETED and never read members. 
+ containerTest( + "not-all-runs-processed returns before the member read", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const batch = await prisma.batchTaskRun.create({ + data: { + friendlyId: generateFriendlyId("batch"), + runtimeEnvironmentId: environment.id, + runCount: 2, + processingJobsCount: 1, + }, + }); + + countingStore.memberRunReads = 0; + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + expect(countingStore.memberRunReads).toBe(0); + const stillPending = await prisma.batchTaskRun.findFirst({ where: { id: batch.id } }); + expect(stillPending?.status).not.toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // A plain batch (batchTrigger, not batchTriggerAndWait) has no waitpoint, so completion + // hits `if (!waitpoint) return` after flipping the batch to COMPLETED. Drive a real run via + // engine.trigger with a batch but NO parent/resumeParentOnCompletion (so no waitpoint is created), + // run it to a final status, then complete the batch. 
+ containerTest( + "batch with no waitpoint still completes (plain batchTrigger)", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const childTask = "child-task"; + await setupBackgroundWorker(engine, environment, [childTask]); + + const batch = await prisma.batchTaskRun.create({ + data: { + friendlyId: generateFriendlyId("batch"), + runtimeEnvironmentId: environment.id, + runCount: 1, + processingJobsCount: 1, + }, + }); + + const run = await engine.trigger( + { + number: 1, + friendlyId: generateFriendlyId("run"), + environment, + taskIdentifier: childTask, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t-nowp-1", + spanId: "s-nowp-1", + workerQueue: "main", + queue: `task/${childTask}`, + isTest: false, + tags: [], + batch: { id: batch.id, index: 0 }, + }, + prisma + ); + + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_consumer", + workerQueue: "main", + }); + const match = dequeued.find((d) => d.run.id === run.id) ?? 
dequeued[0]; + assertNonNullable(match); + const attempt = await engine.startRunAttempt({ + runId: match.run.id, + snapshotId: match.snapshot.id, + }); + await engine.completeRunAttempt({ + runId: attempt.run.id, + snapshotId: attempt.snapshot.id, + completion: { + id: attempt.run.id, + ok: true, + output: '{"foo":"bar"}', + outputType: "application/json", + }, + }); + await setTimeout(500); + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + expect(countingStore.batchUpdates).toBeGreaterThan(0); + expect(countingStore.waitpointReads).toBeGreaterThan(0); + + const completedBatch = await prisma.batchTaskRun.findFirst({ where: { id: batch.id } }); + expect(completedBatch?.status).toBe("COMPLETED"); + + // Proves the `if (!waitpoint) return` branch: a plain batch has no waitpoint. + const waitpoint = await prisma.waitpoint.findFirst({ + where: { completedByBatchId: batch.id }, + }); + expect(waitpoint).toBeNull(); + } finally { + await engine.quit(); + } + } + ); +}); + +/** + * Two-store routing proof: a real RoutingRunStore over two distinct PostgresRunStores. Every + * #tryCompleteBatch read/write must resolve to the run-ops (new) store and never touch the legacy + * store. No mocks — both stores are genuine PostgresRunStore instances over real containers. + */ +describe("#tryCompleteBatch two-store routing", () => { + // A batch + members + waitpoint complete via the run-ops store only; the legacy store + // is never touched, and all members are discovered within the one owning store. 
+ containerTest( + "batch completion routes to the run-ops store only; the legacy store is untouched", + async ({ prisma, redisOptions }) => { + const newStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const legacyStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const routingStore = new RoutingRunStore({ + new: newStore, + legacy: legacyStore, + classify: () => "NEW", + }); + + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, routingStore)); + + try { + const { batch } = await driveBatchToAllChildrenComplete(engine, prisma, "run_batch_h"); + + const beforeLegacy = + legacyStore.batchReads + + legacyStore.memberRunReads + + legacyStore.batchUpdates + + legacyStore.waitpointReads; + + await engine.batchSystem.performCompleteBatch({ batchId: batch.id }); + + expect(newStore.batchReads).toBeGreaterThan(0); + expect(newStore.memberRunReads).toBeGreaterThan(0); + expect(newStore.batchUpdates).toBeGreaterThan(0); + expect(newStore.waitpointReads).toBeGreaterThan(0); + + expect( + legacyStore.batchReads + + legacyStore.memberRunReads + + legacyStore.batchUpdates + + legacyStore.waitpointReads + ).toBe(beforeLegacy); + + const completedBatch = await prisma.batchTaskRun.findFirst({ where: { id: batch.id } }); + expect(completedBatch?.status).toBe("COMPLETED"); + + const members = await prisma.taskRun.findMany({ where: { batchId: batch.id } }); + expect(members.length).toBe(2); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/systems/batchSystem.ts b/internal-packages/run-engine/src/engine/systems/batchSystem.ts index a3d44507a46..93534f3bf46 100644 --- a/internal-packages/run-engine/src/engine/systems/batchSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/batchSystem.ts @@ -23,7 +23,6 @@ export class BatchSystem { id: `tryCompleteBatch:${batchId}`, job: "tryCompleteBatch", payload: { batchId: batchId }, - //200ms in the 
future availableAt: new Date(Date.now() + 200), }); } @@ -38,20 +37,7 @@ export class BatchSystem { */ async #tryCompleteBatch({ batchId }: { batchId: string }) { return startSpan(this.$.tracer, "#tryCompleteBatch", async (span) => { - const batch = await this.$.prisma.batchTaskRun.findFirst({ - select: { - status: true, - runtimeEnvironmentId: true, - processingJobsCount: true, - runCount: true, - batchVersion: true, - successfulRunCount: true, - failedRunCount: true, - }, - where: { - id: batchId, - }, - }); + const batch = await this.$.runStore.findBatchTaskRunById(batchId, undefined, this.$.prisma); if (!batch) { this.$.logger.error("#tryCompleteBatch batch doesn't exist", { batchId }); @@ -103,21 +89,30 @@ export class BatchSystem { if (runs.every((r) => isFinalRunStatus(r.status))) { this.$.logger.debug("#tryCompleteBatch: All runs are completed", { batchId }); - await this.$.prisma.batchTaskRun.update({ - where: { - id: batchId, - }, - data: { - status: "COMPLETED", + await this.$.runStore.updateBatchTaskRun( + { + where: { + id: batchId, + }, + data: { + status: "COMPLETED", + }, + select: { + id: true, + }, }, - }); + this.$.prisma + ); //get waitpoint (if there is one) - const waitpoint = await this.$.prisma.waitpoint.findFirst({ - where: { - completedByBatchId: batchId, + const waitpoint = await this.$.runStore.findWaitpoint( + { + where: { + completedByBatchId: batchId, + }, }, - }); + this.$.prisma + ); if (!waitpoint) { this.$.logger.debug( diff --git a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts index b956a0f01aa..1b5884ba72e 100644 --- a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts @@ -51,7 +51,7 @@ export class CheckpointSystem { const prisma = tx ?? 
this.$.prisma; return await this.$.runLock.lock("createCheckpoint", [runId], async () => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); + const snapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); const isValidSnapshot = // Case 1: The provided snapshotId matches the current snapshot @@ -114,18 +114,12 @@ export class CheckpointSystem { }; } - // Get the run and update the status + // Get the run (run-ops scalars only) and update the status; the control-plane env is + // resolved separately so the run-ops DB can split without a cross-provider join. const run = await this.$.runStore.suspendForCheckpoint( runId, { - include: { - runtimeEnvironment: { - include: { - project: true, - organization: true, - }, - }, - }, + include: {}, }, this.$.prisma ); @@ -138,6 +132,17 @@ export class CheckpointSystem { throw new ServiceValidationError("Run not found", 404); } + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + this.$.logger.error("Environment not found for createCheckpoint", { + snapshot, + runtimeEnvironmentId: run.runtimeEnvironmentId, + }); + + throw new ServiceValidationError("Run not found", 404); + } + this.$.eventBus.emit("runStatusChanged", { time: new Date(), run: { @@ -149,34 +154,40 @@ export class CheckpointSystem { batchId: run.batchId, }, organization: { - id: run.runtimeEnvironment.organizationId, + id: env.organizationId, }, project: { - id: run.runtimeEnvironment.projectId, + id: env.projectId, }, environment: { - id: run.runtimeEnvironment.id, + id: env.id, }, }); - // Create the checkpoint - const taskRunCheckpoint = await prisma.taskRunCheckpoint.create({ - data: { - ...CheckpointId.generate(), - type: checkpoint.type, - location: checkpoint.location, - imageRef: checkpoint.imageRef, - reason: checkpoint.reason, - runtimeEnvironmentId: run.runtimeEnvironment.id, - projectId: run.runtimeEnvironment.projectId, + // Create the checkpoint through the 
run-ops store (routed by owning run id). When a caller + // supplied a tx distinct from the base client, pass it through so the write stays atomic with + // that transaction; otherwise the store resolves it on its own client (passthrough in single-DB). + const taskRunCheckpoint = await this.$.runStore.createTaskRunCheckpoint( + { + data: { + ...CheckpointId.generate(), + type: checkpoint.type, + location: checkpoint.location, + imageRef: checkpoint.imageRef, + reason: checkpoint.reason, + runtimeEnvironmentId: env.id, + projectId: env.projectId, + }, }, - }); + run.id, + tx ? prisma : undefined + ); if (snapshot.executionStatus === "QUEUED_EXECUTING") { // Enqueue the run again const newSnapshot = await this.enqueueSystem.enqueueRun({ run, - env: run.runtimeEnvironment, + env, snapshot: { status: "QUEUED", description: @@ -267,7 +278,7 @@ export class CheckpointSystem { const prisma = tx ?? this.$.prisma; return await this.$.runLock.lock("continueRunExecution", [runId], async () => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); + const snapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); if (snapshot.id !== snapshotId) { throw new ServiceValidationError( diff --git a/internal-packages/run-engine/src/engine/systems/debounceSystem.test.ts b/internal-packages/run-engine/src/engine/systems/debounceSystem.test.ts new file mode 100644 index 00000000000..f2aa7557fc2 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/debounceSystem.test.ts @@ -0,0 +1,717 @@ +import { containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import { trace } from "@internal/tracing"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; +import type { PrismaClient } from "@trigger.dev/database"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from 
"../tests/setup.js"; +import { createTestSnapshot } from "../tests/helpers/snapshotTestHelpers.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any, store?: PostgresRunStore) { + return { + prisma, + ...(store ? { store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + debounce: { + maxDebounceDurationMs: 60_000, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** + * A real PostgresRunStore subclass that counts the TaskRun read/write methods the debounce + * existing-run path routes through, so the routing can be observed over real containers + * without ever mocking prisma. super.* runs the genuine store implementation. + */ +class CountingPostgresRunStore extends PostgresRunStore { + public findRunCalls = 0; + public rewriteCalls = 0; + public latestSnapshotReads = 0; + public lastFindRunClients: unknown[] = []; + + override async findRun(...args: any[]): Promise { + this.findRunCalls++; + // The trailing arg is the resolved client (`client?: ReadClient`) when present. 
+ this.lastFindRunClients.push(args[args.length - 1]); + return (super.findRun as any)(...args); + } + + override async rewriteDebouncedRun( + ...args: Parameters + ): ReturnType { + this.rewriteCalls++; + return super.rewriteDebouncedRun(...args); + } + + override async findLatestExecutionSnapshot( + ...args: Parameters + ): ReturnType { + this.latestSnapshotReads++; + return super.findLatestExecutionSnapshot(...args); + } +} + +async function triggerDebouncedRun( + engine: RunEngine, + prisma: PrismaClient, + environment: Awaited>, + taskIdentifier: string, + opts: { + delay?: string; + delayUntilMs?: number; + debounce: { key: string; delay: string; mode?: "leading" | "trailing"; updateData?: any }; + payload?: string; + } +) { + const friendlyId = RunId.generate().friendlyId; + return engine.trigger( + { + number: 1, + friendlyId, + environment, + taskIdentifier, + payload: opts.payload ?? '{"data":"first"}', + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `t_${friendlyId}`, + spanId: `s_${friendlyId}`, + workerQueue: "main", + queue: "task/test-task", + isTest: false, + tags: [], + delayUntil: new Date(Date.now() + (opts.delayUntilMs ?? 10_000)), + debounce: opts.debounce, + }, + prisma + ); +} + +describe("debounceSystem store routing (single-DB passthrough)", () => { + // The existing-run fast-path skip routes its reads through the store. 
+ containerTest( + "existing-run fast-path skip routes its reads through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "fastpath-key", delay: "30s" }, + delayUntilMs: 30_000, + }); + expect(first.status).toBe("DELAYED"); + + const before = countingStore.findRunCalls; + + // A second trigger whose (quantized) delayUntil is not later than the existing + // one takes the fast-path skip and returns the same existing run. + const second = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "fastpath-key", delay: "5s" }, + delayUntilMs: 5_000, + }); + + // Same run is returned (debounced onto the existing run id). + expect(second.id).toBe(first.id); + // The probe + full-run reads went through the store. + expect(countingStore.findRunCalls).toBeGreaterThan(before); + } finally { + await engine.quit(); + } + } + ); + + // The existing-run locked reschedule routes the re-read + the snapshot through + // the store on the non-tx path (the snapshot read is this unit's one source edit: it passes + // `this.$.runStore` to getLatestExecutionSnapshot unconditionally). The public trigger path always + // supplies `tx: prisma`, so the non-tx path is driven by calling handleDebounce without a tx. 
+ containerTest( + "existing-run locked reschedule routes the re-read and snapshot through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "locked-key", delay: "10s" }, + delayUntilMs: 10_000, + }); + expect(first.status).toBe("DELAYED"); + + const firstRow = await prisma.taskRun.findFirstOrThrow({ where: { id: first.id } }); + const beforeFindRun = countingStore.findRunCalls; + const beforeSnapshot = countingStore.latestSnapshotReads; + + // Drive handleDebounce directly with no tx so the snapshot read routes through the + // store. A later delay pushes execution later, forcing the locked reschedule path + // (#handleExistingRunLocked) rather than the fast-path skip. + const result = await engine.debounceSystem.handleDebounce({ + environmentId: environment.id, + taskIdentifier, + debounce: { key: "locked-key", delay: "50s" }, + tx: undefined, + }); + + expect(result.status).toBe("existing"); + expect((result as any).run.id).toBe(first.id); + expect((result as any).run.status).toBe("DELAYED"); + + const rescheduledRow = await prisma.taskRun.findFirstOrThrow({ where: { id: first.id } }); + // The reschedule (delayedRunSystem) advanced delayUntil. + expect(rescheduledRow.delayUntil!.getTime()).toBeGreaterThan( + firstRow.delayUntil!.getTime() + ); + // The locked re-read went through the store. + expect(countingStore.findRunCalls).toBeGreaterThan(beforeFindRun); + // The snapshot read in #handleExistingRunLocked routes through the store on the + // non-tx path. 
+ expect(countingStore.latestSnapshotReads).toBeGreaterThan(beforeSnapshot); + } finally { + await engine.quit(); + } + } + ); + + // Even on the tx path the snapshot read routes through the store. + // getLatestExecutionSnapshot always passes this.$.runStore, so the read is routed to the + // OWNING DB (correct for split mode — a ksuid run's snapshot lives on the dedicated DB, not the + // caller's control-plane tx). Driving the locked reschedule inside a tx must still increment the + // counting store's snapshot-read counter. + containerTest( + "existing-run locked reschedule on the tx path still routes the snapshot read through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "tx-snapshot-key", delay: "10s" }, + delayUntilMs: 10_000, + }); + expect(first.status).toBe("DELAYED"); + + const firstRow = await prisma.taskRun.findFirstOrThrow({ where: { id: first.id } }); + const beforeSnapshot = countingStore.latestSnapshotReads; + + // Drive handleDebounce inside a transaction, passing the tx. A later delay (50s vs the + // existing 10s remaining) defeats the fast-path skip and reaches #handleExistingRunLocked, + // whose snapshot read still routes through the store (owning-DB resolution). 
+ const result = await prisma.$transaction(async (tx) => { + return await engine.debounceSystem.handleDebounce({ + environmentId: environment.id, + taskIdentifier, + debounce: { key: "tx-snapshot-key", delay: "50s" }, + tx: tx as any, + }); + }); + + expect(result.status).toBe("existing"); + expect((result as any).run.id).toBe(first.id); + expect((result as any).run.status).toBe("DELAYED"); + + const rescheduledRow = await prisma.taskRun.findFirstOrThrow({ where: { id: first.id } }); + // The reschedule advanced delayUntil. + expect(rescheduledRow.delayUntil!.getTime()).toBeGreaterThan( + firstRow.delayUntil!.getTime() + ); + // The snapshot read routed through the store (owning-DB resolution), not the caller tx. + expect(countingStore.latestSnapshotReads).toBeGreaterThan(beforeSnapshot); + } finally { + await engine.quit(); + } + } + ); + + // Snapshot-read catch branch: when the snapshot read throws (run/snapshot gone), + // #handleExistingRunLocked clears the stale Redis key and claims a fresh one, returning + // status "new" instead of "existing". + containerTest( + "snapshot read failure clears the stale key and returns new", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "snapshot-throw-key", delay: "10s" }, + delayUntilMs: 10_000, + }); + expect(first.status).toBe("DELAYED"); + + // Remove the run's snapshot rows so getLatestExecutionSnapshot throws + // ("No execution snapshot found"), driving the catch branch. 
+ await prisma.taskRunExecutionSnapshot.deleteMany({ where: { runId: first.id } }); + + // A later delay defeats the fast-path skip and reaches #handleExistingRunLocked, whose + // snapshot read now throws. + const result = await engine.debounceSystem.handleDebounce({ + environmentId: environment.id, + taskIdentifier, + debounce: { key: "snapshot-throw-key", delay: "50s" }, + tx: undefined, + }); + + // The stale existing run was abandoned; a fresh key was claimed for a new run. + expect(result.status).toBe("new"); + expect((result as any).claimId).toBeDefined(); + } finally { + await engine.quit(); + } + } + ); + + // Non-DELAYED snapshot branch: when the latest snapshot's executionStatus is + // neither DELAYED nor RUN_CREATED, #handleExistingRunLocked clears the Redis key and claims + // a fresh one, returning status "new". + containerTest( + "non-delayed snapshot status clears the key and returns new", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "non-delayed-key", delay: "10s" }, + delayUntilMs: 10_000, + }); + expect(first.status).toBe("DELAYED"); + + // Insert a newer valid snapshot whose executionStatus is EXECUTING. The snapshot read + // (orderBy createdAt desc, isValid) now resolves a non-DELAYED/non-RUN_CREATED status. 
+ await createTestSnapshot(prisma, { + runId: first.id, + status: "EXECUTING", + environmentId: environment.id, + environmentType: environment.type, + projectId: environment.project.id, + organizationId: environment.organization.id, + }); + + // A later delay defeats the fast-path skip and reaches #handleExistingRunLocked, whose + // snapshot status check now falls through to the claim-new path. + const result = await engine.debounceSystem.handleDebounce({ + environmentId: environment.id, + taskIdentifier, + debounce: { key: "non-delayed-key", delay: "50s" }, + tx: undefined, + }); + + expect(result.status).toBe("new"); + expect((result as any).claimId).toBeDefined(); + } finally { + await engine.quit(); + } + } + ); + + // The trailing-mode update routes through the store via rewriteDebouncedRun. + containerTest( + "trailing-mode update routes through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "trailing-key", delay: "10s", mode: "trailing" }, + delayUntilMs: 10_000, + payload: '{"data":"first"}', + }); + expect(first.status).toBe("DELAYED"); + + const before = countingStore.rewriteCalls; + + const second = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { + key: "trailing-key", + delay: "10s", + mode: "trailing", + updateData: { payload: '{"data":"updated"}', payloadType: "application/json" }, + }, + delayUntilMs: 10_000, + payload: '{"data":"updated"}', + }); + + expect(second.id).toBe(first.id); + // The trailing update went through 
rewriteDebouncedRun. + expect(countingStore.rewriteCalls).toBeGreaterThan(before); + + const updatedRow = await prisma.taskRun.findFirstOrThrow({ where: { id: first.id } }); + expect(updatedRow.payload).toBe('{"data":"updated"}'); + } finally { + await engine.quit(); + } + } + ); + + // The lock-contention fallback routes its read through the store. A + // LockAcquisitionTimeoutError-shaped failure from runLock drives + // #handleLockContentionFallback, which reads the existing run via the store. + containerTest( + "lock-contention fallback routes its read through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "contention-key", delay: "10s" }, + delayUntilMs: 10_000, + }); + expect(first.status).toBe("DELAYED"); + + // Force a lock-contention error from runLock so handleExistingRun falls back. + const runLock = (engine as any).runLock; + const originalLock = runLock.lock.bind(runLock); + runLock.lock = async (..._args: any[]) => { + const err = new Error("simulated lock contention"); + err.name = "LockAcquisitionTimeoutError"; + throw err; + }; + + const before = countingStore.findRunCalls; + let result: any; + try { + // Drive handleDebounce directly: fast-path is disabled by the later delay so the + // path reaches the lock, which throws, triggering the contention fallback read. 
+ result = await engine.debounceSystem.handleDebounce({ + environmentId: environment.id, + taskIdentifier, + debounce: { key: "contention-key", delay: "50s" }, + tx: undefined, + }); + } finally { + runLock.lock = originalLock; + } + + expect(result.status).toBe("existing"); + expect(result.run.id).toBe(first.id); + // The fallback read went through the store. + expect(countingStore.findRunCalls).toBeGreaterThan(before); + } finally { + await engine.quit(); + } + } + ); + + // A caller-supplied tx is threaded through findRun's `client?` arg and honored, + // not re-selected by the store. + containerTest( + "tx path is threaded straight through to the store read (honored, not re-routed)", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "tx-key", delay: "10s" }, + delayUntilMs: 10_000, + }); + expect(first.status).toBe("DELAYED"); + + await prisma.$transaction(async (tx) => { + countingStore.lastFindRunClients = []; + const result = await engine.debounceSystem.handleDebounce({ + environmentId: environment.id, + taskIdentifier, + debounce: { key: "tx-key", delay: "5s" }, + tx: tx as any, + }); + + expect(result.status).toBe("existing"); + // Every store read in this call executed on the supplied tx client, not a + // re-routed one. 
+ expect(countingStore.lastFindRunClients.length).toBeGreaterThan(0); + for (const client of countingStore.lastFindRunClients) { + expect(client).toBe(tx); + } + }); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB binds one client (passthrough) — proven by behavior, not by reaching + // into a private prisma member. The routed read returns exactly the row just written. + containerTest( + "single-DB binds one client (passthrough) — debounce round-trip on one client", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const first = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "passthru-key", delay: "10s" }, + delayUntilMs: 10_000, + }); + expect(first.status).toBe("DELAYED"); + + // Push the run later through the locked reschedule path, then read it back through + // the default-store engine — it resolves on the one client to exactly the row just + // rescheduled. 
+ const second = await triggerDebouncedRun(engine, prisma, environment, taskIdentifier, { + debounce: { key: "passthru-key", delay: "50s" }, + delayUntilMs: 50_000, + }); + expect(second.id).toBe(first.id); + + const routed = await engine.runStore.findRun( + { id: first.id }, + { include: { associatedWaitpoint: true } }, + prisma + ); + const persisted = await prisma.taskRun.findFirstOrThrow({ where: { id: first.id } }); + + expect(routed).not.toBeNull(); + expect(routed!.id).toBe(persisted.id); + expect(routed!.delayUntil!.getTime()).toBe(persisted.delayUntil!.getTime()); + } finally { + await engine.quit(); + } + } + ); +}); + +// --- Cross-version read-through proof --- + +/** + * A real, minimal two-store router over two PostgresRunStore instances, selecting by owning + * run id. Never a mock: it only re-implements the by-run-id route selection the production + * RoutingRunStore performs, delegating to genuine stores over real containers. We know which + * runs live where because we seed each run on exactly one store. + */ +class TwoStoreRunRouter { + readonly newStore: PostgresRunStore; + readonly legacyStore: PostgresRunStore; + readonly #newRunIds: Set; + + constructor( + newStore: PostgresRunStore, + legacyStore: PostgresRunStore, + newRunIds: Iterable + ) { + this.newStore = newStore; + this.legacyStore = legacyStore; + this.#newRunIds = new Set(newRunIds); + } + + #route(runId: string): PostgresRunStore { + return this.#newRunIds.has(runId) ? 
this.newStore : this.legacyStore; + } + + findRun(where: any, ...rest: any[]) { + return (this.#route(where.id).findRun as any)(where, ...rest); + } +} + +async function seedEnvironment(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${suffix}`, + pkApiKey: `pk_dev_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +async function seedDelayedRunWithWaitpoint( + prisma: PrismaClient, + store: PostgresRunStore, + suffix: string, + runId: string +) { + const env = await seedEnvironment(prisma, suffix); + const delayUntil = new Date("2024-06-01T00:00:00.000Z"); + const createdAt = new Date("2024-01-01T00:00:00.000Z"); + + await store.createRun({ + data: { + id: runId, + engine: "V2", + status: "DELAYED", + friendlyId: `run_friendly_${suffix}`, + runtimeEnvironmentId: env.environment.id, + environmentType: "DEVELOPMENT", + organizationId: env.organization.id, + projectId: env.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `trace_${suffix}`, + spanId: `span_${suffix}`, + runTags: ["tag-a", "tag-b"], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + delayUntil, + createdAt, + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "DELAYED", + environmentId: env.environment.id, + environmentType: "DEVELOPMENT", + projectId: env.project.id, + 
organizationId: env.organization.id, + }, + } as any); + + // Attach an associated waitpoint so the `{ include: { associatedWaitpoint } }` read is exercised. + await prisma.waitpoint.create({ + data: { + type: "RUN", + status: "PENDING", + friendlyId: `waitpoint_${suffix}`, + idempotencyKey: `wp_idem_${suffix}`, + userProvidedIdempotencyKey: false, + environmentId: env.environment.id, + projectId: env.project.id, + completedByTaskRunId: runId, + }, + }); + + return { env, delayUntil, createdAt }; +} + +describe("debounceSystem store routing (cross-version read-through)", () => { + // An existing-run read round-trips deep-equal across PG14/PG17, routed by owning + // run id (NEW=PG17 resolved, LEGACY=PG14 untouched for a NEW run). + heteroPostgresTest( + "existing-run read round-trips across versions, routed by run id", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const rNew = "run_new_k"; + const rOld = "run_old_k"; + const seededNew = await seedDelayedRunWithWaitpoint(prisma17 as any, newStore, "new_k", rNew); + const seededOld = await seedDelayedRunWithWaitpoint( + prisma14 as any, + legacyStore, + "old_k", + rOld + ); + + const router = new TwoStoreRunRouter(newStore, legacyStore, [rNew]); + + const newRun = await (router.findRun as any)( + { id: rNew }, + { include: { associatedWaitpoint: true } }, + prisma17 + ); + const oldRun = await (router.findRun as any)( + { id: rOld }, + { include: { associatedWaitpoint: true } }, + prisma14 + ); + + // Routed by run id to the correct store; legacy untouched for the NEW run. 
+ const legacyRowForNew = await prisma14.taskRun.findFirst({ where: { id: rNew } }); + expect(legacyRowForNew).toBeNull(); + + expect(newRun!.id).toBe(rNew); + expect(oldRun!.id).toBe(rOld); + + // The read shape is identical across versions: status, delayUntil, createdAt, + // runTags array, and the associatedWaitpoint include. + expect(newRun!.status).toBe(oldRun!.status); + expect(newRun!.status).toBe("DELAYED"); + expect(newRun!.delayUntil!.getTime()).toBe(seededNew.delayUntil.getTime()); + expect(oldRun!.delayUntil!.getTime()).toBe(seededOld.delayUntil.getTime()); + expect(newRun!.createdAt.getTime()).toBe(seededNew.createdAt.getTime()); + expect(newRun!.runTags).toEqual(oldRun!.runTags); + expect(newRun!.runTags).toEqual(["tag-a", "tag-b"]); + expect(newRun!.associatedWaitpoint).not.toBeNull(); + expect(oldRun!.associatedWaitpoint).not.toBeNull(); + expect(newRun!.associatedWaitpoint!.type).toBe(oldRun!.associatedWaitpoint!.type); + expect(newRun!.associatedWaitpoint!.completedByTaskRunId).toBe(rNew); + expect(oldRun!.associatedWaitpoint!.completedByTaskRunId).toBe(rOld); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/systems/debounceSystem.ts b/internal-packages/run-engine/src/engine/systems/debounceSystem.ts index bf4b3e68bb4..bf733d81f6d 100644 --- a/internal-packages/run-engine/src/engine/systems/debounceSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/debounceSystem.ts @@ -742,7 +742,7 @@ return 0 // Get the latest execution snapshot let snapshot; try { - snapshot = await getLatestExecutionSnapshot(prisma, existingRunId); + snapshot = await getLatestExecutionSnapshot(prisma, existingRunId, this.$.runStore); } catch (error) { // Run no longer exists or has no snapshot this.$.logger.debug("handleExistingRun: existing run not found or has no snapshot", { diff --git a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.test.ts b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.test.ts new file 
mode 100644 index 00000000000..f02efd7f30d --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.test.ts @@ -0,0 +1,595 @@ +import { + containerTest, + assertNonNullable, + heteroPostgresTest, + network, + redisContainer, + redisOptions, +} from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { expect } from "vitest"; +import { PostgresRunStore } from "@internal/run-store"; +import type { RunStore } from "@internal/run-store"; +import { PrismaClient } from "@trigger.dev/database"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; + +// heteroPostgresTest provides two postgres clients but no redis; the engine needs +// redis. Compose a fixture that adds a per-test redis container + options. +const heteroEngineTest = heteroPostgresTest.extend<{ + network: any; + redisContainer: any; + redisOptions: any; +}>({ + network, + redisContainer, + redisOptions, +}); + +vi.setConfig({ testTimeout: 60_000 }); + +// A real PostgresRunStore subclass (no mocks) that counts how many times each +// routed method is invoked, then delegates to the real Prisma-backed +// implementation. The counters let a test prove that a code path resolved its +// reads/writes through the owning store rather than going straight to Prisma. 
+class CountingPostgresRunStore extends PostgresRunStore { + public calls = { + rescheduleRun: 0, + enqueueDelayedRun: 0, + expireRun: 0, + expireRunsBatch: 0, + findRun: 0, + findRuns: 0, + findLatestExecutionSnapshot: 0, + forWaitpointCompletion: 0, + }; + + override rescheduleRun(...args: Parameters) { + this.calls.rescheduleRun++; + return super.rescheduleRun(...args); + } + + override enqueueDelayedRun(...args: Parameters) { + this.calls.enqueueDelayedRun++; + return super.enqueueDelayedRun(...args); + } + + // expireRun is generic over the select payload; keep the loose arg list so the + // override still satisfies the generic interface signature. + override expireRun(...args: any[]): any { + this.calls.expireRun++; + return super.expireRun(...(args as [any, any, any, any])); + } + + override expireRunsBatch(...args: Parameters) { + this.calls.expireRunsBatch++; + return super.expireRunsBatch(...args); + } + + // findRun has three overloads; accept the loose arg list and forward verbatim. + override findRun(...args: any[]): any { + this.calls.findRun++; + return super.findRun(...(args as [any, any, any])); + } + + override findRuns(...args: any[]): any { + this.calls.findRuns++; + return super.findRuns(...(args as [any, any])); + } + + override findLatestExecutionSnapshot( + ...args: Parameters + ) { + this.calls.findLatestExecutionSnapshot++; + return super.findLatestExecutionSnapshot(...args); + } + + override forWaitpointCompletion(...args: Parameters) { + this.calls.forWaitpointCompletion++; + return super.forWaitpointCompletion(...args); + } +} + +function createEngine( + prisma: PrismaClient, + redisOptions: any, + store?: RunStore, + extraQueueOptions?: Record +) { + return new RunEngine({ + prisma, + ...(store ? 
{ store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + ...extraQueueOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); +} + +const triggerDefaults = { + payload: "{}", + payloadType: "application/json" as const, + context: {}, + traceContext: {}, + isTest: false, + tags: [] as string[], + workerQueue: "main", +}; + +describe("DelayedRunSystem store routing", () => { + containerTest( + "rescheduleDelayedRun routes snapshot read + reschedule write through the store", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = createEngine(prisma, redisOptions, store); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + ...triggerDefaults, + number: 1, + friendlyId: "run_resched1", + environment: authenticatedEnvironment, + taskIdentifier, + traceId: "t_resched", + spanId: "s_resched", + queue: "task/test-task", + delayUntil: new Date(Date.now() + 400), + }, + prisma + ); + + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("DELAYED"); + + // Reset counters so we only measure the rescheduleDelayedRun path. 
+ store.calls.findLatestExecutionSnapshot = 0; + store.calls.rescheduleRun = 0; + + const rescheduleTo = new Date(Date.now() + 5_000); + const updatedRun = await engine.rescheduleDelayedRun({ + runId: run.id, + delayUntil: rescheduleTo, + }); + expect(updatedRun.delayUntil?.toISOString()).toBe(rescheduleTo.toISOString()); + + // The snapshot read routed through the owning store (this is the unit's edit), + // and the reschedule write routed through the store too. + expect(store.calls.findLatestExecutionSnapshot).toBeGreaterThanOrEqual(1); + expect(store.calls.rescheduleRun).toBeGreaterThanOrEqual(1); + + // Persisted state: delayUntil updated and a fresh DELAYED snapshot row exists. + const persisted = await prisma.taskRun.findFirstOrThrow({ where: { id: run.id } }); + expect(persisted.delayUntil?.toISOString()).toBe(rescheduleTo.toISOString()); + + const delayedSnapshots = await prisma.taskRunExecutionSnapshot.findMany({ + where: { runId: run.id, executionStatus: "DELAYED" }, + }); + // Two DELAYED snapshots: the trigger-time one and the reschedule one. 
+ expect(delayedSnapshots.length).toBeGreaterThanOrEqual(2); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "rescheduleDelayedRun with a caller tx still routes the snapshot read through the store", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = createEngine(prisma, redisOptions, store); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + ...triggerDefaults, + number: 1, + friendlyId: "run_reschedtx", + environment: authenticatedEnvironment, + taskIdentifier, + traceId: "t_reschedtx", + spanId: "s_reschedtx", + queue: "task/test-task", + delayUntil: new Date(Date.now() + 400), + }, + prisma + ); + + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("DELAYED"); + + // Reset counters so we only measure the rescheduleDelayedRun path. + store.calls.findLatestExecutionSnapshot = 0; + store.calls.rescheduleRun = 0; + + // Pass a caller tx: the snapshot read must still route through the store (owning-DB + // resolution), not read the caller's control-plane tx directly. + const rescheduleTo = new Date(Date.now() + 5_000); + const updatedRun = await engine.rescheduleDelayedRun({ + runId: run.id, + delayUntil: rescheduleTo, + tx: prisma, + }); + + // The snapshot read routed through the store (owning-DB resolution), not the caller tx. + expect(store.calls.findLatestExecutionSnapshot).toBeGreaterThanOrEqual(1); + + // The reschedule still succeeded and persisted. 
+ expect(updatedRun.delayUntil?.toISOString()).toBe(rescheduleTo.toISOString()); + + const persisted = await prisma.taskRun.findFirstOrThrow({ where: { id: run.id } }); + expect(persisted.delayUntil?.toISOString()).toBe(rescheduleTo.toISOString()); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "enqueueDelayedRun routes snapshot read + findRun + enqueue write through the store", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = createEngine(prisma, redisOptions, store); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + // Long delay so the background worker job never races our direct call. + const run = await engine.trigger( + { + ...triggerDefaults, + number: 1, + friendlyId: "run_enq1", + environment: authenticatedEnvironment, + taskIdentifier, + traceId: "t_enq", + spanId: "s_enq", + queue: "task/test-task", + delayUntil: new Date(Date.now() + 60_000), + }, + prisma + ); + + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("DELAYED"); + + // Reset counters so we only measure the enqueueDelayedRun path. + store.calls.findLatestExecutionSnapshot = 0; + store.calls.findRun = 0; + store.calls.enqueueDelayedRun = 0; + + // Drive enqueue directly so timing is deterministic. The run's delayUntil + // is in the future, so first move it to the past to allow enqueue. 
+ await prisma.taskRun.update({ + where: { id: run.id }, + data: { delayUntil: new Date(Date.now() - 1_000) }, + }); + + await engine.delayedRunSystem.enqueueDelayedRun({ runId: run.id }); + + expect(store.calls.findLatestExecutionSnapshot).toBeGreaterThanOrEqual(1); + expect(store.calls.findRun).toBeGreaterThanOrEqual(1); + expect(store.calls.enqueueDelayedRun).toBeGreaterThanOrEqual(1); + + const persisted = await prisma.taskRun.findFirstOrThrow({ where: { id: run.id } }); + expect(persisted.status).toBe("PENDING"); + + const queuedSnapshots = await prisma.taskRunExecutionSnapshot.findMany({ + where: { runId: run.id, executionStatus: "QUEUED" }, + }); + expect(queuedSnapshots.length).toBeGreaterThanOrEqual(1); + } finally { + await engine.quit(); + } + } + ); + + containerTest("single-DB binds one client (passthrough)", async ({ prisma, redisOptions }) => { + // No custom store: the engine builds a default PostgresRunStore over the one + // prisma client. A reschedule + enqueue round-trip must land on that one DB. 
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = createEngine(prisma, redisOptions); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + ...triggerDefaults, + number: 1, + friendlyId: "run_pass1", + environment: authenticatedEnvironment, + taskIdentifier, + traceId: "t_pass", + spanId: "s_pass", + queue: "task/test-task", + delayUntil: new Date(Date.now() + 60_000), + }, + prisma + ); + + const rescheduleTo = new Date(Date.now() + 90_000); + await engine.rescheduleDelayedRun({ runId: run.id, delayUntil: rescheduleTo }); + + const afterReschedule = await prisma.taskRun.findFirstOrThrow({ where: { id: run.id } }); + expect(afterReschedule.delayUntil?.toISOString()).toBe(rescheduleTo.toISOString()); + + // Move delay to the past, then enqueue and confirm the transition persisted. + await prisma.taskRun.update({ + where: { id: run.id }, + data: { delayUntil: new Date(Date.now() - 1_000) }, + }); + await engine.delayedRunSystem.enqueueDelayedRun({ runId: run.id }); + + const afterEnqueue = await prisma.taskRun.findFirstOrThrow({ where: { id: run.id } }); + expect(afterEnqueue.status).toBe("PENDING"); + } finally { + await engine.quit(); + } + }); + + heteroEngineTest( + "far-future delayed run fires post-migration on the same worker, NO re-arm", + async ({ prisma14, prisma17, redisOptions }) => { + // A delayed run is born on the LEGACY DB (PG14) with a far-future delayUntil. + // A straggler migration copies the run row and its latest snapshot onto the + // NEW DB (PG17) and flips the residency map, WITHOUT re-arming any worker job. + // The unchanged enqueueDelayedRun handler must resolve its reads/writes to the + // NEW store (PG17), leaving the LEGACY copy untouched. 
+ + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + + // Residency table keyed by runId -> owning store. A real two-store router that + // delegates every RunStore method to the resolved store. Methods with a runId + // first arg resolve by it; bulk reads resolve by the first id; methods without a + // run id fall back to the default store. + const residency = new Map(); + const router = createRouter(residency, legacyStore); + + // Seed env/worker/task on BOTH databases (control-plane resolver reads env via + // the engine's prisma; the run-ops rows live on whichever store owns them). + const env14 = await setupAuthenticatedEnvironment(prisma14, "PRODUCTION"); + const env17 = await setupAuthenticatedEnvironment(prisma17, "PRODUCTION"); + + // The engine's prisma is PG17 (control plane / env resolution). Run-ops route + // through the router. + const engine = createEngine(prisma17, redisOptions, router); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, env17, taskIdentifier); + // Mirror a background worker + queue on PG14 so the legacy create is valid. + await setupBackgroundWorker(engine14Proxy(engine, prisma14), env14, taskIdentifier); + + // Create a DELAYED run + latest snapshot directly on LEGACY (PG14) via the + // legacy store, with a far-future delayUntil (+1 year). Residency: LEGACY. 
+ const farFuture = new Date(Date.now() + 365 * 24 * 60 * 60 * 1000); + const created = await legacyStore.createRun( + { + data: { + id: "run_hetero_delayed", + engine: "V2", + status: "DELAYED", + friendlyId: "run_hd1", + runtimeEnvironmentId: env14.id, + environmentType: "PRODUCTION", + organizationId: env14.organization.id, + projectId: env14.project.id, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: "t_hd", + spanId: "s_hd", + queue: "task/test-task", + workerQueue: "main", + isTest: false, + delayUntil: farFuture, + }, + snapshot: { + engine: "V2", + executionStatus: "DELAYED", + description: "Run was created with a delay", + runStatus: "DELAYED", + environmentId: env14.id, + environmentType: "PRODUCTION", + projectId: env14.project.id, + organizationId: env14.organization.id, + }, + }, + prisma14 + ); + residency.set(created.id, legacyStore); + + // Sanity: the run is DELAYED on LEGACY and absent on NEW. + const legacyBefore = await prisma14.taskRun.findUnique({ where: { id: created.id } }); + expect(legacyBefore?.status).toBe("DELAYED"); + const newBefore = await prisma17.taskRun.findUnique({ where: { id: created.id } }); + expect(newBefore).toBeNull(); + + // Simulate the straggler migration: copy the run row + its latest snapshot + // onto NEW (PG17) and flip the residency map to NEW. NO worker re-arm. 
+ const latestSnapshot = await prisma14.taskRunExecutionSnapshot.findFirstOrThrow({ + where: { runId: created.id }, + orderBy: { createdAt: "desc" }, + }); + + await prisma17.taskRun.create({ + data: { + id: created.id, + engine: "V2", + status: "DELAYED", + friendlyId: created.friendlyId, + runtimeEnvironmentId: env17.id, + environmentType: "PRODUCTION", + organizationId: env17.organization.id, + projectId: env17.project.id, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: created.traceId, + spanId: created.spanId, + queue: "task/test-task", + workerQueue: "main", + isTest: false, + // Move the deadline to the past on NEW so the (unchanged) handler enqueues. + delayUntil: new Date(Date.now() - 1_000), + }, + }); + + await prisma17.taskRunExecutionSnapshot.create({ + data: { + engine: "V2", + executionStatus: latestSnapshot.executionStatus, + description: latestSnapshot.description, + runId: created.id, + runStatus: latestSnapshot.runStatus, + environmentId: env17.id, + environmentType: "PRODUCTION", + projectId: env17.project.id, + organizationId: env17.organization.id, + isValid: true, + }, + }); + + residency.set(created.id, newStore); + + // Fire the UNCHANGED handler. The router resolves the run to NEW. + await engine.delayedRunSystem.enqueueDelayedRun({ runId: created.id }); + + // Reads/writes resolved to NEW: status is now PENDING on PG17. + const newAfter = await prisma17.taskRun.findUnique({ where: { id: created.id } }); + expect(newAfter?.status).toBe("PENDING"); + + // LEGACY copy is untouched (still DELAYED with the far-future deadline). 
+ const legacyAfter = await prisma14.taskRun.findUnique({ where: { id: created.id } }); + expect(legacyAfter?.status).toBe("DELAYED"); + expect(legacyAfter?.delayUntil?.toISOString()).toBe(farFuture.toISOString()); + } finally { + await engine.quit(); + } + } + ); +}); + +// A minimal two-store router implementing RunStore by delegating each method to the +// store that OWNS the run id. Reads/writes that carry a runId first arg (or a bulk id +// list) resolve by residency; methods without a run id use the default store. This is +// a real router over real PostgresRunStores — no mocking. +export function createRouter(residency: Map, defaultStore: RunStore): RunStore { + const resolveById = (runId: string): RunStore => residency.get(runId) ?? defaultStore; + + const handler: ProxyHandler = { + get(_target, prop: string | symbol) { + switch (prop) { + // runId is the first positional arg. + case "rescheduleRun": + case "enqueueDelayedRun": + case "expireRun": + case "findLatestExecutionSnapshot": + case "startAttempt": + case "completeAttemptSuccess": + case "recordRetryOutcome": + case "requeueRun": + case "recordBulkActionMembership": + case "cancelRun": + case "failRunPermanently": + case "lockRunToWorker": + case "parkPendingVersion": + case "promotePendingVersionRuns": + case "suspendForCheckpoint": + case "resumeFromCheckpoint": + case "rewriteDebouncedRun": + case "updateMetadata": + case "pushTags": + case "pushRealtimeStream": + case "findSnapshotCompletedWaitpointIds": + return (...args: any[]) => (resolveById(args[0]) as any)[prop](...args); + + // findRun(where, ...) — resolve by where.id when present. + case "findRun": + case "findRunOrThrow": + return (...args: any[]) => { + const where = args[0]; + const id = where && typeof where.id === "string" ? where.id : undefined; + const store = id ? resolveById(id) : defaultStore; + return (store as any)[prop](...args); + }; + + // expireRunsBatch(runIds, ...) — resolve by the first id. 
+ case "expireRunsBatch": + return (...args: any[]) => { + const runIds: string[] = args[0] ?? []; + const store = runIds.length > 0 ? resolveById(runIds[0]) : defaultStore; + return (store as any)[prop](...args); + }; + + // findRuns({ where: { id: { in: [...] } } }) — resolve by the first id. + case "findRuns": + return (...args: any[]) => { + const inList = args[0]?.where?.id?.in as string[] | undefined; + const store = inList && inList.length > 0 ? resolveById(inList[0]) : defaultStore; + return (store as any)[prop](...args); + }; + + default: + // Everything else (createRun, waitpoint family, forWaitpointCompletion, + // batch, checkpoint, attempt, dependency reads) goes to the default store. + return (...args: any[]) => (defaultStore as any)[prop](...args); + } + }, + }; + + return new Proxy({} as RunStore, handler); +} + +// Tiny shim so setupBackgroundWorker can write its rows on the PG14 prisma using the +// engine's other facilities. setupBackgroundWorker only touches engine.prisma and +// engine.runQueue; we pass a view of the engine whose `prisma` is PG14. +function engine14Proxy(engine: RunEngine, prisma14: PrismaClient): RunEngine { + return new Proxy(engine, { + get(target, prop, receiver) { + if (prop === "prisma") return prisma14; + const value = Reflect.get(target, prop, receiver); + return typeof value === "function" ? 
value.bind(target) : value; + }, + }) as RunEngine; +} diff --git a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts index cd22895429c..bc08a654840 100644 --- a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts @@ -38,7 +38,7 @@ export class DelayedRunSystem { "rescheduleDelayedRun", async () => { return await this.$.runLock.lock("rescheduleDelayedRun", [runId], async () => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); + const snapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); // Check if the run is still in DELAYED status (or legacy RUN_CREATED for older runs) if ( @@ -100,7 +100,7 @@ export class DelayedRunSystem { return await this.$.runLock.lock("enqueueDelayedRun", [runId], async () => { // Check if run is still in DELAYED status before enqueuing // This prevents a race where debounce reschedules the run while we're about to enqueue it - const snapshot = await getLatestExecutionSnapshot(this.$.prisma, runId); + const snapshot = await getLatestExecutionSnapshot(this.$.prisma, runId, this.$.runStore); if (snapshot.executionStatus !== "DELAYED" && snapshot.executionStatus !== "RUN_CREATED") { this.$.logger.debug("enqueueDelayedRun: run is no longer delayed, skipping enqueue", { @@ -110,25 +110,20 @@ export class DelayedRunSystem { return; } - const run = await this.$.runStore.findRun( - { id: runId }, - { - include: { - runtimeEnvironment: { - include: { - project: true, - organization: true, - }, - }, - }, - }, - this.$.prisma - ); + // Read run-ops scalars only; resolve the control-plane env separately so the run-ops DB can + // split without a cross-provider join. 
+ const run = await this.$.runStore.findRun({ id: runId }, this.$.prisma); if (!run) { throw new Error(`#enqueueDelayedRun: run not found: ${runId}`); } + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + throw new Error(`#enqueueDelayedRun: environment not found for run: ${runId}`); + } + // Check if delayUntil has been rescheduled to the future (e.g., by debounce) // If so, don't enqueue - the rescheduled worker job will handle it if (run.delayUntil && run.delayUntil > new Date()) { @@ -146,7 +141,7 @@ export class DelayedRunSystem { // For DEV environments where the dev CLI may not be running, fast-pathed // runs can sit on the worker queue indefinitely. Keep the legacy per-run // expireRun job armed for DEV so those runs still expire. - if (run.ttl && run.runtimeEnvironment.type === "DEVELOPMENT") { + if (run.ttl && env.type === "DEVELOPMENT") { const expireAt = parseNaturalLanguageDuration(run.ttl); if (expireAt) { await this.$.worker.enqueue({ @@ -158,7 +153,6 @@ export class DelayedRunSystem { } } - // Now we need to enqueue the run into the RunQueue // Skip the lock in enqueueRun since we already hold it. // includeTtl: true so the run's TTL is armed from the moment it enters // the queue (not from taskRun.createdAt). The TTL system tracks runs @@ -166,7 +160,7 @@ export class DelayedRunSystem { // enqueued here, so this is the correct point to arm TTL. await this.enqueueSystem.enqueueRun({ run, - env: run.runtimeEnvironment, + env, batchId: run.batchId ?? 
undefined, skipRunLock: true, includeTtl: true, @@ -194,10 +188,10 @@ export class DelayedRunSystem { batchId: updatedRun.batchId, }, organization: { - id: run.runtimeEnvironment.organizationId, + id: env.organizationId, }, project: { - id: run.runtimeEnvironment.projectId, + id: env.projectId, }, environment: { id: run.runtimeEnvironmentId, diff --git a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts index 8791dc1bd12..3470f70a9b7 100644 --- a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts @@ -9,10 +9,10 @@ import { BackgroundWorkerTask, Prisma, PrismaClientOrTransaction, + RuntimeEnvironmentType, TaskQueue, WorkerDeployment, } from "@trigger.dev/database"; -import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic"; import { sendNotificationToWorker } from "../eventBus.js"; import { getMachinePreset } from "../machinePresets.js"; @@ -30,16 +30,26 @@ export type DequeueSystemOptions = { billingCache: BillingCache; }; -type RunWithMininimalEnvironment = Prisma.TaskRunGetPayload<{ - include: { - runtimeEnvironment: { - select: { - id: true; - type: true; - }; - }; - }; -}>; +// Run-ops scalars the dequeue path reads off the run row. The environment half (type, +// archivedAt) is resolved separately via the controlPlaneResolver so the run-ops DB can +// split without a cross-provider join. 
+const dequeueRunSelect = { + id: true, + taskIdentifier: true, + lockedToVersionId: true, + lockedQueueId: true, + queue: true, + projectId: true, + runtimeEnvironmentId: true, + maxAttempts: true, + startedAt: true, + maxDurationInSeconds: true, + lockedRetryConfig: true, + attemptNumber: true, + machinePreset: true, +} satisfies Prisma.TaskRunSelect; + +type RunWithDequeueScalars = Prisma.TaskRunGetPayload<{ select: typeof dequeueRunSelect }>; type RunWithBackgroundWorkerTasksResult = | { @@ -57,7 +67,8 @@ type RunWithBackgroundWorkerTasksResult = | "QUEUE_NOT_FOUND" | "RUN_ENVIRONMENT_ARCHIVED"; message: string; - run: RunWithMininimalEnvironment; + run: RunWithDequeueScalars; + environmentType: RuntimeEnvironmentType; } | { success: false; @@ -67,11 +78,13 @@ type RunWithBackgroundWorkerTasksResult = expected: string; received: string; }; - run: RunWithMininimalEnvironment; + run: RunWithDequeueScalars; + environmentType: RuntimeEnvironmentType; } | { success: true; - run: RunWithMininimalEnvironment; + run: RunWithDequeueScalars; + environmentType: RuntimeEnvironmentType; worker: BackgroundWorker; task: BackgroundWorkerTask; queue: TaskQueue; @@ -127,7 +140,6 @@ export class DequeueSystem { this.$.tracer, "dequeueFromWorkerQueue", async (span) => { - //gets multiple runs from the queue const message = await this.$.runQueue.dequeueMessageFromWorkerQueue( consumerId, workerQueue, @@ -167,7 +179,7 @@ export class DequeueSystem { "dequeueFromWorkerQueue", [runId], async () => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); + const snapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); if (!isDequeueableExecutionStatus(snapshot.executionStatus)) { // If it's pending executing it will be picked up by the stalled system if there's an issue @@ -350,7 +362,7 @@ export class DequeueSystem { } //check for a valid deployment if it's not a development environment - if (result.run.runtimeEnvironment.type !== "DEVELOPMENT") { + if 
(result.environmentType !== "DEVELOPMENT") { if (!result.deployment || !result.deployment.imageReference) { this.$.logger.warn("RunEngine.dequeueFromWorkerQueue(): No deployment found", { runId, @@ -604,8 +616,8 @@ export class DequeueSystem { annotations: RunAnnotations.safeParse(lockedTaskRun.annotations).data, }, environment: { - id: lockedTaskRun.runtimeEnvironment.id, - type: lockedTaskRun.runtimeEnvironment.type, + id: lockedTaskRun.runtimeEnvironmentId, + type: result.environmentType, }, organization: { id: orgId, @@ -644,16 +656,22 @@ export class DequeueSystem { this.$.runStore.findRun( { id: runId }, { - include: { - runtimeEnvironment: true, + select: { + id: true, + runtimeEnvironmentId: true, + projectId: true, }, }, prisma ) ); - // If DB is unavailable or run not found, just nack directly via Redis - if (findError || !run) { + const env = run + ? await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId) + : null; + + // If DB is unavailable, run not found, or env not resolved, just nack directly via Redis + if (findError || !run || !env) { this.$.logger.error( "RunEngine.dequeueFromWorkerQueue(): Failed to find run, nacking directly via Redis", { @@ -670,9 +688,9 @@ export class DequeueSystem { //this is an unknown error, we'll reattempt (with auto-backoff and eventually DLQ) const gotRequeued = await this.runAttemptSystem.tryNackAndRequeue({ run, - environment: run.runtimeEnvironment, + environment: { id: env.id, type: env.type }, orgId, - projectId: run.runtimeEnvironment.projectId, + projectId: run.projectId, error: { type: "INTERNAL_ERROR", code: "TASK_RUN_DEQUEUED_MAX_RETRIES", @@ -738,25 +756,26 @@ export class DequeueSystem { { select: { id: true, + runtimeEnvironmentId: true, status: true, attemptNumber: true, updatedAt: true, createdAt: true, runTags: true, batchId: true, - runtimeEnvironment: { - select: { - id: true, - type: true, - projectId: true, - project: { select: { id: true, organizationId: true } }, - }, - }, }, }, 
prisma ); + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + this.$.logger.error("RunEngine.#pendingVersion(): environment not found", { runId }); + await this.$.runQueue.acknowledgeMessage(orgId, runId); + return; + } + this.$.logger.debug("RunEngine.dequeueFromWorkerQueue(): Pending version", { runId, run, @@ -769,10 +788,10 @@ export class DequeueSystem { description: reason ?? "The run doesn't have a background worker, so we're going to ack it for now.", }, - environmentId: run.runtimeEnvironment.id, - environmentType: run.runtimeEnvironment.type, - projectId: run.runtimeEnvironment.projectId, - organizationId: run.runtimeEnvironment.project.organizationId, + environmentId: env.id, + environmentType: env.type, + projectId: env.projectId, + organizationId: env.organizationId, workerId, runnerId, }); @@ -791,13 +810,13 @@ export class DequeueSystem { batchId: run.batchId, }, organization: { - id: run.runtimeEnvironment.project.organizationId, + id: env.organizationId, }, project: { - id: run.runtimeEnvironment.projectId, + id: env.projectId, }, environment: { - id: run.runtimeEnvironment.id, + id: env.id, }, }); }); @@ -811,26 +830,14 @@ export class DequeueSystem { return startSpan(this.$.tracer, "getRunWithBackgroundWorkerTasks", async (span) => { span.setAttribute("run_id", runId); + // Read the run-ops scalars only; the control-plane env + worker version are resolved + // separately so the run-ops DB can split without a cross-provider join. 
const run = await this.$.runStore.findRun( { id: runId, }, { - include: { - runtimeEnvironment: { - select: { - id: true, - type: true, - archivedAt: true, - }, - }, - lockedToVersion: { - include: { - deployment: true, - tasks: true, - }, - }, - }, + select: dequeueRunSelect, }, prisma ); @@ -844,35 +851,39 @@ export class DequeueSystem { }; } - span.setAttribute("environment_type", run.runtimeEnvironment.type); + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + span.setAttribute("result", "NO_RUN"); + return { + success: false as const, + code: "NO_RUN", + message: `No environment found for run: ${runId}`, + }; + } + + span.setAttribute("environment_type", env.type); - if (run.runtimeEnvironment.archivedAt) { + if (env.archivedAt) { span.setAttribute("result", "RUN_ENVIRONMENT_ARCHIVED"); return { success: false as const, code: "RUN_ENVIRONMENT_ARCHIVED", message: `Run is on an archived environment: ${run.id}`, run, + environmentType: env.type, }; } const workerId = run.lockedToVersionId ?? backgroundWorkerId; //get the relevant BackgroundWorker with tasks and deployment (if not DEV) - let workerWithTasks: WorkerDeploymentWithWorkerTasks | null = null; - - if (run.runtimeEnvironment.type === "DEVELOPMENT") { - workerWithTasks = workerId - ? await this.#getWorkerById(prisma, workerId) - : await this.#getMostRecentWorker(prisma, run.runtimeEnvironmentId); - } else { - workerWithTasks = workerId - ? await this.#getWorkerDeploymentFromWorker(prisma, workerId) - : await this.#getManagedWorkerFromCurrentlyPromotedDeployment( - prisma, - run.runtimeEnvironmentId - ); - } + const workerWithTasks: WorkerDeploymentWithWorkerTasks | null = + await this.$.controlPlaneResolver.resolveWorkerVersion({ + environmentId: run.runtimeEnvironmentId, + type: env.type, + workerId: workerId ?? 
undefined, + }); if (!workerWithTasks) { span.setAttribute("result", "NO_WORKER"); @@ -881,6 +892,7 @@ export class DequeueSystem { code: "NO_WORKER", message: `No worker found for run: ${run.id}`, run, + environmentType: env.type, }; } @@ -896,6 +908,7 @@ export class DequeueSystem { received: workerWithTasks.worker.id, }, run, + environmentType: env.type, }; } } @@ -903,6 +916,8 @@ export class DequeueSystem { const backgroundTask = workerWithTasks.tasks.find((task) => task.slug === run.taskIdentifier); if (!backgroundTask) { + // Diagnostic-only disambiguation (off the hot path); left on `prisma` as the resolver + // interface exposes only env + worker-version resolution. const nonCurrentTask = await prisma.backgroundWorkerTask.findFirst({ where: { slug: run.taskIdentifier, @@ -924,6 +939,7 @@ export class DequeueSystem { code: "TASK_NOT_IN_LATEST", message: `Task not found in latest version: ${run.taskIdentifier}. Found in ${nonCurrentTask.worker.version}`, run, + environmentType: env.type, }; } else { span.setAttribute("result", "TASK_NEVER_REGISTERED"); @@ -932,6 +948,7 @@ export class DequeueSystem { code: "TASK_NEVER_REGISTERED", message: `Task has never been registered (in dev or deployed): ${run.taskIdentifier}`, run, + environmentType: env.type, }; } } @@ -947,6 +964,7 @@ export class DequeueSystem { code: "QUEUE_NOT_FOUND", message: `Queue not found for run: ${run.id}`, run, + environmentType: env.type, }; } @@ -955,6 +973,7 @@ export class DequeueSystem { return { success: true as const, run, + environmentType: env.type, worker: workerWithTasks.worker, task: backgroundTask, queue, @@ -962,180 +981,4 @@ export class DequeueSystem { }; }); } - - async #getWorkerDeploymentFromWorker( - prisma: PrismaClientOrTransaction, - workerId: string - ): Promise { - return startSpan(this.$.tracer, "getWorkerDeploymentFromWorker", async (span) => { - const worker = await prisma.backgroundWorker.findFirst({ - where: { - id: workerId, - }, - include: { - deployment: 
true, - tasks: true, - queues: true, - }, - }); - - if (!worker) { - span.setAttribute("result", "NOT_FOUND"); - return null; - } - - span.setAttribute("result", "SUCCESS"); - - return { - worker, - tasks: worker.tasks, - queues: worker.queues, - deployment: worker.deployment, - }; - }); - } - - async #getMostRecentWorker( - prisma: PrismaClientOrTransaction, - environmentId: string - ): Promise { - return startSpan(this.$.tracer, "getMostRecentWorker", async (span) => { - const worker = await prisma.backgroundWorker.findFirst({ - where: { - runtimeEnvironmentId: environmentId, - }, - include: { - tasks: true, - queues: true, - }, - orderBy: { - id: "desc", - }, - }); - - if (!worker) { - span.setAttribute("result", "NOT_FOUND"); - return null; - } - - span.setAttribute("result", "SUCCESS"); - - return { worker, tasks: worker.tasks, queues: worker.queues, deployment: null }; - }); - } - - async #getWorkerById( - prisma: PrismaClientOrTransaction, - workerId: string - ): Promise { - return startSpan(this.$.tracer, "getWorkerById", async (span) => { - const worker = await prisma.backgroundWorker.findFirst({ - where: { - id: workerId, - }, - include: { - deployment: true, - tasks: true, - queues: true, - }, - orderBy: { - id: "desc", - }, - }); - - if (!worker) { - span.setAttribute("result", "NOT_FOUND"); - return null; - } - - span.setAttribute("result", "SUCCESS"); - - return { - worker, - tasks: worker.tasks, - queues: worker.queues, - deployment: worker.deployment, - }; - }); - } - - async #getManagedWorkerFromCurrentlyPromotedDeployment( - prisma: PrismaClientOrTransaction, - environmentId: string - ): Promise { - return startSpan( - this.$.tracer, - "getManagedWorkerFromCurrentlyPromotedDeployment", - async (span) => { - const promotion = await prisma.workerDeploymentPromotion.findFirst({ - where: { - environmentId, - label: CURRENT_DEPLOYMENT_LABEL, - }, - include: { - deployment: { - include: { - worker: { - include: { - tasks: true, - queues: true, - }, - }, 
- }, - }, - }, - }); - - if (!promotion || !promotion.deployment.worker) { - span.setAttribute("result", "NO_PROMOTION_OR_WORKER"); - return null; - } - - if (promotion.deployment.type === "MANAGED") { - // This is a run engine v2 deployment, so return it - span.setAttribute("result", "SUCCESS_CURRENT_MANAGED"); - - return { - worker: promotion.deployment.worker, - tasks: promotion.deployment.worker.tasks, - queues: promotion.deployment.worker.queues, - deployment: promotion.deployment, - }; - } - - // We need to get the latest run engine v2 deployment - const latestV2Deployment = await prisma.workerDeployment.findFirst({ - where: { - environmentId, - type: "MANAGED", - }, - orderBy: { - id: "desc", - }, - include: { - worker: { - include: { - tasks: true, - queues: true, - }, - }, - }, - }); - - if (!latestV2Deployment?.worker) { - span.setAttribute("result", "NO_V2_DEPLOYMENT"); - return null; - } - - span.setAttribute("result", "SUCCESS_LATEST_V2"); - - return { - worker: latestV2Deployment.worker, - tasks: latestV2Deployment.worker.tasks, - queues: latestV2Deployment.worker.queues, - deployment: latestV2Deployment, - }; - } - ); - } } diff --git a/internal-packages/run-engine/src/engine/systems/enqueueSystem.test.ts b/internal-packages/run-engine/src/engine/systems/enqueueSystem.test.ts new file mode 100644 index 00000000000..a5730b306f2 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/enqueueSystem.test.ts @@ -0,0 +1,202 @@ +import { assertNonNullable, containerTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import { trace } from "@internal/tracing"; +import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { getLatestExecutionSnapshot } from "../systems/executionSnapshotSystem.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; + +vi.setConfig({ 
testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any, store?: PostgresRunStore) { + return { + prisma, + ...(store ? { store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** + * A real PostgresRunStore subclass that counts the snapshot create method that enqueueRun's + * snapshot write routes through (via executionSnapshotSystem.createExecutionSnapshot). super.* + * runs the genuine store implementation, so the routing is observed over real containers without + * ever mocking prisma or the store. + */ +class CountingPostgresRunStore extends PostgresRunStore { + public snapshotCreates = 0; + + override async createExecutionSnapshot( + input: any, + tx?: any + ): ReturnType { + this.snapshotCreates++; + return super.createExecutionSnapshot(input, tx); + } +} + +describe("RunEngine enqueueRun store routing", () => { + // The QUEUED snapshot written while enqueuing a run routes through the injected store. 
+ containerTest( + "enqueueRun snapshot routes through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const before = countingStore.snapshotCreates; + + const run = await engine.trigger( + { + number: 1, + friendlyId: generateFriendlyId("run"), + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + expect(countingStore.snapshotCreates).toBeGreaterThan(before); + + const latest = await getLatestExecutionSnapshot(prisma, run.id); + assertNonNullable(latest); + expect(latest.executionStatus).toBe("QUEUED"); + + const snapshotRow = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId: run.id, executionStatus: "QUEUED" }, + }); + assertNonNullable(snapshotRow); + expect(snapshotRow.runId).toBe(run.id); + } finally { + await engine.quit(); + } + } + ); + + // The Redis enqueueMessage path is unchanged — the run is dequeuable after enqueueRun. 
+ containerTest( + "Redis enqueue is unchanged (run is dequeuable after enqueueRun)", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: generateFriendlyId("run"), + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_consumer", + workerQueue: "main", + }); + + expect(dequeued.length).toBe(1); + expect(dequeued[0].run.id).toBe(run.id); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB binds one client (passthrough) — proven by behavior, not by store.prisma === prisma. + containerTest( + "single-DB binds one client (passthrough) — snapshot round-trips on the one client", + async ({ prisma, redisOptions }) => { + // No `store` injected → the engine builds its default single-client PostgresRunStore. 
+ const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: generateFriendlyId("run"), + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const latest = await getLatestExecutionSnapshot(prisma, run.id); + assertNonNullable(latest); + expect(latest.executionStatus).toBe("QUEUED"); + expect(latest.runId).toBe(run.id); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts index d899aa7a6f3..e0a065c445b 100644 --- a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts @@ -4,6 +4,7 @@ import { TaskRun, TaskRunExecutionStatus, } from "@trigger.dev/database"; +import { RunStore } from "@internal/run-store"; import { parseNaturalLanguageDuration } from "@trigger.dev/core/v3/isomorphic"; import { MinimalAuthenticatedEnvironment } from "../../shared/index.js"; import { ExecutionSnapshotSystem } from "./executionSnapshotSystem.js"; @@ -37,6 +38,7 @@ export class EnqueueSystem { skipRunLock, includeTtl = false, enableFastPath = false, + store, }: { run: TaskRun; env: MinimalAuthenticatedEnvironment; @@ -60,28 +62,38 @@ export class EnqueueSystem { includeTtl?: boolean; /** When true, allow the queue to push directly to worker queue if concurrency is available. 
*/ enableFastPath?: boolean; + /** + * When set (inside `runStore.runInTransaction`), the snapshot write goes through this tx-bound + * store so the promote+snapshot pair is atomic on the run's owning DB. The Redis enqueue + * below is not part of that transaction. + */ + store?: RunStore; }) { const prisma = tx ?? this.$.prisma; return await this.$.runLock.lockIf(!skipRunLock, "enqueueRun", [run.id], async () => { - const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { - run: run, - snapshot: { - executionStatus: snapshot?.status ?? "QUEUED", - description: snapshot?.description ?? "Run was QUEUED", - metadata: snapshot?.metadata ?? undefined, + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + prisma, + { + run: run, + snapshot: { + executionStatus: snapshot?.status ?? "QUEUED", + description: snapshot?.description ?? "Run was QUEUED", + metadata: snapshot?.metadata ?? undefined, + }, + previousSnapshotId, + batchId, + environmentId: env.id, + environmentType: env.type, + projectId: env.project.id, + organizationId: env.organization.id, + checkpointId, + completedWaitpoints, + workerId, + runnerId, }, - previousSnapshotId, - batchId, - environmentId: env.id, - environmentType: env.type, - projectId: env.project.id, - organizationId: env.organization.id, - checkpointId, - completedWaitpoints, - workerId, - runnerId, - }); + store + ); // Force development runs to use the environment id as the worker queue. const workerQueue = env.type === "DEVELOPMENT" ? 
env.id : run.workerQueue; diff --git a/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.test.ts b/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.test.ts new file mode 100644 index 00000000000..c52291876c7 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.test.ts @@ -0,0 +1,374 @@ +import { containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { + PostgresRunStore, + type CreateExecutionSnapshotInput, + type RunStore, +} from "@internal/run-store"; +import { trace } from "@internal/tracing"; +import { SnapshotId } from "@trigger.dev/core/v3/isomorphic"; +import type { PrismaClient } from "@trigger.dev/database"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; +import { getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any, store?: PostgresRunStore) { + return { + prisma, + ...(store ? { store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** + * A real PostgresRunStore subclass that counts the snapshot read/write methods this unit + * routes through, so the routing can be observed over real containers without ever mocking + * prisma. super.* runs the genuine store implementation. 
+ */ +class CountingPostgresRunStore extends PostgresRunStore { + public creates = 0; + public latestReads = 0; + + override async createExecutionSnapshot( + input: CreateExecutionSnapshotInput, + tx?: any + ): ReturnType { + this.creates++; + return super.createExecutionSnapshot(input, tx); + } + + override async findLatestExecutionSnapshot( + runId: string, + client?: any + ): ReturnType { + this.latestReads++; + return super.findLatestExecutionSnapshot(runId, client); + } +} + +async function triggerRun(engine: RunEngine, prisma: PrismaClient, friendlyId: string) { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId, + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + return run; +} + +describe("executionSnapshotSystem store routing (single-DB passthrough)", () => { + // A triggered run's first snapshot create goes through the store, and the row lands. 
+ containerTest("snapshot create routes through the store", async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const run = await triggerRun(engine, prisma, "run_snapcreate1"); + + const persisted = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId: run.id }, + }); + expect(persisted).not.toBeNull(); + expect(persisted?.runId).toBe(run.id); + expect(countingStore.creates).toBeGreaterThanOrEqual(1); + } finally { + await engine.quit(); + } + }); + + // getLatestExecutionSnapshot reads through the store, routed by run id. + containerTest( + "getLatestExecutionSnapshot reads through the store routed by run id", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const run = await triggerRun(engine, prisma, "run_snaplatest1"); + + const before = countingStore.latestReads; + const latest = await getLatestExecutionSnapshot(prisma, run.id, countingStore); + + expect(latest.runId).toBe(run.id); + expect(countingStore.latestReads).toBeGreaterThan(before); + // friendlyId is a valid SnapshotId friendly id derived from the cuid. + expect(latest.friendlyId).toMatch(/^snapshot_/); + expect(SnapshotId.fromFriendlyId(latest.friendlyId)).toBe(latest.id); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB binds one client (passthrough) — proven by behavior, not by reaching + // into a private prisma member. The read returns exactly the row just written. 
+ containerTest( + "single-DB binds one client (passthrough) — round-trip on one client", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const run = await triggerRun(engine, prisma, "run_snappassthru"); + + const latest = await getLatestExecutionSnapshot(prisma, run.id, engine.runStore); + const persisted = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId: run.id, isValid: true }, + orderBy: { createdAt: "desc" }, + }); + + expect(persisted).not.toBeNull(); + // The read resolves on the one client to exactly the row just written. + expect(latest.id).toBe(persisted!.id); + expect(latest.runId).toBe(run.id); + } finally { + await engine.quit(); + } + } + ); +}); + +// --- Cross-version read-through proofs (Tests D/E/F) --- + +/** + * A real, minimal two-store router over two PostgresRunStore instances, selecting by owning + * run id. Never a mock: it only re-implements the by-run-id #route selection the production + * RoutingRunStore performs, delegating to genuine stores over real containers. We know which + * runs live where because we seed each run on exactly one store. + */ +class TwoStoreSnapshotRouter { + readonly newStore: PostgresRunStore; + readonly legacyStore: PostgresRunStore; + readonly #newRunIds: Set; + + constructor( + newStore: PostgresRunStore, + legacyStore: PostgresRunStore, + newRunIds: Iterable + ) { + this.newStore = newStore; + this.legacyStore = legacyStore; + this.#newRunIds = new Set(newRunIds); + } + + #route(runId: string): PostgresRunStore { + return this.#newRunIds.has(runId) ? 
this.newStore : this.legacyStore; + } + + findLatestExecutionSnapshot(runId: string, client?: any) { + return this.#route(runId).findLatestExecutionSnapshot(runId, client); + } +} + +async function seedEnvironment(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${suffix}`, + pkApiKey: `pk_dev_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +async function seedRunWithSnapshot( + prisma: PrismaClient, + store: PostgresRunStore, + suffix: string, + runId: string +) { + const env = await seedEnvironment(prisma, suffix); + await store.createRun({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: `run_friendly_${suffix}`, + runtimeEnvironmentId: env.environment.id, + environmentType: "DEVELOPMENT", + organizationId: env.organization.id, + projectId: env.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `trace_${suffix}`, + spanId: `span_${suffix}`, + runTags: [], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: env.environment.id, + environmentType: "DEVELOPMENT", + projectId: env.project.id, + organizationId: env.organization.id, + }, + } as any); + + const snapshot = await 
store.createExecutionSnapshot({ + run: { id: runId, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING", description: "executing" }, + environmentId: env.environment.id, + environmentType: "DEVELOPMENT", + projectId: env.project.id, + organizationId: env.organization.id, + }); + + return { env, snapshot }; +} + +describe("executionSnapshotSystem store routing (cross-version read-through)", () => { + // A new run resolves to the run-ops (NEW/PG17) store; the legacy store is untouched. + heteroPostgresTest( + "new run -> run-ops store (legacy untouched)", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const rNew = "run_new_d"; + const { snapshot } = await seedRunWithSnapshot(prisma17 as any, newStore, "new_d", rNew); + + const router = new TwoStoreSnapshotRouter(newStore, legacyStore, [rNew]); + + const latest = await getLatestExecutionSnapshot( + prisma17 as any, + rNew, + router as unknown as RunStore + ); + + expect(latest.runId).toBe(rNew); + expect(latest.id).toBe(snapshot.id); + // The legacy store has no such run. + const legacyRow = await prisma14.taskRunExecutionSnapshot.findFirst({ + where: { runId: rNew }, + }); + expect(legacyRow).toBeNull(); + } + ); + + // An old run resolves via read-through to the LEGACY (PG14) store; the enhanced + // snapshot is well-formed across the version boundary. 
+ heteroPostgresTest( + "old run -> read-through to legacy store (well-formed across versions)", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const rOld = "run_old_e"; + const { snapshot } = await seedRunWithSnapshot(prisma14 as any, legacyStore, "old_e", rOld); + + // rOld is NOT in the new-run set, so the router routes it to LEGACY (read-through). + const router = new TwoStoreSnapshotRouter(newStore, legacyStore, []); + + const latest = await getLatestExecutionSnapshot( + prisma14 as any, + rOld, + router as unknown as RunStore + ); + + expect(latest.runId).toBe(rOld); + expect(latest.id).toBe(snapshot.id); + // EnhancedExecutionSnapshot is well-formed: friendlyId/runFriendlyId derived, arrays present. + expect(latest.friendlyId).toMatch(/^snapshot_/); + expect(latest.runFriendlyId).toMatch(/^run_/); + expect(Array.isArray(latest.completedWaitpoints)).toBe(true); + expect(latest.checkpoint).toBeNull(); + } + ); + + // Routing keys off runId, the SnapshotId is a cuid (not a 27-char ksuid), and no + // residency classifier is consulted for the snapshot id (D5). + heteroPostgresTest( + "snapshots route by owning run id; SnapshotId stays cuid", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const rNew = "run_new_f"; + const { snapshot } = await seedRunWithSnapshot(prisma17 as any, newStore, "new_f", rNew); + + // cuid is 25 chars (c + 24); a ksuid friendly id is 27 chars. The snapshot id is a cuid. + expect(snapshot.id.length).toBe(25); + + const router = new TwoStoreSnapshotRouter(newStore, legacyStore, [rNew]); + + // Route succeeds purely via runId; the snapshot id is never classified. 
+ const latest = await getLatestExecutionSnapshot( + prisma17 as any, + rNew, + router as unknown as RunStore + ); + expect(latest.id).toBe(snapshot.id); + expect(latest.id.length).toBe(25); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.ts b/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.ts index 38d1cf79a8c..6c01fde9393 100644 --- a/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.ts @@ -10,6 +10,7 @@ import { TaskRunStatus, Waitpoint, } from "@trigger.dev/database"; +import { RunStore } from "@internal/run-store"; import { ExecutionSnapshotNotFoundError } from "../errors.js"; import { HeartbeatTimeouts } from "../types.js"; import { SystemResources } from "./systems.js"; @@ -124,8 +125,13 @@ function enhanceExecutionSnapshotWithWaitpoints( */ async function getSnapshotWaitpointIds( prisma: PrismaClientOrTransaction, - snapshotId: string + snapshotId: string, + runStore?: RunStore ): Promise { + if (runStore) { + return runStore.findSnapshotCompletedWaitpointIds(snapshotId, prisma); + } + const result = await prisma.$queryRaw<{ B: string }[]>` SELECT "B" FROM "_completedWaitpoints" WHERE "A" = ${snapshotId} `; @@ -139,16 +145,19 @@ async function getSnapshotWaitpointIds( */ async function fetchWaitpointsInChunks( prisma: PrismaClientOrTransaction, - waitpointIds: string[] + waitpointIds: string[], + runStore?: RunStore ): Promise { if (waitpointIds.length === 0) return []; const allWaitpoints: Waitpoint[] = []; for (let i = 0; i < waitpointIds.length; i += WAITPOINT_CHUNK_SIZE) { const chunk = waitpointIds.slice(i, i + WAITPOINT_CHUNK_SIZE); - const waitpoints = await prisma.waitpoint.findMany({ - where: { id: { in: chunk } }, - }); + const waitpoints = runStore + ? 
await runStore.findManyWaitpoints({ where: { id: { in: chunk } } }, prisma) + : await prisma.waitpoint.findMany({ + where: { id: { in: chunk } }, + }); allWaitpoints.push(...waitpoints); } return allWaitpoints; @@ -157,16 +166,19 @@ async function fetchWaitpointsInChunks( /* Gets the most recent valid snapshot for a run */ export async function getLatestExecutionSnapshot( prisma: PrismaClientOrTransaction, - runId: string + runId: string, + runStore?: RunStore ): Promise { - const snapshot = await prisma.taskRunExecutionSnapshot.findFirst({ - where: { runId, isValid: true }, - include: { - completedWaitpoints: true, - checkpoint: true, - }, - orderBy: { createdAt: "desc" }, - }); + const snapshot = runStore + ? await runStore.findLatestExecutionSnapshot(runId, prisma) + : await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId, isValid: true }, + include: { + completedWaitpoints: true, + checkpoint: true, + }, + orderBy: { createdAt: "desc" }, + }); if (!snapshot) { throw new Error(`No execution snapshot found for TaskRun ${runId}`); @@ -175,31 +187,6 @@ export async function getLatestExecutionSnapshot( return enhanceExecutionSnapshot(snapshot); } -export async function getExecutionSnapshotCompletedWaitpoints( - prisma: PrismaClientOrTransaction, - snapshotId: string -) { - const waitpoints = await prisma.taskRunExecutionSnapshot.findFirst({ - where: { id: snapshotId }, - include: { - completedWaitpoints: true, - }, - }); - - //deduplicate waitpoints - const waitpointIds = new Set(); - return ( - waitpoints?.completedWaitpoints.filter((waitpoint) => { - if (waitpointIds.has(waitpoint.id)) { - return false; - } else { - waitpointIds.add(waitpoint.id); - return true; - } - }) ?? 
[] - ); -} - export function executionResultFromSnapshot(snapshot: TaskRunExecutionSnapshot): ExecutionResult { return { snapshot: { @@ -272,41 +259,67 @@ export function executionDataFromSnapshot(snapshot: EnhancedExecutionSnapshot): export async function getExecutionSnapshotsSince( prisma: PrismaClientOrTransaction, runId: string, - sinceSnapshotId: string + sinceSnapshotId: string, + runStore?: RunStore ): Promise { // Step 1: Find the createdAt of the sinceSnapshotId - const sinceSnapshot = await prisma.taskRunExecutionSnapshot.findFirst({ - where: { id: sinceSnapshotId, runId }, - select: { createdAt: true }, - }); + const sinceSnapshot = runStore + ? await runStore.findExecutionSnapshot( + { + where: { id: sinceSnapshotId, runId }, + select: { createdAt: true }, + }, + prisma + ) + : await prisma.taskRunExecutionSnapshot.findFirst({ + where: { id: sinceSnapshotId, runId }, + select: { createdAt: true }, + }); if (!sinceSnapshot) { throw new ExecutionSnapshotNotFoundError(sinceSnapshotId); } // Step 2: Fetch snapshots WITHOUT waitpoints to avoid N×M data explosion - const snapshots = await prisma.taskRunExecutionSnapshot.findMany({ - where: { - runId, - isValid: true, - createdAt: { gt: sinceSnapshot.createdAt }, - }, - include: { - checkpoint: true, - // DO NOT include completedWaitpoints here - this causes the N×M explosion - }, - orderBy: { createdAt: "desc" }, - take: 50, - }); + const snapshots = runStore + ? 
await runStore.findManyExecutionSnapshots( + { + where: { + runId, + isValid: true, + createdAt: { gt: sinceSnapshot.createdAt }, + }, + include: { + checkpoint: true, + // DO NOT include completedWaitpoints here - this causes the N×M explosion + }, + orderBy: { createdAt: "desc" }, + take: 50, + }, + prisma + ) + : await prisma.taskRunExecutionSnapshot.findMany({ + where: { + runId, + isValid: true, + createdAt: { gt: sinceSnapshot.createdAt }, + }, + include: { + checkpoint: true, + // DO NOT include completedWaitpoints here - this causes the N×M explosion + }, + orderBy: { createdAt: "desc" }, + take: 50, + }); if (snapshots.length === 0) return []; // Step 3: Get waitpoint IDs for the LATEST snapshot only (first in desc order) const latestSnapshot = snapshots[0]; - const waitpointIds = await getSnapshotWaitpointIds(prisma, latestSnapshot.id); + const waitpointIds = await getSnapshotWaitpointIds(prisma, latestSnapshot.id, runStore); // Step 4: Fetch waitpoints in chunks to avoid NAPI string conversion limits - const waitpoints = await fetchWaitpointsInChunks(prisma, waitpointIds); + const waitpoints = await fetchWaitpointsInChunks(prisma, waitpointIds, runStore); // Step 5: Build enhanced snapshots - only latest gets waitpoints, others get empty arrays // The runner only uses completedWaitpoints from the latest snapshot anyway @@ -366,18 +379,18 @@ export class ExecutionSnapshotSystem { index?: number; }[]; error?: string; - } + }, + // When set (inside runStore.runInTransaction), the snapshot write goes through the owning store + // with `prisma` = that store's own tx, so it shares ONE transaction with the sibling write (e.g. + // startAttempt) and a mid-pair failure rolls both back. Otherwise the router routes it. + // The heartbeat/eventBus side effects below are unchanged. 
+ store?: RunStore ) { - const newSnapshot = await prisma.taskRunExecutionSnapshot.create({ - data: { - engine: "V2", - executionStatus: snapshot.executionStatus, - description: snapshot.description, + const newSnapshot = await (store ?? this.$.runStore).createExecutionSnapshot( + { + run, + snapshot, previousSnapshotId, - runId: run.id, - // We can't set the runStatus to DEQUEUED because it will break older runners - runStatus: run.status === "DEQUEUED" ? "PENDING" : run.status, - attemptNumber: run.attemptNumber ?? undefined, batchId, environmentId, environmentType, @@ -386,21 +399,11 @@ export class ExecutionSnapshotSystem { checkpointId, workerId, runnerId, - metadata: snapshot.metadata ?? undefined, - completedWaitpoints: { - connect: completedWaitpoints?.map((w) => ({ id: w.id })), - }, - completedWaitpointOrder: completedWaitpoints - ?.filter((c) => c.index !== undefined) - .sort((a, b) => a.index! - b.index!) - .map((w) => w.id), - isValid: error ? false : true, + completedWaitpoints, error, }, - include: { - checkpoint: true, - }, - }); + prisma + ); if (!error) { //set heartbeat (if relevant) @@ -449,7 +452,7 @@ export class ExecutionSnapshotSystem { const prisma = tx ?? this.$.prisma; //we don't need to acquire a run lock for any of this, it's not critical if it happens on an older version - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); if (latestSnapshot.id !== snapshotId) { this.$.logger.log("heartbeatRun: no longer the latest snapshot, stopping the heartbeat.", { runId, @@ -503,7 +506,7 @@ export class ExecutionSnapshotSystem { }): Promise { const prisma = tx ?? 
this.$.prisma; - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); this.$.logger.debug("restartHeartbeatForRun: enqueuing heartbeat", { runId, diff --git a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.test.ts b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.test.ts new file mode 100644 index 00000000000..9604613bcb2 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.test.ts @@ -0,0 +1,534 @@ +import { containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import { trace } from "@internal/tracing"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; +import type { PrismaClient } from "@trigger.dev/database"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +/** + * A real lookup double standing in for the ClickHouse/control-plane candidate-id source. + * Not a DB mock: it is the injected control-plane dependency that supplies candidate run ids, + * exactly as the production CH-backed lookup does. + */ +class StubPendingVersionRunIdLookup { + name = "stub-lookup"; + constructor(private ids: string[]) {} + setIds(ids: string[]) { + this.ids = ids; + } + async lookupPendingVersionRunIds(_args: any): Promise<{ runIds: string[] }> { + return { runIds: this.ids }; + } +} + +/** + * A real PostgresRunStore subclass that counts the run-ops methods the pendingVersion path + * routes through, so the routing can be observed over real containers without mocking prisma. 
+ */ +class CountingPostgresRunStore extends PostgresRunStore { + public findRunsCalls = 0; + public promoteCalls = 0; + public promotedIds: string[] = []; + + /** + * Optional side-effect run AFTER `findRuns` has hydrated its rows but BEFORE + * they are returned to the caller. Used to simulate a candidate that is still + * PENDING_VERSION at hydrate time but races out of it before the per-run + * promote transaction runs, so the loop reaches it and the count === 0 + * idempotency guard fires. + */ + public afterFindRuns?: () => Promise; + + override async findRuns( + ...args: Parameters + ): ReturnType { + this.findRunsCalls++; + const result = await super.findRuns(...args); + if (this.afterFindRuns) { + await this.afterFindRuns(); + } + return result; + } + + override async promotePendingVersionRuns( + ...args: Parameters + ): ReturnType { + this.promoteCalls++; + this.promotedIds.push(args[0] as string); + return super.promotePendingVersionRuns(...args); + } +} + +function createEngineOptions( + redisOptions: any, + prisma: any, + lookup: StubPendingVersionRunIdLookup, + store?: PostgresRunStore +) { + return { + prisma, + ...(store ? { store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + pendingVersionRunIdLookup: lookup, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** + * Seed a PENDING_VERSION run via the store directly into the given environment. This avoids the + * trigger lifecycle's background auto-resolution racing the test, while still exercising the + * real run-ops store create path over real containers. 
+ */ +async function seedPendingVersionRunInEnv( + store: PostgresRunStore, + environment: Awaited>, + taskIdentifier: string, + createdAt: Date = new Date() +) { + const { id, friendlyId } = RunId.generate(); + await store.createRun({ + data: { + id, + engine: "V2", + status: "PENDING_VERSION", + friendlyId, + runtimeEnvironmentId: environment.id, + environmentType: environment.type, + organizationId: environment.organizationId, + projectId: environment.projectId, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `trace_${friendlyId}`, + spanId: `span_${friendlyId}`, + runTags: [], + queue: `task/${taskIdentifier}`, + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt, + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING_VERSION", + environmentId: environment.id, + environmentType: environment.type, + projectId: environment.projectId, + organizationId: environment.organizationId, + }, + } as any); + return { id, queue: `task/${taskIdentifier}` }; +} + +describe("pendingVersionSystem store routing (single-DB passthrough)", () => { + // Candidate ids from the (control-plane/CH) lookup hydrate from the run-ops store + // via findRuns. Uses a DEVELOPMENT env so setupBackgroundWorker performs no deployment + + // no background auto-resolution that would race the manual resolve. 
+ containerTest( + "CH candidate ids hydrate from the run-ops store via findRuns", + async ({ prisma, redisOptions }) => { + const lookup = new StubPendingVersionRunIdLookup([]); + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine( + createEngineOptions(redisOptions, prisma, lookup, countingStore) + ); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "DEVELOPMENT"); + const taskIdentifier = "test-task"; + const worker = await setupBackgroundWorker(engine, environment, taskIdentifier); + + const r1 = await seedPendingVersionRunInEnv(countingStore, environment, taskIdentifier); + const r2 = await seedPendingVersionRunInEnv(countingStore, environment, taskIdentifier); + + lookup.setIds([r1.id, r2.id]); + + const beforeFindRuns = countingStore.findRunsCalls; + await engine.pendingVersionSystem.enqueueRunsForBackgroundWorker(worker.worker.id); + + // The id-set hydrate went through findRuns. + expect(countingStore.findRunsCalls).toBeGreaterThan(beforeFindRuns); + // Both candidate rows were promoted out of PENDING_VERSION via the routed flip. + const after = await prisma.taskRun.findMany({ where: { id: { in: [r1.id, r2.id] } } }); + for (const row of after) expect(row.status).toBe("PENDING"); + } finally { + await engine.quit(); + } + } + ); + + // The promotion flips PENDING_VERSION -> PENDING atomically and enqueues; the + // count === 0 idempotency guard fires for a candidate that is still PENDING_VERSION at + // hydrate time but races out of it before its per-run promote transaction runs. r2 is + // flipped to PENDING by the store's afterFindRuns hook — i.e. AFTER findRuns has already + // hydrated it into the candidate set — so the loop reaches r2, promotePendingVersionRuns + // is genuinely invoked for it, returns count === 0, and the `if (!promoted) continue` + // guard skips the enqueue/event without throwing. 
+ containerTest( + "promotion flips PENDING_VERSION -> PENDING atomically; count === 0 guard skips a raced candidate", + async ({ prisma, redisOptions }) => { + const lookup = new StubPendingVersionRunIdLookup([]); + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine( + createEngineOptions(redisOptions, prisma, lookup, countingStore) + ); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "DEVELOPMENT"); + const taskIdentifier = "test-task"; + const worker = await setupBackgroundWorker(engine, environment, taskIdentifier); + + const r1 = await seedPendingVersionRunInEnv(countingStore, environment, taskIdentifier); + const r2 = await seedPendingVersionRunInEnv(countingStore, environment, taskIdentifier); + + const statusEvents: string[] = []; + engine.eventBus.on("runStatusChanged", (e: any) => { + statusEvents.push(e.run.id); + }); + + // Both r1 and r2 are PENDING_VERSION when findRuns hydrates them. Right after the + // hydrate returns, flip r2 to PENDING so that when the per-run loop reaches it the + // promote update matches 0 rows. This drives the count === 0 branch — unlike a + // pre-call flip, which would have r2 dropped by the hydrate status filter and never + // reach the promote at all. Guard against re-entrancy (findRuns may run again on a + // reschedule) so we only flip once. + let flipped = false; + countingStore.afterFindRuns = async () => { + if (flipped) return; + flipped = true; + await prisma.taskRun.update({ where: { id: r2.id }, data: { status: "PENDING" } }); + }; + + lookup.setIds([r1.id, r2.id]); + await engine.pendingVersionSystem.enqueueRunsForBackgroundWorker(worker.worker.id); + + // The guard was actually reached for r2 (it survived the hydrate), not skipped earlier. 
+ expect(countingStore.promotedIds).toContain(r2.id); + expect(countingStore.promotedIds).toContain(r1.id); + + const after = await prisma.taskRun.findMany({ where: { id: { in: [r1.id, r2.id] } } }); + const byId = new Map(after.map((r) => [r.id, r])); + // r1 was promoted PENDING_VERSION -> PENDING. + expect(byId.get(r1.id)!.status).toBe("PENDING"); + // r2 stays PENDING (the count === 0 guard skipped it, no double-promote, no throw). + expect(byId.get(r2.id)!.status).toBe("PENDING"); + + // r1 entered the queue. + const queueLength = await engine.runQueue.lengthOfQueue(environment, r1.queue); + expect(queueLength).toBeGreaterThanOrEqual(1); + + // A runStatusChanged event fired for the promoted run only; r2 was skipped. + expect(statusEvents).toContain(r1.id); + expect(statusEvents).not.toContain(r2.id); + } finally { + await engine.quit(); + } + } + ); + + // Candidate ids whose rows are no longer PENDING_VERSION are dropped by the hydrate + // and no promotion/enqueue fires. + containerTest( + "stale candidates (not PENDING_VERSION) are dropped by the hydrate", + async ({ prisma, redisOptions }) => { + const lookup = new StubPendingVersionRunIdLookup([]); + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine( + createEngineOptions(redisOptions, prisma, lookup, countingStore) + ); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "DEVELOPMENT"); + const taskIdentifier = "test-task"; + const worker = await setupBackgroundWorker(engine, environment, taskIdentifier); + + const r1 = await seedPendingVersionRunInEnv(countingStore, environment, taskIdentifier); + + // Move it past PENDING_VERSION so the residual status filter drops it. 
+ await prisma.taskRun.update({ where: { id: r1.id }, data: { status: "PENDING" } }); + + lookup.setIds([r1.id]); + + const beforePromote = countingStore.promoteCalls; + await engine.pendingVersionSystem.enqueueRunsForBackgroundWorker(worker.worker.id); + + // Hydrate ran but found nothing PENDING_VERSION, so no promotion fired. + expect(countingStore.findRunsCalls).toBeGreaterThanOrEqual(1); + expect(countingStore.promoteCalls).toBe(beforePromote); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB binds one client (passthrough) — proven by behavior. The whole resolve + // cycle (hydrate + flip + enqueue) resolves on the one client. + containerTest( + "single-DB binds one client (passthrough) — full resolve cycle on one client", + async ({ prisma, redisOptions }) => { + const lookup = new StubPendingVersionRunIdLookup([]); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, lookup)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "DEVELOPMENT"); + const taskIdentifier = "test-task"; + const worker = await setupBackgroundWorker(engine, environment, taskIdentifier); + + const r1 = await seedPendingVersionRunInEnv( + engine.runStore as PostgresRunStore, + environment, + taskIdentifier + ); + + lookup.setIds([r1.id]); + await engine.pendingVersionSystem.enqueueRunsForBackgroundWorker(worker.worker.id); + + // The flipped row and the enqueue resolve on the one client. 
+ const row = await prisma.taskRun.findFirstOrThrow({ where: { id: r1.id } }); + expect(row.status).toBe("PENDING"); + const queueLength = await engine.runQueue.lengthOfQueue(environment, r1.queue); + expect(queueLength).toBeGreaterThanOrEqual(1); + } finally { + await engine.quit(); + } + } + ); +}); + +// --- Cross-version / cross-DB proofs (Tests L/M) --- + +async function seedEnvironment(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${suffix}`, + pkApiKey: `pk_dev_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +async function seedPendingVersionRun( + prisma: PrismaClient, + store: PostgresRunStore, + suffix: string, + runId: string, + createdAt: Date, + env: Awaited> +) { + await store.createRun({ + data: { + id: runId, + engine: "V2", + status: "PENDING_VERSION", + friendlyId: `run_friendly_${suffix}`, + runtimeEnvironmentId: env.environment.id, + environmentType: "DEVELOPMENT", + organizationId: env.organization.id, + projectId: env.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `trace_${suffix}`, + spanId: `span_${suffix}`, + runTags: [], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt, + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING_VERSION", + environmentId: env.environment.id, + 
environmentType: "DEVELOPMENT", + projectId: env.project.id, + organizationId: env.organization.id, + }, + } as any); +} + +describe("pendingVersionSystem store routing (cross-version / cross-DB)", () => { + // Hydrate + promotion round-trip identically across PG14/PG17. + heteroPostgresTest( + "hydrate + promotion round-trip across versions", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const envNew = await seedEnvironment(prisma17 as any, "new_l"); + const envOld = await seedEnvironment(prisma14 as any, "old_l"); + + const newIds = ["run_new_l1", "run_new_l2"]; + const oldIds = ["run_old_l1", "run_old_l2"]; + // Seed in reverse createdAt order to prove the createdAt-asc ORDER BY. + await seedPendingVersionRun( + prisma17 as any, + newStore, + "new_l2", + newIds[1], + new Date("2024-02-01T00:00:00.000Z"), + envNew + ); + await seedPendingVersionRun( + prisma17 as any, + newStore, + "new_l1", + newIds[0], + new Date("2024-01-01T00:00:00.000Z"), + envNew + ); + await seedPendingVersionRun( + prisma14 as any, + legacyStore, + "old_l2", + oldIds[1], + new Date("2024-02-01T00:00:00.000Z"), + envOld + ); + await seedPendingVersionRun( + prisma14 as any, + legacyStore, + "old_l1", + oldIds[0], + new Date("2024-01-01T00:00:00.000Z"), + envOld + ); + + const newHydrate = await newStore.findRuns( + { where: { id: { in: newIds }, status: "PENDING_VERSION" }, orderBy: { createdAt: "asc" } }, + prisma17 as any + ); + const oldHydrate = await legacyStore.findRuns( + { where: { id: { in: oldIds }, status: "PENDING_VERSION" }, orderBy: { createdAt: "asc" } }, + prisma14 as any + ); + + // Identical ORDER BY (createdAt asc) across versions. 
+ expect(newHydrate.map((r) => r.friendlyId.replace("new_", ""))).toEqual( + oldHydrate.map((r) => r.friendlyId.replace("old_", "")) + ); + expect(newHydrate.map((r) => r.id)).toEqual(newIds); + expect(oldHydrate.map((r) => r.id)).toEqual(oldIds); + + // Promotion flips identically across versions. + const newPromote = await newStore.promotePendingVersionRuns(newIds[0], prisma17 as any); + const oldPromote = await legacyStore.promotePendingVersionRuns(oldIds[0], prisma14 as any); + expect(newPromote.count).toBe(oldPromote.count); + expect(newPromote.count).toBe(1); + + const newReread = await newStore.findRunOrThrow({ id: newIds[0] }, prisma17 as any); + const oldReread = await legacyStore.findRunOrThrow({ id: oldIds[0] }, prisma14 as any); + expect(newReread.status).toBe(oldReread.status); + expect(newReread.status).toBe("PENDING"); + } + ); + + // Cross-DB seam — lookup ids resolve to the NEW store; the promotion lands on NEW + // only, the LEGACY store is untouched. + heteroPostgresTest( + "cross-DB seam: CH ids resolve + promote on the NEW store only", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const envNew = await seedEnvironment(prisma17 as any, "new_m"); + const envOld = await seedEnvironment(prisma14 as any, "old_m"); + + const newId = "run_new_m"; + const legacyId = "run_legacy_m"; + await seedPendingVersionRun( + prisma17 as any, + newStore, + "new_m", + newId, + new Date("2024-01-01T00:00:00.000Z"), + envNew + ); + await seedPendingVersionRun( + prisma14 as any, + legacyStore, + "old_m", + legacyId, + new Date("2024-01-01T00:00:00.000Z"), + envOld + ); + + // The candidate id from the lookup exists on the NEW store. Hydrate it from NEW only. 
+ const hydrated = await newStore.findRuns( + { + where: { id: { in: [newId] }, status: "PENDING_VERSION" }, + orderBy: { createdAt: "asc" }, + }, + prisma17 as any + ); + expect(hydrated.map((r) => r.id)).toEqual([newId]); + + // Promote on NEW. + const promote = await newStore.promotePendingVersionRuns(newId, prisma17 as any); + expect(promote.count).toBe(1); + + // NEW flipped; LEGACY row untouched. + const newRow = await prisma17.taskRun.findFirstOrThrow({ where: { id: newId } }); + expect(newRow.status).toBe("PENDING"); + const legacyRow = await prisma14.taskRun.findFirstOrThrow({ where: { id: legacyId } }); + expect(legacyRow.status).toBe("PENDING_VERSION"); + // The NEW id does not exist on LEGACY at all. + const newOnLegacy = await prisma14.taskRun.findFirst({ where: { id: newId } }); + expect(newOnLegacy).toBeNull(); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts index 281808d7512..e2f3257d466 100644 --- a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts @@ -128,21 +128,28 @@ export class PendingVersionSystem { }); for (const run of pendingRuns) { - const promoted = await this.$.prisma.$transaction(async (tx) => { + // Atomic unit: the status promotion and the new QUEUED snapshot must commit together + // or a crash between them leaves the run promoted-to-PENDING with no snapshot. Under the run-ops + // split these route to the run's owning DB but, as two router calls, would each auto-commit. + // `runInTransaction` shares ONE owning-DB transaction; the inner writes use the tx-bound `store` + // (promotePendingVersionRuns directly, the snapshot via enqueueRun's `store` passthrough). The + // Redis enqueue inside enqueueRun is NOT in this transaction (Redis never was — unchanged). 
+ const promoted = await this.$.runStore.runInTransaction(run.id, async (store, tx) => { // Idempotency guard: only flips PENDING_VERSION → PENDING. If another // worker already promoted this run between our findMany and the // update, count is 0 and we skip the enqueue. - const updateResult = await this.$.runStore.promotePendingVersionRuns(run.id, tx); + const updateResult = await store.promotePendingVersionRuns(run.id, tx); if (updateResult.count === 0) { return false; } - const updatedRun = await this.$.runStore.findRunOrThrow({ id: run.id }, tx); + const updatedRun = await store.findRunOrThrow({ id: run.id }, tx); await this.enqueueSystem.enqueueRun({ run: updatedRun, env: backgroundWorker.runtimeEnvironment, + store, tx, // PENDING_VERSION re-enqueue is the first time this run is actually // entering the run queue (the original enqueue was held back waiting diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.test.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.test.ts new file mode 100644 index 00000000000..ec921e79c92 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.test.ts @@ -0,0 +1,660 @@ +import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { PostgresRunStore } from "@internal/run-store"; +import type { CreateRunInput } from "@internal/run-store"; +import type { Prisma, PrismaClient } from "@trigger.dev/database"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; +import { setTimeout } from "node:timers/promises"; + +vi.setConfig({ testTimeout: 60_000 }); + +// A real PostgresRunStore subclass (NEVER a mock) that records which dedicated +// RunStore method each runId was routed through, so the lifecycle tests can prove +// the taskRun reads/writes land on the 
store and not on direct prisma. +class CountingRunStore extends PostgresRunStore { + public readonly calls: Record = { + findRun: [], + startAttempt: [], + completeAttemptSuccess: [], + recordRetryOutcome: [], + requeueRun: [], + cancelRun: [], + failRunPermanently: [], + recordBulkActionMembership: [], + }; + + private record(method: keyof CountingRunStore["calls"], runId: string) { + this.calls[method].push(runId); + } + + countFor(method: keyof CountingRunStore["calls"], runId: string): number { + return this.calls[method].filter((id) => id === runId).length; + } + + override async findRun(where: any, args?: any, client?: any): Promise { + if (where && typeof where.id === "string") { + this.record("findRun", where.id); + } + // Preserve the 2-arg (where, client) overload where args is actually the client. + return super.findRun(where, args, client); + } + + override async startAttempt( + runId: string, + data: { attemptNumber: number; executedAt?: Date; isWarmStart: boolean }, + args: { select: S }, + tx?: any + ): Promise> { + this.record("startAttempt", runId); + return super.startAttempt(runId, data, args, tx); + } + + override async completeAttemptSuccess( + runId: string, + data: any, + args: { select: S }, + tx?: any + ): Promise> { + this.record("completeAttemptSuccess", runId); + return super.completeAttemptSuccess(runId, data, args, tx); + } + + override async recordRetryOutcome( + runId: string, + data: { machinePreset?: string; usageDurationMs: number; costInCents: number }, + args: { include: I }, + tx?: any + ): Promise> { + this.record("recordRetryOutcome", runId); + return super.recordRetryOutcome(runId, data, args, tx); + } + + override async requeueRun( + runId: string, + args: { select: S }, + tx?: any + ): Promise> { + this.record("requeueRun", runId); + return super.requeueRun(runId, args, tx); + } + + override async cancelRun( + runId: string, + data: any, + args: { select: S }, + tx?: any + ): Promise> { + this.record("cancelRun", runId); + 
return super.cancelRun(runId, data, args, tx); + } + + override async failRunPermanently( + runId: string, + data: any, + args: { select: S }, + tx?: any + ): Promise> { + this.record("failRunPermanently", runId); + return super.failRunPermanently(runId, data, args, tx); + } + + override async recordBulkActionMembership( + runId: string, + bulkActionId: string, + tx?: any + ): Promise { + this.record("recordBulkActionMembership", runId); + return super.recordBulkActionMembership(runId, bulkActionId, tx); + } +} + +function createEngineOptions(redisOptions: any, prisma: any, store: PostgresRunStore | undefined) { + return { + prisma, + ...(store ? { store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + "small-2x": { + name: "small-2x" as const, + cpu: 1, + memory: 1, + centsPerMs: 0.0002, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +async function triggerRun( + engine: RunEngine, + environment: any, + prisma: any, + taskIdentifier: string, + overrides: Record = {} +) { + return engine.trigger( + { + number: 1, + friendlyId: `run_${Math.random().toString(36).slice(2, 10)}`, + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `t_${Math.random().toString(36).slice(2, 10)}`, + spanId: `s_${Math.random().toString(36).slice(2, 10)}`, + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + ...overrides, + }, + prisma + ); +} + +async function dequeueOne(engine: RunEngine) { + await setTimeout(500); + const dequeued = await 
engine.dequeueFromWorkerQueue({ + consumerId: "test_consumer", + workerQueue: "main", + }); + return dequeued; +} + +describe("runAttemptSystem routes through the RunStore", () => { + // startRunAttempt routes the EXECUTING run write (and the minimal load + // read) through the store, resolved by the owning run id. + containerTest( + "startRunAttempt routes the run write through the store", + async ({ prisma, redisOptions }) => { + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, store)); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await triggerRun(engine, environment, prisma, taskIdentifier); + const dequeued = await dequeueOne(engine); + expect(dequeued.length).toBe(1); + + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(attemptResult.run.status).toBe("EXECUTING"); + expect(attemptResult.run.attemptNumber).toBe(1); + + const persisted = await prisma.taskRun.findUniqueOrThrow({ where: { id: run.id } }); + expect(persisted.status).toBe("EXECUTING"); + expect(persisted.attemptNumber).toBe(1); + + expect(store.countFor("startAttempt", run.id)).toBeGreaterThanOrEqual(1); + expect(store.countFor("findRun", run.id)).toBeGreaterThanOrEqual(1); + } finally { + await engine.quit(); + } + } + ); + + // attemptSucceeded finalizes COMPLETED_SUCCESSFULLY through the store, with + containerTest( + "attemptSucceeded finalizes through the store", + async ({ prisma, redisOptions }) => { + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, store)); + + try { + 
const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await triggerRun(engine, environment, prisma, taskIdentifier); + const dequeued = await dequeueOne(engine); + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + const result = await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: true, + id: dequeued[0].run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", + }, + }); + + expect(result.run.status).toBe("COMPLETED_SUCCESSFULLY"); + expect(result.snapshot.executionStatus).toBe("FINISHED"); + + const persisted = await prisma.taskRun.findUniqueOrThrow({ where: { id: run.id } }); + expect(persisted.status).toBe("COMPLETED_SUCCESSFULLY"); + expect(persisted.output).toBe(`{"foo":"bar"}`); + + const execData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(execData); + expect(execData.snapshot.executionStatus).toBe("FINISHED"); + + expect(store.countFor("completeAttemptSuccess", run.id)).toBe(1); + } finally { + await engine.quit(); + } + } + ); + + // attemptFailed -> retry routes the retry update through recordRetryOutcome, + // preserving the deep runtimeEnvironment.{project,organization,orgMember} include. 
+ containerTest( + "attemptFailed retry routes through the store with the deep include preserved", + async ({ prisma, redisOptions }) => { + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, store)); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier, undefined, { + outOfMemory: { machine: "small-2x" }, + }); + + const run = await triggerRun(engine, environment, prisma, taskIdentifier); + const dequeued = await dequeueOne(engine); + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + const result = await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: false, + id: dequeued[0].run.id, + error: { + type: "INTERNAL_ERROR" as const, + code: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" as const, + message: "Process exited with code -1 after signal SIGKILL.", + stackTrace: "JavaScript heap out of memory", + }, + }, + }); + + expect(result.attemptStatus).toBe("RETRY_QUEUED"); + expect(result.snapshot.executionStatus).toBe("QUEUED"); + + const execData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(execData); + expect(execData.snapshot.executionStatus).toBe("QUEUED"); + + const persisted = await prisma.taskRun.findUniqueOrThrow({ where: { id: run.id } }); + expect(persisted.machinePreset).toBe("small-2x"); + + expect(store.countFor("recordRetryOutcome", run.id)).toBe(1); + } finally { + await engine.quit(); + } + } + ); + + // Single-client passthrough: a start->succeed round-trip on the DEFAULT + containerTest("single-DB binds one client (passthrough)", async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, 
"PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, undefined)); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await triggerRun(engine, environment, prisma, taskIdentifier); + const dequeued = await dequeueOne(engine); + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: true, + id: dequeued[0].run.id, + output: `{"ok":true}`, + outputType: "application/json", + }, + }); + + const persisted = await prisma.taskRun.findUniqueOrThrow({ + where: { id: run.id }, + include: { executionSnapshots: true }, + }); + expect(persisted.status).toBe("COMPLETED_SUCCESSFULLY"); + expect(persisted.executionSnapshots.length).toBeGreaterThan(0); + expect(persisted.executionSnapshots.some((s: any) => s.executionStatus === "FINISHED")).toBe( + true + ); + } finally { + await engine.quit(); + } + }); + + // cancelRun routes the CANCELED update through the dedicated cancelRun method + containerTest( + "cancelRun routes the CANCELED update through the store", + async ({ prisma, redisOptions }) => { + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, store)); + + try { + const parentTask = "parent-task"; + const childTask = "child-task"; + await setupBackgroundWorker(engine, environment, [parentTask, childTask]); + + const parentRun = await triggerRun(engine, environment, prisma, parentTask); + const parentDequeued = await dequeueOne(engine); + await engine.startRunAttempt({ + runId: parentDequeued[0].run.id, + snapshotId: parentDequeued[0].snapshot.id, + }); + + // The child carries the 
associatedWaitpoint (it resumes its parent). Cancelling + // the still-queued child finishes it immediately and completes that waitpoint. + const childRun = await triggerRun(engine, environment, prisma, childTask, { + resumeParentOnCompletion: true, + parentTaskRunId: parentRun.id, + }); + + const associatedWaitpoint = await prisma.waitpoint.findFirstOrThrow({ + where: { completedByTaskRunId: childRun.id }, + }); + expect(associatedWaitpoint.status).toBe("PENDING"); + + const result = await engine.cancelRun({ + runId: childRun.id, + completedAt: new Date(), + reason: "Cancelled by the user", + }); + expect(result.snapshot.executionStatus).toBe("FINISHED"); + + const execData = await engine.getRunExecutionData({ runId: childRun.id }); + expect(execData?.run.status).toBe("CANCELED"); + + expect(store.countFor("cancelRun", childRun.id)).toBe(1); + + // The associated waitpoint was completed via waitpointSystem (inherited). + const completedWaitpoint = await prisma.waitpoint.findUniqueOrThrow({ + where: { id: associatedWaitpoint.id }, + }); + expect(completedWaitpoint.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // cancelRun child fan-out stays single-DB: cancelling a parent enqueues + containerTest("cancelRun child fan-out stays single-DB", async ({ prisma, redisOptions }) => { + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, store)); + + try { + const parentTask = "parent-task"; + const childTask = "child-task"; + await setupBackgroundWorker(engine, environment, [parentTask, childTask]); + + const parentRun = await triggerRun(engine, environment, prisma, parentTask); + + // Two real children in the subgraph. 
The parent is left un-started so the cancel + // finishes it immediately and runs the fan-out synchronously (an executing parent + // would defer the fan-out to attempt completion). + const childIds: string[] = []; + for (let i = 0; i < 2; i++) { + const childRun = await triggerRun(engine, environment, prisma, childTask, { + resumeParentOnCompletion: true, + parentTaskRunId: parentRun.id, + }); + childIds.push(childRun.id); + } + + const enqueuedIds: string[] = []; + const originalEnqueue = engine.worker.enqueue.bind(engine.worker); + (engine.worker as any).enqueue = async (item: any) => { + if (typeof item?.id === "string" && item.id.startsWith("cancelRun:")) { + enqueuedIds.push(item.id); + } + return originalEnqueue(item); + }; + + await engine.cancelRun({ + runId: parentRun.id, + completedAt: new Date(), + reason: "Cancelled by the user", + }); + + for (const childId of childIds) { + expect(enqueuedIds).toContain(`cancelRun:${childId}`); + } + + expect(store.countFor("cancelRun", parentRun.id)).toBe(1); + for (const childId of childIds) { + expect(store.countFor("cancelRun", childId)).toBe(0); + } + } finally { + await engine.quit(); + } + }); + + // Bulk-action push on an already-finished run routes through the dedicated + containerTest( + "bulk-action push on a finished run routes through recordBulkActionMembership", + async ({ prisma, redisOptions }) => { + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, store)); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await triggerRun(engine, environment, prisma, taskIdentifier); + const dequeued = await dequeueOne(engine); + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + await 
engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: true, + id: dequeued[0].run.id, + output: `{}`, + outputType: "application/json", + }, + }); + + const bulkActionId = "bulk_action_1234"; + const result = await engine.cancelRun({ + runId: run.id, + bulkActionId, + }); + + expect(result.alreadyFinished).toBe(true); + expect(store.countFor("recordBulkActionMembership", run.id)).toBe(1); + expect(store.countFor("cancelRun", run.id)).toBe(0); + + const persisted = await prisma.taskRun.findUniqueOrThrow({ where: { id: run.id } }); + expect(persisted.bulkActionGroupIds).toContain(bulkActionId); + } finally { + await engine.quit(); + } + } + ); +}); + +async function seedEnvironment(prisma: PrismaClient) { + const organization = await prisma.organization.create({ + data: { title: "Test Organization", slug: `org-${Math.random().toString(36).slice(2, 8)}` }, + }); + const project = await prisma.project.create({ + data: { + name: "Test Project", + slug: `proj-${Math.random().toString(36).slice(2, 8)}`, + externalRef: "proj_1234", + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: "tr_dev_apikey", + pkApiKey: "pk_dev_apikey", + shortcode: "short_code", + }, + }); + return { organization, project, environment }; +} + +function buildCreateRunInput(p: { + runId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; +}): CreateRunInput { + return { + data: { + id: p.runId, + engine: "V2", + status: "PENDING", + friendlyId: "run_friendly_1", + runtimeEnvironmentId: p.runtimeEnvironmentId, + environmentType: "DEVELOPMENT", + organizationId: p.organizationId, + projectId: p.projectId, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: "trace_1", + 
spanId: "span_1", + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: p.runtimeEnvironmentId, + environmentType: "DEVELOPMENT", + projectId: p.projectId, + organizationId: p.organizationId, + }, + }; +} + +describe("runAttemptSystem store routing — cross-version (heterogeneous Postgres)", () => { + // The attempt-lifecycle store methods this unit routes to (startAttempt -> + // completeAttemptSuccess) land their TaskRun write + FINISHED snapshot on the dedicated + // run-ops store, while a legacy/control-plane store over the same migrated schema is + // untouched. Proves the run-ops store owns the attempt lifecycle cross-version. + heteroPostgresTest( + "attempt lifecycle lands on the dedicated run-ops store", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const { organization, project, environment } = await seedEnvironment(prisma17); + + const runId = "run_hetero_lifecycle_1"; + await newStore.createRun( + buildCreateRunInput({ + runId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }) + ); + + const started = await newStore.startAttempt( + runId, + { attemptNumber: 1, executedAt: new Date(), isWarmStart: false }, + { select: { id: true, status: true, attemptNumber: true } } + ); + expect(started.status).toBe("EXECUTING"); + expect(started.attemptNumber).toBe(1); + + const completed = await newStore.completeAttemptSuccess( + runId, + { + completedAt: new Date(), + output: '{"result":"ok"}', + outputType: "application/json", + usageDurationMs: 500, + costInCents: 10, + snapshot: { + executionStatus: "FINISHED", + description: "Task completed successfully", + runStatus: "COMPLETED_SUCCESSFULLY", + attemptNumber: 1, + environmentId: environment.id, + 
environmentType: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + }, + }, + { select: { id: true, status: true, output: true } } + ); + expect(completed.status).toBe("COMPLETED_SUCCESSFULLY"); + expect(completed.output).toBe('{"result":"ok"}'); + + // The row + FINISHED snapshot are on the dedicated run-ops DB, byte-well-formed. + const onNew = await prisma17.taskRun.findUniqueOrThrow({ + where: { id: runId }, + include: { executionSnapshots: { where: { executionStatus: "FINISHED" } } }, + }); + expect(onNew.status).toBe("COMPLETED_SUCCESSFULLY"); + expect(onNew.executionSnapshots).toHaveLength(1); + expect(onNew.executionSnapshots[0]?.runStatus).toBe("COMPLETED_SUCCESSFULLY"); + + // The legacy/control-plane DB never saw this run — the lifecycle resolved to the owning store. + const onLegacy = await prisma14.taskRun.findUnique({ where: { id: runId } }); + expect(onLegacy).toBeNull(); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index cb733919cb1..93e170f12aa 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -34,12 +34,7 @@ import { getUserProvidedIdempotencyKey, } from "@trigger.dev/core/v3/serverOnly"; import { parsePacket } from "@trigger.dev/core/v3/utils/ioSerialization"; -import { - $transaction, - PrismaClientOrTransaction, - RuntimeEnvironmentType, - TaskRun, -} from "@trigger.dev/database"; +import { PrismaClientOrTransaction, RuntimeEnvironmentType } from "@trigger.dev/database"; import { MAX_TASK_RUN_ATTEMPTS } from "../consts.js"; import { runStatusFromError, ServiceValidationError } from "../errors.js"; import { sendNotificationToWorker } from "../eventBus.js"; @@ -63,7 +58,6 @@ import { import { SystemResources } from "./systems.js"; import { WaitpointSystem } from "./waitpointSystem.js"; import { 
BatchId, RunId } from "@trigger.dev/core/v3/isomorphic"; -import type { AuthenticatedEnvironment } from "../../shared/index.js"; export type RunAttemptSystemOptions = { resources: SystemResources; @@ -210,16 +204,7 @@ export class RunAttemptSystem { traceContext: true, priorityMs: true, taskIdentifier: true, - runtimeEnvironment: { - select: { - id: true, - slug: true, - type: true, - branchName: true, - git: true, - organizationId: true, - }, - }, + runtimeEnvironmentId: true, parentTaskRunId: true, rootTaskRunId: true, batchId: true, @@ -232,6 +217,12 @@ export class RunAttemptSystem { throw new ServiceValidationError("Task run not found", 404); } + const env = await this.$.controlPlaneResolver.resolveAuthenticatedEnv(run.runtimeEnvironmentId); + + if (!env) { + throw new ServiceValidationError("Task run environment not found", 404); + } + const [task, queue, organization, project, machinePreset, deployment] = await Promise.all([ run.lockedById ? this.#resolveTaskRunExecutionTask(run.lockedById) @@ -242,10 +233,10 @@ export class RunAttemptSystem { this.#resolveTaskRunExecutionQueue({ lockedQueueId: run.lockedQueueId ?? undefined, queueName: run.queue, - runtimeEnvironmentId: run.runtimeEnvironment.id, + runtimeEnvironmentId: env.id, }), - this.#resolveTaskRunExecutionOrganization(run.runtimeEnvironment.organizationId), - this.#resolveTaskRunExecutionProjectByRuntimeEnvironmentId(run.runtimeEnvironment.id), + this.#resolveTaskRunExecutionOrganization(env.organizationId), + this.#resolveTaskRunExecutionProjectByRuntimeEnvironmentId(env.id), run.lockedById ? this.#resolveTaskRunExecutionMachinePreset(run.lockedById, run.machinePreset) : Promise.resolve( @@ -277,7 +268,7 @@ export class RunAttemptSystem { priority: run.priorityMs === 0 ? undefined : run.priorityMs / 1_000, parentTaskRunId: run.parentTaskRunId ? RunId.toFriendlyId(run.parentTaskRunId) : undefined, rootTaskRunId: run.rootTaskRunId ? 
RunId.toFriendlyId(run.rootTaskRunId) : undefined, - region: run.runtimeEnvironment.type !== "DEVELOPMENT" ? run.workerQueue : undefined, + region: env.type !== "DEVELOPMENT" ? run.workerQueue : undefined, }, attempt: { number: run.attemptNumber ?? 1, @@ -290,11 +281,11 @@ export class RunAttemptSystem { machine: machinePreset, deployment, environment: { - id: run.runtimeEnvironment.id, - slug: run.runtimeEnvironment.slug, - type: run.runtimeEnvironment.type, - branchName: run.runtimeEnvironment.branchName ?? undefined, - git: safeParseGitMeta(run.runtimeEnvironment.git), + id: env.id, + slug: env.slug, + type: env.type, + branchName: env.branchName ?? undefined, + git: safeParseGitMeta(env.git), }, batch: run.batchId ? { id: BatchId.toFriendlyId(run.batchId) } : undefined, }; @@ -322,7 +313,7 @@ export class RunAttemptSystem { "startRunAttempt", async (span) => { return this.$.runLock.lock("startRunAttempt", [runId], async () => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); if (latestSnapshot.id !== snapshotId) { //if there is a big delay between the snapshot and the attempt, the snapshot might have changed @@ -398,10 +389,14 @@ export class RunAttemptSystem { throw new ServiceValidationError("Max attempts reached", 400); } - const result = await $transaction( - prisma, - async (tx) => { - const run = await this.$.runStore.startAttempt( + // Atomic unit: the attempt bump (startAttempt) and the EXECUTING snapshot must + // commit together or a crash between them leaves the run EXECUTING with no snapshot. Under + // the run-ops split these route to the SAME owning DB but, as two router calls, would each + // auto-commit. `runStore.runInTransaction(runId, ...)` wraps both in ONE transaction on the + // run's owning store; the inner writes go through the tx-bound `store` (not the router). 
+ const [transactionError, result] = await tryCatch( + this.$.runStore.runInTransaction(taskRun.id, async (store, tx) => { + const run = await store.startAttempt( taskRun.id, { attemptNumber: nextAttemptNumber, @@ -444,16 +439,7 @@ export class RunAttemptSystem { priorityMs: true, batchId: true, realtimeStreamsVersion: true, - runtimeEnvironment: { - select: { - id: true, - slug: true, - type: true, - branchName: true, - git: true, - organizationId: true, - }, - }, + runtimeEnvironmentId: true, parentTaskRunId: true, rootTaskRunId: true, workerQueue: true, @@ -463,24 +449,28 @@ export class RunAttemptSystem { tx ); - const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(tx, { - run, - snapshot: { - executionStatus: "EXECUTING", - description: `Attempt created, starting execution${ - isWarmStart ? " (warm start)" : "" - }`, + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + tx, + { + run, + snapshot: { + executionStatus: "EXECUTING", + description: `Attempt created, starting execution${ + isWarmStart ? " (warm start)" : "" + }`, + }, + previousSnapshotId: latestSnapshot.id, + environmentId: latestSnapshot.environmentId, + environmentType: latestSnapshot.environmentType, + projectId: latestSnapshot.projectId, + organizationId: latestSnapshot.organizationId, + batchId: latestSnapshot.batchId ?? undefined, + completedWaitpoints: latestSnapshot.completedWaitpoints, + workerId, + runnerId, }, - previousSnapshotId: latestSnapshot.id, - environmentId: latestSnapshot.environmentId, - environmentType: latestSnapshot.environmentType, - projectId: latestSnapshot.projectId, - organizationId: latestSnapshot.organizationId, - batchId: latestSnapshot.batchId ?? 
undefined, - completedWaitpoints: latestSnapshot.completedWaitpoints, - workerId, - runnerId, - }); + store + ); if (taskRun.ttl) { //don't expire the run, it's going to execute @@ -488,22 +478,19 @@ export class RunAttemptSystem { } return { updatedRun: run, snapshot: newSnapshot }; - }, - (error) => { - this.$.logger.error("RunEngine.createRunAttempt(): prisma.$transaction error", { - code: error.code, - meta: error.meta, - stack: error.stack, - message: error.message, - name: error.name, - }); - throw new ServiceValidationError( - "Failed to update task run and execution snapshot", - 500 - ); - } + }) ); + if (transactionError) { + this.$.logger.error("RunEngine.createRunAttempt(): prisma.$transaction error", { + error: transactionError, + }); + throw new ServiceValidationError( + "Failed to update task run and execution snapshot", + 500 + ); + } + if (!result) { this.$.logger.error("RunEngine.createRunAttempt(): failed to create task run attempt", { runId: taskRun.id, @@ -514,6 +501,14 @@ export class RunAttemptSystem { const { updatedRun, snapshot } = result; + const env = await this.$.controlPlaneResolver.resolveAuthenticatedEnv( + updatedRun.runtimeEnvironmentId + ); + + if (!env) { + throw new ServiceValidationError("Task run environment not found", 404); + } + this.$.eventBus.emit("runAttemptStarted", { time: new Date(), run: { @@ -528,17 +523,17 @@ export class RunAttemptSystem { batchId: updatedRun.batchId, }, organization: { - id: updatedRun.runtimeEnvironment.organizationId, + id: env.organizationId, }, project: { id: updatedRun.projectId, }, environment: { - id: updatedRun.runtimeEnvironment.id, + id: env.id, }, }); - const environmentGit = safeParseGitMeta(updatedRun.runtimeEnvironment.git); + const environmentGit = safeParseGitMeta(env.git); const [metadata, task, queue, organization, project, machinePreset, deployment] = await Promise.all([ @@ -550,14 +545,10 @@ export class RunAttemptSystem { this.#resolveTaskRunExecutionQueue({ lockedQueueId: 
updatedRun.lockedQueueId ?? undefined, queueName: updatedRun.queue, - runtimeEnvironmentId: updatedRun.runtimeEnvironment.id, + runtimeEnvironmentId: env.id, }), - this.#resolveTaskRunExecutionOrganization( - updatedRun.runtimeEnvironment.organizationId - ), - this.#resolveTaskRunExecutionProjectByRuntimeEnvironmentId( - updatedRun.runtimeEnvironment.id - ), + this.#resolveTaskRunExecutionOrganization(env.organizationId), + this.#resolveTaskRunExecutionProjectByRuntimeEnvironmentId(env.id), this.#resolveTaskRunExecutionMachinePreset( taskRun.lockedById, updatedRun.machinePreset @@ -609,19 +600,16 @@ export class RunAttemptSystem { rootTaskRunId: updatedRun.rootTaskRunId ? RunId.toFriendlyId(updatedRun.rootTaskRunId) : undefined, - region: - updatedRun.runtimeEnvironment.type !== "DEVELOPMENT" - ? updatedRun.workerQueue - : undefined, + region: env.type !== "DEVELOPMENT" ? updatedRun.workerQueue : undefined, realtimeStreamsVersion: updatedRun.realtimeStreamsVersion ?? undefined, }, task, queue, environment: { - id: updatedRun.runtimeEnvironment.id, - slug: updatedRun.runtimeEnvironment.slug, - type: updatedRun.runtimeEnvironment.type, - branchName: updatedRun.runtimeEnvironment.branchName ?? undefined, + id: env.id, + slug: env.slug, + type: env.type, + branchName: env.branchName ?? 
undefined, git: environmentGit, }, organization, @@ -705,7 +693,7 @@ export class RunAttemptSystem { "#completeRunAttemptSuccess", async (span) => { return this.$.runLock.lock("attemptSucceeded", [runId], async () => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); if (latestSnapshot.id !== snapshotId) { throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); @@ -780,11 +768,6 @@ export class RunAttemptSystem { id: true, }, }, - project: { - select: { - organizationId: true, - }, - }, batchId: true, createdAt: true, completedAt: true, @@ -798,9 +781,16 @@ export class RunAttemptSystem { }, prisma ); - const newSnapshot = await getLatestExecutionSnapshot(prisma, runId); - await this.$.runQueue.acknowledgeMessage(run.project.organizationId, runId); + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + throw new ServiceValidationError("Task run environment not found", 404); + } + + const newSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); + + await this.$.runQueue.acknowledgeMessage(env.organizationId, runId); // We need to manually emit this as we created the final snapshot as part of the task run update this.$.eventBus.emit("executionSnapshotCreated", { @@ -841,7 +831,7 @@ export class RunAttemptSystem { attemptNumber: run.attemptNumber ?? 
1, }, organization: { - id: run.project.organizationId, + id: env.organizationId, }, project: { id: run.projectId, @@ -890,7 +880,7 @@ export class RunAttemptSystem { "completeRunAttemptFailure", async (span) => { return this.$.runLock.lock("attemptFailed", [runId], async () => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); if (latestSnapshot.id !== snapshotId) { throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); @@ -933,11 +923,6 @@ export class RunAttemptSystem { status: true, spanId: true, maxAttempts: true, - runtimeEnvironment: { - select: { - organizationId: true, - }, - }, taskEventStore: true, createdAt: true, completedAt: true, @@ -1017,19 +1002,36 @@ export class RunAttemptSystem { costInCents: updatedUsage.costInCents, }, { - include: { - runtimeEnvironment: { - include: { - project: true, - organization: true, - orgMember: true, - }, - }, + select: { + id: true, + friendlyId: true, + status: true, + attemptNumber: true, + spanId: true, + queue: true, + taskIdentifier: true, + traceContext: true, + baseCostInCents: true, + runTags: true, + batchId: true, + createdAt: true, + completedAt: true, + updatedAt: true, + taskEventStore: true, + runtimeEnvironmentId: true, }, }, this.$.prisma ); + const env = await this.$.controlPlaneResolver.resolveAuthenticatedEnv( + run.runtimeEnvironmentId + ); + + if (!env) { + throw new ServiceValidationError("Task run environment not found", 404); + } + const nextAttemptNumber = latestSnapshot.attemptNumber === null ? 
1 : latestSnapshot.attemptNumber + 1; @@ -1071,18 +1073,9 @@ export class RunAttemptSystem { batchId: run.batchId, }, organization: { - id: run.runtimeEnvironment.organizationId, + id: env.organizationId, }, - // The Prisma payload structurally satisfies the slim - // AuthenticatedEnvironment except for `concurrencyLimitBurstFactor` - // (Decimal vs number). Coerce that one field; cast away - // the excess-property mismatch (the rest of Prisma's - // RuntimeEnvironment columns are extra, not missing). - environment: { - ...run.runtimeEnvironment, - concurrencyLimitBurstFactor: - run.runtimeEnvironment.concurrencyLimitBurstFactor.toNumber(), - } as unknown as AuthenticatedEnvironment, + environment: env, retryAt, }); @@ -1096,9 +1089,9 @@ export class RunAttemptSystem { //we nack the message, requeuing it for later const nackResult = await this.tryNackAndRequeue({ run, - environment: run.runtimeEnvironment, - orgId: run.runtimeEnvironment.organizationId, - projectId: run.runtimeEnvironment.project.id, + environment: env, + orgId: env.organizationId, + projectId: env.project.id, timestamp: retryAt.getTime(), error: { type: "INTERNAL_ERROR", @@ -1173,7 +1166,7 @@ export class RunAttemptSystem { this.$.tracer, "systemFailure", async (span) => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); //already finished if (latestSnapshot.executionStatus === "FINISHED") { @@ -1225,7 +1218,7 @@ export class RunAttemptSystem { batchId, tx, }: { - run: TaskRun; + run: { id: string }; environment: { id: string; type: RuntimeEnvironmentType; @@ -1344,7 +1337,7 @@ export class RunAttemptSystem { return startSpan(this.$.tracer, "cancelRun", async (span) => { return this.$.runLock.lock("cancelRun", [runId], async () => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId, 
this.$.runStore); //already finished, do nothing if (latestSnapshot.executionStatus === "FINISHED") { @@ -1429,11 +1422,7 @@ export class RunAttemptSystem { parentTaskRunId: true, delayUntil: true, updatedAt: true, - runtimeEnvironment: { - select: { - organizationId: true, - }, - }, + runtimeEnvironmentId: true, associatedWaitpoint: { select: { id: true, @@ -1449,13 +1438,19 @@ export class RunAttemptSystem { prisma ); + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + throw new ServiceValidationError("Task run environment not found", 404); + } + //if the run is delayed and hasn't started yet, we need to prevent it being added to the queue in future if (isInitialState(latestSnapshot.executionStatus) && run.delayUntil) { await this.delayedRunSystem.preventDelayedRunFromBeingEnqueued({ runId }); } //remove it from the queue and release concurrency - await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId, { + await this.$.runQueue.acknowledgeMessage(env.organizationId, runId, { removeFromWorkerQueue: true, }); @@ -1639,24 +1634,12 @@ export class RunAttemptSystem { updatedAt: true, usageDurationMs: true, costInCents: true, + runtimeEnvironmentId: true, associatedWaitpoint: { select: { id: true, }, }, - runtimeEnvironment: { - select: { - id: true, - type: true, - organizationId: true, - project: { - select: { - id: true, - organizationId: true, - }, - }, - }, - }, taskEventStore: true, createdAt: true, completedAt: true, @@ -1665,6 +1648,12 @@ export class RunAttemptSystem { this.$.prisma ); + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + throw new ServiceValidationError("Task run environment not found", 404); + } + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { run, snapshot: { @@ -1672,15 +1661,15 @@ export class RunAttemptSystem { description: "Run failed", }, previousSnapshotId: 
latestSnapshot.id, - environmentId: run.runtimeEnvironment.id, - environmentType: run.runtimeEnvironment.type, - projectId: run.runtimeEnvironment.project.id, - organizationId: run.runtimeEnvironment.project.organizationId, + environmentId: env.id, + environmentType: env.type, + projectId: env.projectId, + organizationId: env.organizationId, workerId, runnerId, }); - await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId, { + await this.$.runQueue.acknowledgeMessage(env.organizationId, runId, { removeFromWorkerQueue: true, }); @@ -1708,13 +1697,13 @@ export class RunAttemptSystem { costInCents: run.costInCents, }, organization: { - id: run.runtimeEnvironment.project.organizationId, + id: env.organizationId, }, project: { - id: run.runtimeEnvironment.project.id, + id: env.projectId, }, environment: { - id: run.runtimeEnvironment.id, + id: env.id, }, }); diff --git a/internal-packages/run-engine/src/engine/systems/systems.ts b/internal-packages/run-engine/src/engine/systems/systems.ts index 1b2f1d64c51..b6a00e02f11 100644 --- a/internal-packages/run-engine/src/engine/systems/systems.ts +++ b/internal-packages/run-engine/src/engine/systems/systems.ts @@ -1,5 +1,6 @@ import { Meter, Tracer } from "@internal/tracing"; import { RunStore } from "@internal/run-store"; +import { ControlPlaneResolver } from "../controlPlaneResolver.js"; import { Logger } from "@trigger.dev/core/logger"; import { PrismaClient, PrismaReplicaClient } from "@trigger.dev/database"; import { RunQueue } from "../../run-queue/index.js"; @@ -13,6 +14,7 @@ export type SystemResources = { prisma: PrismaClient; readOnlyPrisma: PrismaReplicaClient; runStore: RunStore; + controlPlaneResolver: ControlPlaneResolver; worker: EngineWorker; eventBus: EventBus; logger: Logger; diff --git a/internal-packages/run-engine/src/engine/systems/ttlSystem.test.ts b/internal-packages/run-engine/src/engine/systems/ttlSystem.test.ts new file mode 100644 index 00000000000..64db9b03e0e --- 
/dev/null +++ b/internal-packages/run-engine/src/engine/systems/ttlSystem.test.ts @@ -0,0 +1,610 @@ +import { + containerTest, + assertNonNullable, + heteroPostgresTest, + network, + redisContainer, + redisOptions, +} from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { expect } from "vitest"; +import { Decimal } from "@trigger.dev/database"; +import { PostgresRunStore } from "@internal/run-store"; +import type { RunStore } from "@internal/run-store"; +import { PrismaClient } from "@trigger.dev/database"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; + +// heteroPostgresTest provides two postgres clients but no redis; compose a fixture +// that adds a per-test redis container + options for the engine. +const heteroEngineTest = heteroPostgresTest.extend<{ + network: any; + redisContainer: any; + redisOptions: any; +}>({ + network, + redisContainer, + redisOptions, +}); + +vi.setConfig({ testTimeout: 60_000 }); + +// Real PostgresRunStore subclass (no mocks) counting routed method calls. +class CountingPostgresRunStore extends PostgresRunStore { + public calls = { + rescheduleRun: 0, + enqueueDelayedRun: 0, + expireRun: 0, + expireRunsBatch: 0, + findRun: 0, + findRuns: 0, + findLatestExecutionSnapshot: 0, + forWaitpointCompletion: 0, + }; + + // expireRun is generic over the select payload; keep the loose arg list so the + // override still satisfies the generic interface signature. 
+ override expireRun(...args: any[]): any { + this.calls.expireRun++; + return super.expireRun(...(args as [any, any, any, any])); + } + + override expireRunsBatch(...args: Parameters) { + this.calls.expireRunsBatch++; + return super.expireRunsBatch(...args); + } + + override findRun(...args: any[]): any { + this.calls.findRun++; + return super.findRun(...(args as [any, any, any])); + } + + override findRuns(...args: any[]): any { + this.calls.findRuns++; + return super.findRuns(...(args as [any, any])); + } + + override findLatestExecutionSnapshot( + ...args: Parameters + ) { + this.calls.findLatestExecutionSnapshot++; + return super.findLatestExecutionSnapshot(...args); + } + + override forWaitpointCompletion(...args: Parameters) { + this.calls.forWaitpointCompletion++; + return super.forWaitpointCompletion(...args); + } +} + +function createEngine( + prisma: PrismaClient, + redisOptions: any, + store?: RunStore, + extraQueueOptions?: Record +) { + return new RunEngine({ + prisma, + ...(store ? 
{ store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + processWorkerQueueDebounceMs: 50, + masterQueueConsumersDisabled: true, + ...extraQueueOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); +} + +const triggerDefaults = { + payload: "{}", + payloadType: "application/json" as const, + context: {}, + traceContext: {}, + isTest: false, + tags: [] as string[], + workerQueue: "main", +}; + +describe("TtlSystem store routing", () => { + containerTest( + "expireRun routes snapshot read + findRun + expire through the store and completes the waitpoint via the guard", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + // ttlSystem disabled so the batch path does not race the direct expireRun call. + const engine = createEngine(prisma, redisOptions, store, { + ttlSystem: { disabled: true }, + }); + + try { + const parentTask = "parent-task"; + const childTask = "child-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, [parentTask, childTask]); + + // Cap env concurrency at exactly 1 so the parent takes the only slot and the + // child stays PENDING/QUEUED (expireRun only expires PENDING, non-locked runs). 
+ await engine.runQueue.updateEnvConcurrencyLimits({ + ...authenticatedEnvironment, + maximumConcurrencyLimit: 1, + concurrencyLimitBurstFactor: new Decimal(1.0), + }); + + const parentRun = await engine.trigger( + { + ...triggerDefaults, + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier: parentTask, + traceId: "t12345", + spanId: "s12345", + queue: `task/${parentTask}`, + }, + prisma + ); + + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id, 10); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + blockingPopTimeoutSeconds: 1, + }); + expect(dequeued.length).toBe(1); + + const initialExecutionData = await engine.getRunExecutionData({ runId: parentRun.id }); + assertNonNullable(initialExecutionData); + await engine.startRunAttempt({ + runId: parentRun.id, + snapshotId: initialExecutionData.snapshot.id, + }); + + // Child run with a waitpoint resuming the parent. TTL is set but the batch + // path is disabled, so the child stays PENDING until we expire it directly. + const childRun = await engine.trigger( + { + ...triggerDefaults, + number: 2, + friendlyId: "run_c1234", + environment: authenticatedEnvironment, + taskIdentifier: childTask, + traceId: "t12346", + spanId: "s12346", + queue: `task/${childTask}`, + resumeParentOnCompletion: true, + parentTaskRunId: parentRun.id, + ttl: "60s", + }, + prisma + ); + + const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({ + where: { taskRunId: parentRun.id }, + include: { waitpoint: true }, + }); + assertNonNullable(runWaitpoint); + expect(runWaitpoint.waitpoint.type).toBe("RUN"); + expect(runWaitpoint.waitpoint.completedByTaskRunId).toBe(childRun.id); + + // Confirm the child is PENDING (the expireRun precondition) before expiring. 
+ const childBefore = await prisma.taskRun.findUniqueOrThrow({ where: { id: childRun.id } }); + expect(childBefore.status).toBe("PENDING"); + + store.calls.findLatestExecutionSnapshot = 0; + store.calls.findRun = 0; + store.calls.expireRun = 0; + store.calls.forWaitpointCompletion = 0; + + await engine.ttlSystem.expireRun({ runId: childRun.id }); + + expect(store.calls.findLatestExecutionSnapshot).toBeGreaterThanOrEqual(1); + expect(store.calls.findRun).toBeGreaterThanOrEqual(1); + expect(store.calls.expireRun).toBeGreaterThanOrEqual(1); + // The waitpoint-completion guard fired for the expireRun completion path. + expect(store.calls.forWaitpointCompletion).toBeGreaterThanOrEqual(1); + + const expiredChild = await prisma.taskRun.findUniqueOrThrow({ where: { id: childRun.id } }); + expect(expiredChild.status).toBe("EXPIRED"); + + const finishedSnapshots = await prisma.taskRunExecutionSnapshot.findMany({ + where: { runId: childRun.id, executionStatus: "FINISHED" }, + }); + expect(finishedSnapshots.length).toBeGreaterThanOrEqual(1); + + const waitpointAfter = await prisma.waitpoint.findFirstOrThrow({ + where: { id: runWaitpoint.waitpointId }, + }); + expect(waitpointAfter.status).toBe("COMPLETED"); + expect(waitpointAfter.outputIsError).toBe(true); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "expireRun with a caller tx still routes the snapshot read through the store", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + // ttlSystem disabled so the batch path does not race the direct expireRun call. 
+ const engine = createEngine(prisma, redisOptions, store, { + ttlSystem: { disabled: true }, + }); + + try { + const parentTask = "parent-task"; + const childTask = "child-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, [parentTask, childTask]); + + // Cap env concurrency at exactly 1 so the parent takes the only slot and the + // child stays PENDING/QUEUED (expireRun only expires PENDING, non-locked runs). + await engine.runQueue.updateEnvConcurrencyLimits({ + ...authenticatedEnvironment, + maximumConcurrencyLimit: 1, + concurrencyLimitBurstFactor: new Decimal(1.0), + }); + + const parentRun = await engine.trigger( + { + ...triggerDefaults, + number: 1, + friendlyId: "run_ptx12", + environment: authenticatedEnvironment, + taskIdentifier: parentTask, + traceId: "tptx12", + spanId: "sptx12", + queue: `task/${parentTask}`, + }, + prisma + ); + + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id, 10); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_ctx12", + workerQueue: "main", + blockingPopTimeoutSeconds: 1, + }); + expect(dequeued.length).toBe(1); + + const initialExecutionData = await engine.getRunExecutionData({ runId: parentRun.id }); + assertNonNullable(initialExecutionData); + await engine.startRunAttempt({ + runId: parentRun.id, + snapshotId: initialExecutionData.snapshot.id, + }); + + const childRun = await engine.trigger( + { + ...triggerDefaults, + number: 2, + friendlyId: "run_ctx12", + environment: authenticatedEnvironment, + taskIdentifier: childTask, + traceId: "tctx13", + spanId: "sctx13", + queue: `task/${childTask}`, + resumeParentOnCompletion: true, + parentTaskRunId: parentRun.id, + ttl: "60s", + }, + prisma + ); + + const childBefore = await prisma.taskRun.findUniqueOrThrow({ where: { id: childRun.id } }); + expect(childBefore.status).toBe("PENDING"); + + store.calls.findLatestExecutionSnapshot = 0; + store.calls.findRun = 0; + store.calls.expireRun = 0; + + // 
Pass a caller tx: the snapshot read must still route through the store (which + // resolves the owning DB), never read the caller's control-plane tx directly. + await engine.ttlSystem.expireRun({ runId: childRun.id, tx: prisma }); + + expect(store.calls.findLatestExecutionSnapshot).toBeGreaterThanOrEqual(1); + + const persisted = await prisma.taskRun.findUniqueOrThrow({ where: { id: childRun.id } }); + expect(persisted.status).toBe("EXPIRED"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "expireRunsBatch routes bulk fetch + bulk expire through the store", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = createEngine(prisma, redisOptions, store, { + ttlSystem: { disabled: true }, + }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + // Keep the runs queued (PENDING) so the batch can expire them. 
+ await engine.runQueue.updateEnvConcurrencyLimits({ + ...authenticatedEnvironment, + maximumConcurrencyLimit: 0, + }); + + const runs = await Promise.all( + [1, 2, 3].map((n) => + engine.trigger( + { + ...triggerDefaults, + number: n, + friendlyId: `run_batch${n}`, + environment: authenticatedEnvironment, + taskIdentifier, + traceId: `t_b${n}`, + spanId: `s_b${n}`, + queue: "task/test-task", + ttl: "60s", + }, + prisma + ) + ) + ); + + for (const run of runs) { + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("QUEUED"); + } + + store.calls.findRuns = 0; + store.calls.expireRunsBatch = 0; + + const runIds = runs.map((r) => r.id); + const result = await engine.ttlSystem.expireRunsBatch(runIds); + + expect(store.calls.findRuns).toBeGreaterThanOrEqual(1); + expect(store.calls.expireRunsBatch).toBeGreaterThanOrEqual(1); + + expect(result.expired.length).toBe(3); + expect(result.skipped.length).toBe(0); + + for (const run of runs) { + const dbRun = await prisma.taskRun.findUniqueOrThrow({ where: { id: run.id } }); + expect(dbRun.status).toBe("EXPIRED"); + } + } finally { + await engine.quit(); + } + } + ); + + containerTest("single-DB binds one client (passthrough)", async ({ prisma, redisOptions }) => { + // Default-store engine: an expire round-trip must persist on the one client. 
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = createEngine(prisma, redisOptions, undefined, { + ttlSystem: { disabled: true }, + }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + await engine.runQueue.updateEnvConcurrencyLimits({ + ...authenticatedEnvironment, + maximumConcurrencyLimit: 0, + }); + + const run = await engine.trigger( + { + ...triggerDefaults, + number: 1, + friendlyId: "run_passttl", + environment: authenticatedEnvironment, + taskIdentifier, + traceId: "t_passttl", + spanId: "s_passttl", + queue: "task/test-task", + ttl: "60s", + }, + prisma + ); + + await engine.ttlSystem.expireRun({ runId: run.id }); + + const dbRun = await prisma.taskRun.findUniqueOrThrow({ where: { id: run.id } }); + expect(dbRun.status).toBe("EXPIRED"); + } finally { + await engine.quit(); + } + }); + + heteroEngineTest( + "expireRun post-migration completion routes via the guard to the owning store", + async ({ prisma14, prisma17, redisOptions }) => { + // An expirable PENDING run + associatedWaitpoint is seeded on the NEW DB (PG17). + // expireRun (unchanged) must route its completion to NEW (waitpoint COMPLETED + + // run EXPIRED on PG17) and leave the LEGACY DB (PG14) untouched. + + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + + const residency = new Map(); + // forWaitpointCompletion has no run id, so default the router to NEW: the seeded + // run and its waitpoint both live on NEW. 
+ const router = createRouter(residency, newStore); + + const env14 = await setupAuthenticatedEnvironment(prisma14, "PRODUCTION"); + const env17 = await setupAuthenticatedEnvironment(prisma17, "PRODUCTION"); + + const engine = createEngine(prisma17, redisOptions, router, { + ttlSystem: { disabled: true }, + }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, env17, taskIdentifier); + + const runId = "run_hetero_ttl"; + const waitpointId = "wp_hetero_ttl"; + + // Create the run with an associated waitpoint + latest snapshot on NEW (PG17). + await newStore.createRun( + { + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: "run_ht1", + runtimeEnvironmentId: env17.id, + environmentType: "PRODUCTION", + organizationId: env17.organization.id, + projectId: env17.project.id, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: "t_ht", + spanId: "s_ht", + queue: "task/test-task", + workerQueue: "main", + isTest: false, + ttl: "60s", + queuedAt: new Date(), + }, + snapshot: { + engine: "V2", + executionStatus: "QUEUED", + description: "Run was created", + runStatus: "PENDING", + environmentId: env17.id, + environmentType: "PRODUCTION", + projectId: env17.project.id, + organizationId: env17.organization.id, + }, + associatedWaitpoint: { + id: waitpointId, + friendlyId: "wp_ht1", + type: "RUN", + status: "PENDING", + idempotencyKey: "idem_ht", + userProvidedIdempotencyKey: false, + projectId: env17.project.id, + environmentId: env17.id, + }, + }, + prisma17 + ); + residency.set(runId, newStore); + + // Sanity: nothing on LEGACY. + const legacyBefore = await prisma14.taskRun.findUnique({ where: { id: runId } }); + expect(legacyBefore).toBeNull(); + + await engine.ttlSystem.expireRun({ runId }); + + // Completion routed to NEW: run EXPIRED + waitpoint COMPLETED with error. 
+ const newRun = await prisma17.taskRun.findUniqueOrThrow({ where: { id: runId } }); + expect(newRun.status).toBe("EXPIRED"); + + const newWaitpoint = await prisma17.waitpoint.findUniqueOrThrow({ + where: { id: waitpointId }, + }); + expect(newWaitpoint.status).toBe("COMPLETED"); + expect(newWaitpoint.outputIsError).toBe(true); + + // LEGACY untouched. + const legacyAfter = await prisma14.taskRun.findUnique({ where: { id: runId } }); + expect(legacyAfter).toBeNull(); + const legacyWaitpoint = await prisma14.waitpoint.findUnique({ where: { id: waitpointId } }); + expect(legacyWaitpoint).toBeNull(); + } finally { + await engine.quit(); + } + } + ); +}); + +// A minimal two-store router implementing RunStore by delegating each method to the +// store that owns the run id. Methods with a runId first arg resolve by residency; +// bulk reads resolve by the first id; methods without a run id use the default store. +export function createRouter(residency: Map, defaultStore: RunStore): RunStore { + const resolveById = (runId: string): RunStore => residency.get(runId) ?? 
defaultStore; + + const handler: ProxyHandler = { + get(_target, prop: string | symbol) { + switch (prop) { + case "rescheduleRun": + case "enqueueDelayedRun": + case "expireRun": + case "findLatestExecutionSnapshot": + case "startAttempt": + case "completeAttemptSuccess": + case "recordRetryOutcome": + case "requeueRun": + case "recordBulkActionMembership": + case "cancelRun": + case "failRunPermanently": + case "lockRunToWorker": + case "parkPendingVersion": + case "promotePendingVersionRuns": + case "suspendForCheckpoint": + case "resumeFromCheckpoint": + case "rewriteDebouncedRun": + case "updateMetadata": + case "pushTags": + case "pushRealtimeStream": + case "findSnapshotCompletedWaitpointIds": + return (...args: any[]) => (resolveById(args[0]) as any)[prop](...args); + + case "findRun": + case "findRunOrThrow": + return (...args: any[]) => { + const where = args[0]; + const id = where && typeof where.id === "string" ? where.id : undefined; + const store = id ? resolveById(id) : defaultStore; + return (store as any)[prop](...args); + }; + + case "expireRunsBatch": + return (...args: any[]) => { + const runIds: string[] = args[0] ?? []; + const store = runIds.length > 0 ? resolveById(runIds[0]) : defaultStore; + return (store as any)[prop](...args); + }; + + case "findRuns": + return (...args: any[]) => { + const inList = args[0]?.where?.id?.in as string[] | undefined; + const store = inList && inList.length > 0 ? 
resolveById(inList[0]) : defaultStore; + return (store as any)[prop](...args); + }; + + default: + return (...args: any[]) => (defaultStore as any)[prop](...args); + } + }, + }; + + return new Proxy({} as RunStore, handler); +} diff --git a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts index ac5950b884a..d85a89a07e5 100644 --- a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts @@ -25,7 +25,7 @@ export class TtlSystem { async expireRun({ runId, tx }: { runId: string; tx?: PrismaClientOrTransaction }) { const prisma = tx ?? this.$.prisma; await this.$.runLock.lock("expireRun", [runId], async () => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); + const snapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); //if we're executing then we won't expire the run if (isExecuting(snapshot.executionStatus)) { @@ -81,6 +81,7 @@ export class TtlSystem { { select: { id: true, + runtimeEnvironmentId: true, spanId: true, ttl: true, updatedAt: true, @@ -89,13 +90,6 @@ export class TtlSystem { id: true, }, }, - runtimeEnvironment: { - select: { - organizationId: true, - projectId: true, - id: true, - }, - }, createdAt: true, completedAt: true, taskEventStore: true, @@ -107,13 +101,9 @@ export class TtlSystem { prisma ); - await this.$.runQueue.acknowledgeMessage( - updatedRun.runtimeEnvironment.organizationId, - runId, - { - removeFromWorkerQueue: true, - } - ); + await this.$.runQueue.acknowledgeMessage(snapshot.organizationId, runId, { + removeFromWorkerQueue: true, + }); // Complete the waitpoint if it exists (runs without waiting parents have no waitpoint) if (updatedRun.associatedWaitpoint) { @@ -126,9 +116,9 @@ export class TtlSystem { this.$.eventBus.emit("runExpired", { run: updatedRun, time: new Date(), - organization: { id: updatedRun.runtimeEnvironment.organizationId }, - 
project: { id: updatedRun.runtimeEnvironment.projectId }, - environment: { id: updatedRun.runtimeEnvironment.id }, + organization: { id: snapshot.organizationId }, + project: { id: snapshot.projectId }, + environment: { id: snapshot.environmentId }, }); }); } @@ -249,7 +239,6 @@ export class TtlSystem { return; } - // Emit event this.$.eventBus.emit("runExpired", { run: { id: run.id, diff --git a/internal-packages/run-engine/src/engine/systems/waitpointSystem.test.ts b/internal-packages/run-engine/src/engine/systems/waitpointSystem.test.ts new file mode 100644 index 00000000000..1a34bd07842 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/waitpointSystem.test.ts @@ -0,0 +1,1565 @@ +import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { PostgresRunStore, RoutingRunStore } from "@internal/run-store"; +import { BatchId, generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; +import type { Prisma, PrismaClient } from "@trigger.dev/database"; +import { describe, expect } from "vitest"; +import { setTimeout } from "node:timers/promises"; +import { RunEngine } from "../index.js"; +import { UnclassifiableWaitpointId } from "../errors.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "../tests/setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +/** + * Real (non-mock) PostgresRunStore subclass that records which routed waitpoint/edge + * methods the engine actually calls, then delegates to super over the real container. + * This proves the create/block writes route through this.$.runStore. 
+ */ +class CountingRunStore extends PostgresRunStore { + calls: string[] = []; + + override async upsertWaitpoint( + args: Prisma.SelectSubset, + tx?: any + ): Promise> { + this.calls.push("upsertWaitpoint"); + return super.upsertWaitpoint(args, tx); + } + override async createWaitpoint( + args: Prisma.SelectSubset, + tx?: any + ): Promise> { + this.calls.push("createWaitpoint"); + return super.createWaitpoint(args, tx); + } + override async findWaitpoint( + args: Prisma.SelectSubset, + client?: any + ): Promise | null> { + this.calls.push("findWaitpoint"); + return super.findWaitpoint(args, client); + } + override async updateWaitpoint( + args: Prisma.SelectSubset, + tx?: any + ): Promise> { + this.calls.push("updateWaitpoint"); + return super.updateWaitpoint(args, tx); + } + override async blockRunWithWaitpointEdges( + params: Parameters[0] + ): Promise { + this.calls.push("blockRunWithWaitpointEdges"); + return super.blockRunWithWaitpointEdges(params); + } + override async countPendingWaitpoints(waitpointIds: string[], client?: any): Promise { + this.calls.push("countPendingWaitpoints"); + return super.countPendingWaitpoints(waitpointIds, client); + } + override async deleteManyTaskRunWaitpoints( + args: Prisma.TaskRunWaitpointDeleteManyArgs, + tx?: any + ): Promise { + this.calls.push("deleteManyTaskRunWaitpoints"); + return super.deleteManyTaskRunWaitpoints(args, tx); + } + + // The residency store-selection guard. It is the FIRST statement of completeWaitpoint, + // so counting it directly observes "the guard fired" before any completion DB step. + // The single-store super returns `this`, so the SAME store keeps recording downstream. 
+ override forWaitpointCompletion( + waitpointId: string, + context: Parameters[1] + ): PostgresRunStore { + this.calls.push("forWaitpointCompletion"); + return super.forWaitpointCompletion(waitpointId, context) as PostgresRunStore; + } + override async updateManyWaitpoints( + args: Prisma.WaitpointUpdateManyArgs, + tx?: any + ): Promise { + this.calls.push("updateManyWaitpoints"); + return super.updateManyWaitpoints(args, tx); + } + override async findManyTaskRunWaitpoints( + args: Prisma.SelectSubset, + client?: any + ): Promise[]> { + this.calls.push("findManyTaskRunWaitpoints"); + return super.findManyTaskRunWaitpoints(args, client); + } +} + +function buildEngine(prisma: PrismaClient, redisOptions: any, store?: PostgresRunStore) { + return new RunEngine({ + prisma, + ...(store ? { store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); +} + +async function triggerExecutingRun( + engine: RunEngine, + prisma: PrismaClient, + authenticatedEnvironment: Awaited>, + taskIdentifier: string, + friendlyId: string, + spanId: string +) { + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId, + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `t-${spanId}`, + spanId, + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: `consumer-${spanId}`, + 
workerQueue: "main", + }); + await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + return run; +} + +describe("WaitpointSystem create/block write routing", () => { + // DATETIME create routes the (env, idempotencyKey) upsert through the store. + containerTest("DATETIME create routes through the store", async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const completedAfter = new Date(Date.now() + 60_000); + const { waitpoint } = await engine.createDateTimeWaitpoint({ + projectId: env.projectId, + environmentId: env.id, + completedAfter, + }); + + expect(store.calls).toContain("upsertWaitpoint"); + + const row = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(row?.type).toBe("DATETIME"); + expect(row?.environmentId).toBe(env.id); + } finally { + await engine.quit(); + } + }); + + // MANUAL create routes through the store. 
+ containerTest("MANUAL create routes through the store", async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + timeout: new Date(Date.now() + 60_000), + }); + + expect(store.calls).toContain("upsertWaitpoint"); + + const row = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(row?.type).toBe("MANUAL"); + } finally { + await engine.quit(); + } + }); + + // Block routes the CTE + the separate pending check through the store (two + // distinct calls in order), writes exactly one TaskRunWaitpoint + one edge, and the + // ON CONFLICT DO NOTHING idempotency holds on a repeat block. + containerTest( + "block routes the CTE + pending check through the store", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const run = await triggerExecutingRun( + engine, + prisma, + env, + "test-task-c", + "run_c1234", + "sc1234" + ); + + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + }); + + store.calls.length = 0; + + await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint.id, + projectId: env.projectId, + organizationId: env.organizationId, + }); + + const blockIdx = store.calls.indexOf("blockRunWithWaitpointEdges"); + const pendingIdx = store.calls.indexOf("countPendingWaitpoints"); + expect(blockIdx).toBeGreaterThanOrEqual(0); + expect(pendingIdx).toBeGreaterThan(blockIdx); + + const trws = await prisma.taskRunWaitpoint.findMany({ where: { taskRunId: 
run.id } }); + expect(trws).toHaveLength(1); + expect(trws[0].waitpointId).toBe(waitpoint.id); + + const connections = await prisma.$queryRaw<{ count: bigint }[]>` + SELECT COUNT(*) as count FROM "_WaitpointRunConnections" + WHERE "A" = ${run.id} AND "B" = ${waitpoint.id}`; + expect(Number(connections[0].count)).toBe(1); + + const execData = await engine.getRunExecutionData({ runId: run.id }); + expect(execData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Re-block with the same waitpoint. The _WaitpointRunConnections edge has a (A,B) + // unique key, so ON CONFLICT DO NOTHING keeps it at exactly one row across repeats — + // that is the idempotency the routed CTE preserves. (TaskRunWaitpoint's unique key is + // (taskRunId, waitpointId, batchIndex); with a NULL batchIndex NULLs never conflict, + // so its row count is not the idempotency signal here — matching today's behavior.) + await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint.id, + projectId: env.projectId, + organizationId: env.organizationId, + }); + + const connectionsAfter = await prisma.$queryRaw<{ count: bigint }[]>` + SELECT COUNT(*) as count FROM "_WaitpointRunConnections" + WHERE "A" = ${run.id} AND "B" = ${waitpoint.id}`; + expect(Number(connectionsAfter[0].count)).toBe(1); + } finally { + await engine.quit(); + } + } + ); + + // clearBlockingWaitpoints routes the delete through the store. 
+ containerTest("clearBlockingWaitpoints routes the delete", async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const run = await triggerExecutingRun( + engine, + prisma, + env, + "test-task-d", + "run_d1234", + "sd1234" + ); + + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + }); + await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint.id, + projectId: env.projectId, + organizationId: env.organizationId, + }); + + store.calls.length = 0; + const count = await engine.waitpointSystem.clearBlockingWaitpoints({ runId: run.id }); + + expect(store.calls).toContain("deleteManyTaskRunWaitpoints"); + expect(count).toBe(1); + const remaining = await prisma.taskRunWaitpoint.findMany({ where: { taskRunId: run.id } }); + expect(remaining).toHaveLength(0); + } finally { + await engine.quit(); + } + }); + + // Single-DB binds one client (passthrough), proven by BEHAVIOR — a create + block + // + clear round-trip resolves on the one configured client. The default-store engine has no + // accessible store.prisma member, so we never assert store.prisma === prisma. + containerTest( + "single-DB passthrough: round-trip resolves on the one client (default store)", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + // No `store` option => engine constructs its own default PostgresRunStore over `prisma`. 
+ const engine = buildEngine(prisma, redisOptions); + + try { + const run = await triggerExecutingRun( + engine, + prisma, + env, + "test-task-e", + "run_e1234", + "se1234" + ); + + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + }); + await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint.id, + projectId: env.projectId, + organizationId: env.organizationId, + }); + + const blocked = await prisma.taskRunWaitpoint.findMany({ where: { taskRunId: run.id } }); + expect(blocked).toHaveLength(1); + + const cleared = await engine.waitpointSystem.clearBlockingWaitpoints({ runId: run.id }); + expect(cleared).toBe(1); + const after = await prisma.taskRunWaitpoint.findMany({ where: { taskRunId: run.id } }); + expect(after).toHaveLength(0); + } finally { + await engine.quit(); + } + } + ); + + // Idempotency-key reuse returns the same waitpoint (single authority) — exactly one + // row, no duplicate — for both MANUAL and DATETIME. 
+ containerTest( + "idempotency-key reuse returns the same waitpoint (single authority)", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const future = new Date(Date.now() + 60 * 60_000); + + const first = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + idempotencyKey: "ik-manual", + idempotencyKeyExpiresAt: future, + }); + const second = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + idempotencyKey: "ik-manual", + idempotencyKeyExpiresAt: future, + }); + + expect(second.isCached).toBe(true); + expect(second.waitpoint.id).toBe(first.waitpoint.id); + const manualRows = await prisma.waitpoint.findMany({ + where: { environmentId: env.id, idempotencyKey: "ik-manual" }, + }); + expect(manualRows).toHaveLength(1); + + const firstDt = await engine.createDateTimeWaitpoint({ + projectId: env.projectId, + environmentId: env.id, + completedAfter: future, + idempotencyKey: "ik-datetime", + idempotencyKeyExpiresAt: future, + }); + const secondDt = await engine.createDateTimeWaitpoint({ + projectId: env.projectId, + environmentId: env.id, + completedAfter: future, + idempotencyKey: "ik-datetime", + idempotencyKeyExpiresAt: future, + }); + + expect(secondDt.isCached).toBe(true); + expect(secondDt.waitpoint.id).toBe(firstDt.waitpoint.id); + const dtRows = await prisma.waitpoint.findMany({ + where: { environmentId: env.id, idempotencyKey: "ik-datetime" }, + }); + expect(dtRows).toHaveLength(1); + } finally { + await engine.quit(); + } + } + ); + + // An expired idempotency key rotates (read-legacy-first: find -> update -> + // upsert all through the authority store) rather than duplicating. 
  containerTest(
    "expired idempotency key rotates through the store (find -> update -> upsert)",
    async ({ prisma, redisOptions }) => {
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
      const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
      const engine = buildEngine(prisma, redisOptions, store);

      try {
        // Create the first waitpoint with a key whose expiry is already one minute in the past.
        const past = new Date(Date.now() - 60_000);
        const first = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
          idempotencyKey: "ik-expire",
          idempotencyKeyExpiresAt: past,
        });

        // Only the store calls made by the second create are of interest.
        store.calls.length = 0;

        const second = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
          idempotencyKey: "ik-expire",
          idempotencyKeyExpiresAt: new Date(Date.now() + 60 * 60_000),
        });

        // read-legacy-first then rotate then upsert, all via the authority store.
        const findIdx = store.calls.indexOf("findWaitpoint");
        const updateIdx = store.calls.indexOf("updateWaitpoint");
        const upsertIdx = store.calls.indexOf("upsertWaitpoint");
        expect(findIdx).toBeGreaterThanOrEqual(0);
        expect(updateIdx).toBeGreaterThan(findIdx);
        expect(upsertIdx).toBeGreaterThan(updateIdx);

        // The original row had its key rotated to a fresh nanoid + inactiveIdempotencyKey set.
        const original = await prisma.waitpoint.findFirst({ where: { id: first.waitpoint.id } });
        expect(original?.idempotencyKey).not.toBe("ik-expire");
        expect(original?.inactiveIdempotencyKey).toBe("ik-expire");

        // A NEW waitpoint now holds the key.
        expect(second.isCached).toBe(false);
        expect(second.waitpoint.id).not.toBe(first.waitpoint.id);
        const active = await prisma.waitpoint.findFirst({
          where: { environmentId: env.id, idempotencyKey: "ik-expire" },
        });
        expect(active?.id).toBe(second.waitpoint.id);
      } finally {
        await engine.quit();
      }
    }
  );

  // The P2002 retry loop in createManualWaitpoint survives store routing — a single
  // unique-constraint conflict resolves to one row without throwing.
  // NOTE(review): the keyed creates below run sequentially, so nothing visible here forces a
  // P2002; what the assertions pin is single-row reuse of the key and that the routed
  // upsertWaitpoint was reached at least once.
  containerTest(
    "createManualWaitpoint P2002 retry loop preserved through routing",
    async ({ prisma, redisOptions }) => {
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");

      // The P2002 retry loop wraps the routed upsertWaitpoint. Count attempts to prove the
      // loop drives the store call, and assert that a reused key resolves to a single row
      // (the unique-constraint path the loop guards) without throwing.
      let upsertAttempts = 0;
      // Counting subclass: bumps the counter, then defers to the real implementation.
      // NOTE(review): the generic parameters on this override look truncated in this view —
      // confirm them against PostgresRunStore.upsertWaitpoint before editing the signature.
      class RacyStore extends PostgresRunStore {
        override async upsertWaitpoint(
          args: Prisma.SelectSubset,
          tx?: any
        ): Promise> {
          upsertAttempts++;
          return super.upsertWaitpoint(args, tx);
        }
      }
      const store = new RacyStore({ prisma, readOnlyPrisma: prisma });
      const engine = buildEngine(prisma, redisOptions, store);

      try {
        // A create without an idempotency key must still succeed through the subclass.
        const a = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
        });
        expect(a.waitpoint.id).toBeDefined();

        // Two creates with the same unexpired key must resolve to the same waitpoint.
        const k1 = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
          idempotencyKey: "ik-race",
          idempotencyKeyExpiresAt: new Date(Date.now() + 60 * 60_000),
        });
        const k2 = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
          idempotencyKey: "ik-race",
          idempotencyKeyExpiresAt: new Date(Date.now() + 60 * 60_000),
        });
        expect(k2.waitpoint.id).toBe(k1.waitpoint.id);
        const rows = await prisma.waitpoint.findMany({
          where: { environmentId: env.id, idempotencyKey: "ik-race" },
        });
        expect(rows).toHaveLength(1);
        expect(upsertAttempts).toBeGreaterThan(0);
      } finally {
        await engine.quit();
      }
    }
  );

  // DATETIME/MANUAL create round-trips byte-identically across both Postgres major versions
  // via the store's upsertWaitpoint/findWaitpoint (the methods this unit's create paths delegate to).
  heteroPostgresTest(
    "create round-trip is byte-identical across both Postgres major versions",
    async ({ prisma14, prisma17 }) => {
      // Fixed timestamp so both databases are fed exactly the same value.
      const future = new Date("2024-03-03T00:00:00.000Z");

      // Runs the same upsert + read-back scenario against one database; `suffix` keeps the
      // ids and friendlyIds distinct per database.
      const run = async (prisma: PrismaClient, suffix: string) => {
        const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma });
        const env = await seedHeteroEnvironment(prisma, suffix);

        const datetime = await store.upsertWaitpoint({
          where: {
            environmentId_idempotencyKey: { environmentId: env.id, idempotencyKey: "dt-key" },
          },
          create: {
            id: `wp_dt_${suffix}`,
            friendlyId: `waitpoint_dt_${suffix}`,
            type: "DATETIME",
            idempotencyKey: "dt-key",
            idempotencyKeyExpiresAt: future,
            userProvidedIdempotencyKey: true,
            environmentId: env.id,
            projectId: env.projectId,
            completedAfter: future,
          },
          update: {},
        });
        const manual = await store.upsertWaitpoint({
          where: {
            environmentId_idempotencyKey: { environmentId: env.id, idempotencyKey: "mn-key" },
          },
          create: {
            id: `wp_mn_${suffix}`,
            friendlyId: `waitpoint_mn_${suffix}`,
            type: "MANUAL",
            idempotencyKey: "mn-key",
            idempotencyKeyExpiresAt: future,
            userProvidedIdempotencyKey: true,
            environmentId: env.id,
            projectId: env.projectId,
            completedAfter: future,
            tags: ["alpha", "beta"],
          },
          update: {},
        });

        // Read both rows back through the store so the comparison covers the read path too.
        return {
          dt: await store.findWaitpoint({ where: { id: datetime.id } }),
          mn: await store.findWaitpoint({ where: { id: manual.id } }),
        };
      };

      const r14 = await run(prisma14, "i14");
      const r17 = await run(prisma17, "i17");

      // Compare after normalizeWaitpoint, then spot-check tags and the completedAfter instant.
      expect(normalizeWaitpoint(r14.dt!)).toEqual(normalizeWaitpoint(r17.dt!));
      expect(normalizeWaitpoint(r14.mn!)).toEqual(normalizeWaitpoint(r17.mn!));
      expect(r14.mn!.tags).toEqual(r17.mn!.tags);
      expect(r14.dt!.completedAfter?.toISOString()).toBe(r17.dt!.completedAfter?.toISOString());
    }
  );

  // The block CTE round-trips across both Postgres major versions — after the same block call
  // is issued twice, both versions hold two TaskRunWaitpoint rows and a single deduplicated
  // edge, and the separate pending count reads 1 pre-complete.
  heteroPostgresTest(
    "block CTE round-trips identically across both Postgres major versions",
    async ({ prisma14, prisma17 }) => {
      // Seeds a run and a PENDING manual waitpoint on one database, blocks the run on the
      // waitpoint twice, and returns the resulting row counts.
      const run = async (prisma: PrismaClient, suffix: string) => {
        const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma });
        const env = await seedHeteroEnvironment(prisma, suffix);
        const runId = `run_block_${suffix}`;
        await prisma.taskRun.create({
          data: {
            id: runId,
            engine: "V2",
            status: "PENDING",
            friendlyId: `run_friendly_${suffix}`,
            runtimeEnvironmentId: env.id,
            organizationId: env.organizationId,
            projectId: env.projectId,
            taskIdentifier: "my-task",
            payload: "{}",
            payloadType: "application/json",
            queue: "task/my-task",
            traceId: `trace_${suffix}`,
            spanId: `span_${suffix}`,
          },
        });
        const wId = `wp_block_${suffix}`;
        await prisma.waitpoint.create({
          data: {
            id: wId,
            friendlyId: `waitpoint_block_${suffix}`,
            type: "MANUAL",
            status: "PENDING",
            idempotencyKey: `idem_${wId}`,
            userProvidedIdempotencyKey: false,
            projectId: env.projectId,
            environmentId: env.id,
          },
        });

        await store.blockRunWithWaitpointEdges({
          runId,
          waitpointIds: [wId],
          projectId: env.projectId,
        });
        // Repeat: the _WaitpointRunConnections (A,B) unique key keeps the edge at one row on
        // both versions (ON CONFLICT DO NOTHING idempotency for the historical connection).
        await store.blockRunWithWaitpointEdges({
          runId,
          waitpointIds: [wId],
          projectId: env.projectId,
        });

        const trws = await prisma.taskRunWaitpoint.findMany({ where: { taskRunId: runId } });
        // The join table is counted with raw SQL; COUNT(*) is typed as bigint here, hence the
        // Number(...) conversion below.
        const edges = await prisma.$queryRaw<{ count: bigint }[]>`
          SELECT COUNT(*) as count FROM "_WaitpointRunConnections"
          WHERE "A" = ${runId} AND "B" = ${wId}`;
        const pending = await store.countPendingWaitpoints([wId]);
        return { trwCount: trws.length, edgeCount: Number(edges[0].count), pending };
      };

      const r14 = await run(prisma14, "j14");
      const r17 = await run(prisma17, "j17");

      // Identical across versions: TaskRunWaitpoint inserts (NULL batchIndex never conflicts,
      // so two rows on both), one deduped edge on both, pending count of 1 pre-complete on both.
      expect(r14).toEqual({ trwCount: 2, edgeCount: 1, pending: 1 });
      expect(r17).toEqual({ trwCount: 2, edgeCount: 1, pending: 1 });
    }
  );
});

// Triggers a child run that resumes a parent (so the engine attaches a RUN-type
// associatedWaitpoint to the child). Returns both runs; the child is left QUEUED.
/**
 * Test fixture: brings a parent run to an executing attempt, then triggers a child run
 * that is linked back to it.
 *
 * Steps, in order: registers a background worker for both tasks, triggers the parent,
 * waits 500ms, dequeues from the "main" worker queue, starts the parent's attempt, and
 * finally triggers the child with `resumeParentOnCompletion: true` and `parentTaskRunId`
 * set to the parent. The child is only triggered here — it is never dequeued or started.
 *
 * @param engine - Engine under test.
 * @param prisma - Client passed through to both `engine.trigger` calls.
 * @param authenticatedEnvironment - Environment both runs are triggered in.
 * @param parentTask - Task identifier for the parent run (also names its queue).
 * @param childTask - Task identifier for the child run (also names its queue).
 * @param suffix - Unique tag folded into friendly ids, trace/span ids and the consumer id.
 * @returns The triggered `{ parentRun, childRun }`.
 * @throws Error when no message is available for the parent on the "main" worker queue,
 *   instead of failing later with an opaque property access on `undefined`.
 */
async function triggerChildResumingParent(
  engine: RunEngine,
  prisma: PrismaClient,
  authenticatedEnvironment: Awaited<ReturnType<typeof setupAuthenticatedEnvironment>>,
  parentTask: string,
  childTask: string,
  suffix: string
) {
  await setupBackgroundWorker(engine, authenticatedEnvironment, [parentTask, childTask]);

  const parentRun = await engine.trigger(
    {
      number: 1,
      friendlyId: `run_p${suffix}`,
      environment: authenticatedEnvironment,
      taskIdentifier: parentTask,
      payload: "{}",
      payloadType: "application/json",
      context: {},
      traceContext: {},
      traceId: `tp-${suffix}`,
      spanId: `sp-${suffix}`,
      workerQueue: "main",
      queue: `task/${parentTask}`,
      isTest: false,
      tags: [],
    },
    prisma
  );

  // Give the engine a moment to make the parent available on the worker queue.
  await setTimeout(500);
  const dequeuedParent = await engine.dequeueFromWorkerQueue({
    consumerId: `consumer-p-${suffix}`,
    workerQueue: "main",
  });

  // Fail loudly and descriptively if the parent was not dequeued; every caller relies on
  // the parent having a started attempt before the child is triggered.
  const parentMessage = dequeuedParent[0];
  if (!parentMessage) {
    throw new Error(
      `triggerChildResumingParent: parent run run_p${suffix} was not dequeued from the "main" worker queue`
    );
  }

  await engine.startRunAttempt({
    runId: parentMessage.run.id,
    snapshotId: parentMessage.snapshot.id,
  });

  const childRun = await engine.trigger(
    {
      number: 1,
      friendlyId: `run_c${suffix}`,
      environment: authenticatedEnvironment,
      taskIdentifier: childTask,
      payload: "{}",
      payloadType: "application/json",
      context: {},
      traceContext: {},
      traceId: `tc-${suffix}`,
      spanId: `sc-${suffix}`,
      workerQueue: "main",
      queue: `task/${childTask}`,
      isTest: false,
      tags: [],
      // Links the child back to the parent so its completion resumes the parent.
      resumeParentOnCompletion: true,
      parentTaskRunId: parentRun.id,
    },
    prisma
  );

  return { parentRun, childRun };
}

/**
 * Completion fan-out + residency store-selection guard.
 *
 * completeWaitpoint's FIRST statement is the residency guard
 * (this.$.runStore.forWaitpointCompletion). Every route that reaches
 * completeWaitpoint therefore records exactly one `forWaitpointCompletion`
 * BEFORE its `updateManyWaitpoints`.
 * A missed unblock route in production is a
 * silent permanent run hang, so Group 1 enumerates EVERY route exhaustively —
 * the 7 callers plus the 2 in-file wrappers — and proves the
 * guard fires on each, not a representative sample.
 */
describe("WaitpointSystem completion fan-out + residency store-selection guard", () => {
  // Asserts the guard fired before (or, for async/enqueued completions, no later
  // than) the first completion DB write on the same store.
  // `calls` is the recorded store-call log; both entries must be present, and the first
  // "forWaitpointCompletion" must not come after the first "updateManyWaitpoints".
  function expectGuardFiredBeforeUpdate(calls: string[]) {
    const guardIdx = calls.indexOf("forWaitpointCompletion");
    const updateIdx = calls.indexOf("updateManyWaitpoints");
    expect(guardIdx).toBeGreaterThanOrEqual(0);
    expect(updateIdx).toBeGreaterThanOrEqual(0);
    expect(guardIdx).toBeLessThanOrEqual(updateIdx);
  }

  // ----- Group 1: EXHAUSTIVE route enumeration -----

  // PUBLIC entry: engine.completeWaitpoint on a MANUAL waitpoint. The canonical
  // assertion: guard fires strictly before the update on the synchronous public path.
  containerTest(
    "guard fires on the public completeWaitpoint route",
    async ({ prisma, redisOptions }) => {
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
      const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
      const engine = buildEngine(prisma, redisOptions, store);

      try {
        const { waitpoint } = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
        });

        // Drop the calls recorded during setup; index 0 below refers to the completion only.
        store.calls.length = 0;
        await engine.completeWaitpoint({ id: waitpoint.id });

        expect(store.calls).toContain("forWaitpointCompletion");
        const guardIdx = store.calls.indexOf("forWaitpointCompletion");
        const updateIdx = store.calls.indexOf("updateManyWaitpoints");
        // Synchronous route: guard is strictly the first DB step.
        expect(guardIdx).toBe(0);
        expect(updateIdx).toBeGreaterThan(guardIdx);
      } finally {
        await engine.quit();
      }
    }
  );

  // finishWaitpoint redis job (DATETIME): a DATETIME waitpoint with a
  // near-future completedAfter is completed by the worker firing finishWaitpoint.
  containerTest(
    "guard fires on the finishWaitpoint redis-job route (DATETIME)",
    async ({ prisma, redisOptions }) => {
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
      const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
      const engine = buildEngine(prisma, redisOptions, store);

      try {
        // One second ahead: late enough that the call log is reset before completion runs.
        const completedAfter = new Date(Date.now() + 1_000);
        const { waitpoint } = await engine.createDateTimeWaitpoint({
          projectId: env.projectId,
          environmentId: env.id,
          completedAfter,
        });

        store.calls.length = 0;

        // Let the finishWaitpoint job fire from the worker (it is scheduled at completedAfter).
        // The test never calls completeWaitpoint itself; it only polls the row's status.
        await vi.waitFor(
          async () => {
            const row = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } });
            expect(row?.status).toBe("COMPLETED");
          },
          { timeout: 15_000, interval: 100 }
        );

        // The completion went through the guard, driven by the redis job (not by us).
        expect(store.calls).toContain("forWaitpointCompletion");
        expectGuardFiredBeforeUpdate(store.calls);
      } finally {
        await engine.quit();
      }
    }
  );

  // batch (#tryCompleteBatch): a created batch whose runs are all final has its
  // BATCH waitpoint completed by batchSystem.
  containerTest(
    "guard fires on the batch completion route (#tryCompleteBatch)",
    async ({ prisma, redisOptions }) => {
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
      const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
      const engine = buildEngine(prisma, redisOptions, store);

      try {
        const parentTask = "parent-task-r3";
        const childTask = "child-task-r3";
        // Register BOTH tasks once so the child is in the latest worker version.
        await setupBackgroundWorker(engine, env, [parentTask, childTask]);

        // Parent run, executing (inline so we don't re-register a parent-only worker).
        const parentRun = await engine.trigger(
          {
            number: 1,
            friendlyId: "run_r3p",
            environment: env,
            taskIdentifier: parentTask,
            payload: "{}",
            payloadType: "application/json",
            context: {},
            traceContext: {},
            traceId: "tr3p",
            spanId: "sr3p",
            workerQueue: "main",
            queue: `task/${parentTask}`,
            isTest: false,
            tags: [],
          },
          prisma
        );
        // Short pause before dequeuing the parent from the "main" worker queue.
        await setTimeout(500);
        const dequeuedParent = await engine.dequeueFromWorkerQueue({
          consumerId: "consumer-r3p",
          workerQueue: "main",
        });
        await engine.startRunAttempt({
          runId: dequeuedParent[0].run.id,
          snapshotId: dequeuedParent[0].snapshot.id,
        });

        // A v2 batch with a single run; block the parent on it (creates the BATCH waitpoint).
        const { id: batchId, friendlyId: batchFriendlyId } = BatchId.generate();
        await prisma.batchTaskRun.create({
          data: {
            id: batchId,
            friendlyId: batchFriendlyId,
            runtimeEnvironmentId: env.id,
            status: "PROCESSING",
            runCount: 1,
            successfulRunCount: 1,
            batchVersion: "runengine:v2",
          },
        });

        await engine.blockRunWithCreatedBatch({
          runId: parentRun.id,
          batchId,
          environmentId: env.id,
          projectId: env.projectId,
          organizationId: env.organizationId,
        });

        // A child belonging to the batch, driven to a final (COMPLETED_SUCCESSFULLY) status.
        const childRun = await engine.trigger(
          {
            number: 1,
            friendlyId: "run_r3c",
            environment: env,
            taskIdentifier: childTask,
            payload: "{}",
            payloadType: "application/json",
            context: {},
            traceContext: {},
            traceId: "tr3c",
            spanId: "sr3c",
            workerQueue: "main",
            queue: `task/${childTask}`,
            isTest: false,
            tags: [],
            batch: { id: batchId, index: 0 },
          },
          prisma
        );
        await setTimeout(500);
        const dequeuedChild = await engine.dequeueFromWorkerQueue({
          consumerId: "consumer-r3c",
          workerQueue: "main",
        });
        const childAttempt = await engine.startRunAttempt({
          runId: dequeuedChild[0].run.id,
          snapshotId: dequeuedChild[0].snapshot.id,
        });
        await engine.completeRunAttempt({
          runId: childRun.id,
          snapshotId: childAttempt.snapshot.id,
          completion: {
            id: childRun.id,
            ok: true,
            output: '{"ok":true}',
            outputType: "application/json",
          },
        });

        // Everything above is setup; only the batch completion below is measured.
        store.calls.length = 0;

        // Synchronous batch-completion entry calls #tryCompleteBatch -> completeWaitpoint.
        await engine.batchSystem.performCompleteBatch({ batchId });

        expect(store.calls).toContain("forWaitpointCompletion");
        expectGuardFiredBeforeUpdate(store.calls);
      } finally {
        await engine.quit();
      }
    }
  );

  // runAttemptSystem success: a child that resumes its parent is completed
  // successfully, completing its associatedWaitpoint via runAttemptSystem.
+ containerTest("guard fires on the runAttempt success route", async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const { childRun } = await triggerChildResumingParent( + engine, + prisma, + env, + "parent-task-r4", + "child-task-r4", + "r4" + ); + + await setTimeout(500); + const dequeuedChild = await engine.dequeueFromWorkerQueue({ + consumerId: "consumer-r4c", + workerQueue: "main", + }); + const childAttempt = await engine.startRunAttempt({ + runId: dequeuedChild[0].run.id, + snapshotId: dequeuedChild[0].snapshot.id, + }); + + store.calls.length = 0; + await engine.completeRunAttempt({ + runId: childRun.id, + snapshotId: childAttempt.snapshot.id, + completion: { + id: childRun.id, + ok: true, + output: '{"foo":"bar"}', + outputType: "application/json", + }, + }); + + expect(store.calls).toContain("forWaitpointCompletion"); + expectGuardFiredBeforeUpdate(store.calls); + } finally { + await engine.quit(); + } + }); + + // runAttemptSystem cancel: cancelling a still-queued child finishes it + // immediately and completes its associatedWaitpoint via runAttemptSystem. 
+ containerTest("guard fires on the runAttempt cancel route", async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const { childRun } = await triggerChildResumingParent( + engine, + prisma, + env, + "parent-task-r5", + "child-task-r5", + "r5" + ); + + const associatedWaitpoint = await prisma.waitpoint.findFirstOrThrow({ + where: { completedByTaskRunId: childRun.id }, + }); + expect(associatedWaitpoint.status).toBe("PENDING"); + + store.calls.length = 0; + const result = await engine.cancelRun({ + runId: childRun.id, + completedAt: new Date(), + reason: "Cancelled by the user", + }); + expect(result.snapshot.executionStatus).toBe("FINISHED"); + + expect(store.calls).toContain("forWaitpointCompletion"); + expectGuardFiredBeforeUpdate(store.calls); + + const completed = await prisma.waitpoint.findUniqueOrThrow({ + where: { id: associatedWaitpoint.id }, + }); + expect(completed.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + }); + + // runAttemptSystem failure: a child that resumes its parent is failed + // permanently, completing its associatedWaitpoint (with an error output). 
+ containerTest("guard fires on the runAttempt failure route", async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const { childRun } = await triggerChildResumingParent( + engine, + prisma, + env, + "parent-task-r6", + "child-task-r6", + "r6" + ); + + const associatedWaitpoint = await prisma.waitpoint.findFirstOrThrow({ + where: { completedByTaskRunId: childRun.id }, + }); + + await setTimeout(500); + const dequeuedChild = await engine.dequeueFromWorkerQueue({ + consumerId: "consumer-r6c", + workerQueue: "main", + }); + const childAttempt = await engine.startRunAttempt({ + runId: dequeuedChild[0].run.id, + snapshotId: dequeuedChild[0].snapshot.id, + }); + + store.calls.length = 0; + // A non-retryable failure finishes the child permanently and completes its waitpoint. + await engine.completeRunAttempt({ + runId: childRun.id, + snapshotId: childAttempt.snapshot.id, + completion: { + ok: false, + id: childRun.id, + error: { + type: "INTERNAL_ERROR" as const, + code: "TASK_RUN_CRASHED" as const, + message: "boom", + }, + }, + }); + + expect(store.calls).toContain("forWaitpointCompletion"); + expectGuardFiredBeforeUpdate(store.calls); + + const completed = await prisma.waitpoint.findUniqueOrThrow({ + where: { id: associatedWaitpoint.id }, + }); + expect(completed.status).toBe("COMPLETED"); + expect(completed.outputIsError).toBe(true); + } finally { + await engine.quit(); + } + }); + + // ttlSystem: a still-PENDING child that resumes its parent is expired by TTL, + // completing its associatedWaitpoint via ttlSystem. 
+ containerTest("guard fires on the ttlSystem expiry route", async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const { childRun } = await triggerChildResumingParent( + engine, + prisma, + env, + "parent-task-r7", + "child-task-r7", + "r7" + ); + + const associatedWaitpoint = await prisma.waitpoint.findFirstOrThrow({ + where: { completedByTaskRunId: childRun.id }, + }); + // The child is still QUEUED/PENDING (never dequeued), so the per-run expireRun + // path will expire it and complete the associated waitpoint. + expect(associatedWaitpoint.status).toBe("PENDING"); + + store.calls.length = 0; + await engine.ttlSystem.expireRun({ runId: childRun.id }); + + expect(store.calls).toContain("forWaitpointCompletion"); + expectGuardFiredBeforeUpdate(store.calls); + + const completed = await prisma.waitpoint.findUniqueOrThrow({ + where: { id: associatedWaitpoint.id }, + }); + expect(completed.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + }); + + // in-file wrapper blockRunAndCompleteWaitpoint: blocks then immediately + // completes, so the guard must fire on the inner completeWaitpoint call. 
  containerTest(
    "guard fires on the blockRunAndCompleteWaitpoint wrapper",
    async ({ prisma, redisOptions }) => {
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
      const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
      const engine = buildEngine(prisma, redisOptions, store);

      try {
        const run = await triggerExecutingRun(engine, prisma, env, "task-w1", "run_w1", "sw1");

        const { waitpoint } = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
        });

        // Measure only the wrapper call: clear whatever setup recorded.
        store.calls.length = 0;
        await engine.waitpointSystem.blockRunAndCompleteWaitpoint({
          runId: run.id,
          waitpointId: waitpoint.id,
          output: { value: '{"done":true}', type: "application/json", isError: false },
          projectId: env.projectId,
          organizationId: env.organizationId,
        });

        expect(store.calls).toContain("forWaitpointCompletion");
        expectGuardFiredBeforeUpdate(store.calls);

        // The waitpoint handed to the wrapper must end up COMPLETED in the database.
        const completed = await prisma.waitpoint.findUniqueOrThrow({ where: { id: waitpoint.id } });
        expect(completed.status).toBe("COMPLETED");
      } finally {
        await engine.quit();
      }
    }
  );

  // in-file wrapper getOrCreateRunWaitpoint FINISHED branch: a run that has
  // already FINISHED (per its snapshot) and has no associatedWaitpoint gets a
  // freshly-created waitpoint that is immediately completed.
  containerTest(
    "guard fires on the getOrCreateRunWaitpoint FINISHED branch",
    async ({ prisma, redisOptions }) => {
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
      const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
      const engine = buildEngine(prisma, redisOptions, store);

      try {
        // A standalone run (no parent), driven to FINISHED with no associatedWaitpoint.
        const run = await triggerExecutingRun(engine, prisma, env, "task-w2", "run_w2", "sw2");
        // The current snapshot id is needed to complete the attempt.
        const execData = await engine.getRunExecutionData({ runId: run.id });
        assertNonNullable(execData);
        await engine.completeRunAttempt({
          runId: run.id,
          snapshotId: execData.snapshot.id,
          completion: {
            id: run.id,
            ok: true,
            output: '{"r":1}',
            outputType: "application/json",
          },
        });

        // Precondition for the branch under test: the run's snapshot reads FINISHED.
        const finished = await engine.getRunExecutionData({ runId: run.id });
        assertNonNullable(finished);
        expect(finished.snapshot.executionStatus).toBe("FINISHED");

        store.calls.length = 0;
        // FINISHED + no associatedWaitpoint => getOrCreateRunWaitpoint creates one and
        // immediately completes it (the FINISHED branch).
        const waitpoint = await engine.waitpointSystem.getOrCreateRunWaitpoint({
          runId: run.id,
          projectId: env.projectId,
          environmentId: env.id,
        });

        expect(store.calls).toContain("forWaitpointCompletion");
        expectGuardFiredBeforeUpdate(store.calls);
        expect(waitpoint.status).toBe("COMPLETED");
      } finally {
        await engine.quit();
      }
    }
  );

  // ----- Group 2: fan-out -----

  // A completed waitpoint blocking >=2 runs unblocks every blocked run and reads the
  // blocked TaskRunWaitpoint set exactly once on the completion.
+ containerTest( + "a completed waitpoint fans out to every blocked run", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const runA = await triggerExecutingRun( + engine, + prisma, + env, + "task-fan-a", + "run_fanA", + "sfanA" + ); + const runB = await triggerExecutingRun( + engine, + prisma, + env, + "task-fan-b", + "run_fanB", + "sfanB" + ); + + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + }); + + for (const runId of [runA.id, runB.id]) { + await engine.blockRunWithWaitpoint({ + runId, + waitpoints: waitpoint.id, + projectId: env.projectId, + organizationId: env.organizationId, + }); + } + + store.calls.length = 0; + await engine.completeWaitpoint({ id: waitpoint.id }); + + // The completion reads the blocked TaskRunWaitpoint set once (fan-out source). + expect(store.calls.filter((c) => c === "findManyTaskRunWaitpoints")).toHaveLength(1); + + // Both blocked runs resume (one continueRunIfUnblocked job each). + await vi.waitFor( + async () => { + for (const runId of [runA.id, runB.id]) { + const data = await engine.getRunExecutionData({ runId }); + expect(data?.snapshot.executionStatus).toBe("EXECUTING"); + } + }, + { timeout: 15_000, interval: 100 } + ); + } finally { + await engine.quit(); + } + } + ); + + // ----- Group 3: continueRunIfUnblocked routing ----- + + // continueRunIfUnblocked routes both the blocking-waitpoints read and the clear + // through the run-store seam, and transitions the run out of the blocked state. 
+ containerTest( + "continueRunIfUnblocked routes the blocking read + clear through the store", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = buildEngine(prisma, redisOptions, store); + + try { + const run = await triggerExecutingRun( + engine, + prisma, + env, + "task-cont", + "run_cont", + "scont" + ); + + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: env.id, + projectId: env.projectId, + }); + await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint.id, + projectId: env.projectId, + organizationId: env.organizationId, + }); + + // Complete the waitpoint so the blocking edge is COMPLETED but still present. + await engine.completeWaitpoint({ id: waitpoint.id }); + + store.calls.length = 0; + const result = await engine.waitpointSystem.continueRunIfUnblocked({ runId: run.id }); + + expect(store.calls).toContain("findManyTaskRunWaitpoints"); + expect(store.calls).toContain("deleteManyTaskRunWaitpoints"); + // The blocking read precedes the clear. + expect(store.calls.indexOf("findManyTaskRunWaitpoints")).toBeLessThan( + store.calls.indexOf("deleteManyTaskRunWaitpoints") + ); + + // The run left the blocked state. + const data = await engine.getRunExecutionData({ runId: run.id }); + expect(["EXECUTING", "QUEUED"]).toContain(data?.snapshot.executionStatus); + expect(result.status).not.toBe("blocked"); + } finally { + await engine.quit(); + } + } + ); + + // ----- Group 4: single-DB no-op (the classifier is NEVER consulted) ----- + + // The default single store's forWaitpointCompletion returns `this` without calling + // the classifier. Proven BY BEHAVIOR: a normal round-trip resolves on the one + // client, and an UNCLASSIFIABLE id does NOT throw UnclassifiableWaitpointId — it + // simply finds no row and throws the ordinary "Waitpoint not found". 
  containerTest(
    "single-DB completion never consults the classifier (default store)",
    async ({ prisma, redisOptions }) => {
      // No `store` => engine builds its own default PostgresRunStore over `prisma`.
      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
      const engine = buildEngine(prisma, redisOptions);

      try {
        const run = await triggerExecutingRun(
          engine,
          prisma,
          env,
          "task-noop",
          "run_noop",
          "snoop"
        );

        const { waitpoint } = await engine.createManualWaitpoint({
          environmentId: env.id,
          projectId: env.projectId,
        });
        await engine.blockRunWithWaitpoint({
          runId: run.id,
          waitpoints: waitpoint.id,
          projectId: env.projectId,
          organizationId: env.organizationId,
        });

        // (a) the completion reads back exactly as written, COMPLETED, on the one client.
        const completed = await engine.completeWaitpoint({
          id: waitpoint.id,
          output: { value: '{"v":1}', type: "application/json", isError: false },
        });
        expect(completed.status).toBe("COMPLETED");
        const row = await prisma.waitpoint.findUniqueOrThrow({ where: { id: waitpoint.id } });
        expect(row.status).toBe("COMPLETED");
        expect(row.output).toBe('{"v":1}');

        // (b) the continued run's blocking edges clear and its snapshot transitions.
        // Polled, because the test does not drive the continuation itself.
        await vi.waitFor(
          async () => {
            const trws = await prisma.taskRunWaitpoint.findMany({ where: { taskRunId: run.id } });
            expect(trws).toHaveLength(0);
            const data = await engine.getRunExecutionData({ runId: run.id });
            expect(data?.snapshot.executionStatus).toBe("EXECUTING");
          },
          { timeout: 15_000, interval: 100 }
        );

        // (c) the load-bearing no-op: an unclassifiable id (length 26) must NOT throw
        // UnclassifiableWaitpointId under the default single store — the classifier is
        // never consulted. It finds no PENDING row, the re-read fails, and the ordinary
        // "Waitpoint not found" surfaces instead.
        // The completion is invoked twice: once per assertion on the rejection.
        const unclassifiableId = "waitpoint_" + "a".repeat(26);
        await expect(engine.completeWaitpoint({ id: unclassifiableId })).rejects.toThrow(
          "Waitpoint not found"
        );
        await expect(engine.completeWaitpoint({ id: unclassifiableId })).rejects.not.toBeInstanceOf(
          UnclassifiableWaitpointId
        );
      } finally {
        await engine.quit();
      }
    }
  );

  // ----- Group 5: cross-seam two-store + loud-ambiguity + pinning -----

  // Cross-seam completion applied to the OWNING store: a ksuid waitpoint resides on the dedicated
  // run-ops (NEW) DB, a cuid waitpoint on the legacy/control-plane DB. Driving the completion at the
  // store seam (forWaitpointCompletion -> updateManyWaitpoints, as the engine does) must apply each
  // completion to its owning store only.
  heteroPostgresTest(
    "cross-seam completion lands on the owning store only",
    async ({ prisma14, prisma17 }) => {
      // prisma14 backs the legacy store and prisma17 the new one; the router sits over both.
      const legacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
      const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
      const router = new RoutingRunStore({ new: newStore, legacy });

      const envLegacy = await seedHeteroEnvironment(prisma14, "csl");
      const envNew = await seedHeteroEnvironment(prisma17, "csn");

      // 27-char body => ksuid => NEW (dedicated run-ops DB); 25-char body => cuid => LEGACY.
      const ksuidId = "waitpoint_" + "a".repeat(27);
      const cuidId = "waitpoint_" + "b".repeat(25);

      // Seed each waitpoint directly on the database that should own it.
      await prisma17.waitpoint.create({
        data: {
          id: ksuidId,
          friendlyId: "waitpoint_ks",
          type: "MANUAL",
          status: "PENDING",
          idempotencyKey: `idem_${ksuidId}`,
          userProvidedIdempotencyKey: false,
          projectId: envNew.projectId,
          environmentId: envNew.id,
        },
      });
      await prisma14.waitpoint.create({
        data: {
          id: cuidId,
          friendlyId: "waitpoint_cu",
          type: "MANUAL",
          status: "PENDING",
          idempotencyKey: `idem_${cuidId}`,
          userProvidedIdempotencyKey: false,
          projectId: envLegacy.projectId,
          environmentId: envLegacy.id,
        },
      });

      // Ask the router which store owns each id, then apply the completion on that store.
      const completedAt = new Date();
      const ownerKsuid = await router.forWaitpointCompletion(ksuidId, { routeKind: "MANUAL" });
      await ownerKsuid.updateManyWaitpoints({
        where: { id: ksuidId, status: "PENDING" },
        data: { status: "COMPLETED", completedAt },
      });
      const ownerCuid = await router.forWaitpointCompletion(cuidId, { routeKind: "MANUAL" });
      await ownerCuid.updateManyWaitpoints({
        where: { id: cuidId, status: "PENDING" },
        data: { status: "COMPLETED", completedAt },
      });

      // ksuid completed on the dedicated run-ops (NEW) DB only.
      expect((await prisma17.waitpoint.findUniqueOrThrow({ where: { id: ksuidId } })).status).toBe(
        "COMPLETED"
      );
      expect(await prisma14.waitpoint.findUnique({ where: { id: ksuidId } })).toBeNull();
      // cuid completed on the legacy DB only.
      expect((await prisma14.waitpoint.findUniqueOrThrow({ where: { id: cuidId } })).status).toBe(
        "COMPLETED"
      );
      expect(await prisma17.waitpoint.findUnique({ where: { id: cuidId } })).toBeNull();
    }
  );

  // Ambiguity resolution: forWaitpointCompletion safe-classifies an id matching neither cuid nor
  // ksuid to LEGACY, then probes both DBs.
With no row anywhere it resolves to the LEGACY fallback + // rather than throwing — the loud-failure contract lives at the engine seam (completeWaitpoint + // re-reads and surfaces "Waitpoint not found"). The residency probe made this method async. + heteroPostgresTest( + "cross-seam forWaitpointCompletion safe-classifies an ambiguous id to legacy", + async ({ prisma14, prisma17 }) => { + const legacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy }); + + const ambiguousId = "waitpoint_" + "a".repeat(26); + const handle = await router.forWaitpointCompletion(ambiguousId, { routeKind: "MANUAL" }); + expect(handle).toBe(legacy); + } + ); + + // Pinning proof: a cross-tree-idempotency completion of a ksuid + // (NEW residency) waitpoint pins to the LEGACY store. + heteroPostgresTest( + "cross-seam cross-tree-idempotency completion pins to legacy", + async ({ prisma14, prisma17 }) => { + const legacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy }); + + // pin is DRIVEN via explicit context at the store seam; the engine completeWaitpoint entry cannot derive it — the organic cross-tree-idempotency pin is applied at the webapp idempotency caller. 
+ const ksuidId = "waitpoint_" + "a".repeat(27); + const handle = await router.forWaitpointCompletion(ksuidId, { + routeKind: "IDEMPOTENCY_REUSE", + isCrossTreeIdempotency: true, + }); + expect(handle).toBe(legacy); + } + ); +}); + +// --- hetero helpers (mirror run-store/src/runOpsStore.waitpoints.test.ts) --- + +async function seedHeteroEnvironment(prisma: PrismaClient, slugSuffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${slugSuffix}`, + slug: `project-${slugSuffix}`, + externalRef: `proj_${slugSuffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${slugSuffix}`, + pkApiKey: `pk_dev_${slugSuffix}`, + shortcode: `short_${slugSuffix}`, + }, + }); + return { + id: environment.id, + projectId: project.id, + organizationId: organization.id, + }; +} + +// Strip per-DB / prisma-managed fields so rows compare field-for-field across versions. 
+function normalizeWaitpoint(row: Record) { + const r = { ...row }; + delete r.id; + delete r.friendlyId; + delete r.createdAt; + delete r.updatedAt; + delete r.projectId; + delete r.environmentId; + return r; +} diff --git a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts index 85e1334ef40..aacbd505d23 100644 --- a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts @@ -9,8 +9,10 @@ import { TaskRunExecutionStatus, Waitpoint, } from "@trigger.dev/database"; +import { RunStore } from "@internal/run-store"; import { assertNever } from "assert-never"; import { nanoid } from "nanoid"; +import { UnclassifiableWaitpointId } from "../errors.js"; import { sendNotificationToWorker } from "../eventBus.js"; import { EnqueueSystem } from "./enqueueSystem.js"; import { ExecutionSnapshotSystem, getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; @@ -57,12 +59,18 @@ export class WaitpointSystem { runId: string; tx?: PrismaClientOrTransaction; }) { - const prisma = tx ?? this.$.prisma; - const deleted = await prisma.taskRunWaitpoint.deleteMany({ - where: { - taskRunId: runId, - }, - }); + // A tx pins a specific client and must not be re-routed through the store. + const deleted = tx + ? await tx.taskRunWaitpoint.deleteMany({ + where: { + taskRunId: runId, + }, + }) + : await this.$.runStore.deleteManyTaskRunWaitpoints({ + where: { + taskRunId: runId, + }, + }); return deleted.count; } @@ -80,9 +88,24 @@ export class WaitpointSystem { isError: boolean; }; }): Promise { + // Residency store-selection guard. completeWaitpoint arrives with only + // (waitpointId, output) — no run id — so the owning run-ops store is selected + // by the waitpoint's own residency. In single-DB this is the one store + // (no classification). An unclassifiable id throws loud — never default-routes. 
+ let store: RunStore; + try { + store = await this.$.runStore.forWaitpointCompletion(id, { routeKind: "MANUAL" }); + } catch (error) { + this.$.logger.error("completeWaitpoint: unclassifiable waitpointId", { + waitpointId: id, + error, + }); + throw new UnclassifiableWaitpointId(id, { cause: error }); + } + // 1. Complete the Waitpoint (if not completed) const [updateError, updateResult] = await tryCatch( - this.$.prisma.waitpoint.updateMany({ + store.updateManyWaitpoints({ where: { id, status: "PENDING" }, data: { status: "COMPLETED", @@ -106,7 +129,10 @@ export class WaitpointSystem { ); } - const waitpoint = await this.$.prisma.waitpoint.findFirst({ + // Re-read the just-written row from the RESOLVED store's PRIMARY: the replica (findWaitpoint's + // default) can miss it under lag → false "not found" → the parent hangs; this.$.prisma would + // instead hit the wrong DB. findWaitpointOnPrimary reads the owning store's primary. + const waitpoint = await store.findWaitpointOnPrimary({ where: { id }, }); @@ -122,11 +148,17 @@ export class WaitpointSystem { throw new Error("Waitpoint not completed"); } - // 2. Find the TaskRuns blocked by this waitpoint - const affectedTaskRuns = await this.$.prisma.taskRunWaitpoint.findMany({ - where: { waitpointId: id }, - select: { taskRunId: true, spanIdToComplete: true, createdAt: true }, - }); + // 2. Find the TaskRuns blocked by this waitpoint. The edge (TaskRunWaitpoint) co-locates + // with its RUN, not this token, so it can live on the OTHER run-ops DB: read via the router + // (which fans the waitpointId lookup across both DBs) rather than the token's own `store`, + // or a cross-DB blocked run is never found and hangs forever. 
+ const affectedTaskRuns = await this.$.runStore.findManyTaskRunWaitpoints( + { + where: { waitpointId: id }, + select: { taskRunId: true, spanIdToComplete: true, createdAt: true }, + }, + this.$.prisma + ); if (affectedTaskRuns.length === 0) { this.$.logger.debug(`completeWaitpoint: no TaskRunWaitpoints found for waitpoint`, { @@ -178,6 +210,7 @@ export class WaitpointSystem { * If you pass an `idempotencyKey`, the waitpoint will be created only if it doesn't already exist. */ async createDateTimeWaitpoint({ + runId, projectId, environmentId, completedAfter, @@ -185,6 +218,7 @@ export class WaitpointSystem { idempotencyKeyExpiresAt, tx, }: { + runId?: string; projectId: string; environmentId: string; completedAfter: Date; @@ -192,15 +226,33 @@ export class WaitpointSystem { idempotencyKeyExpiresAt?: Date; tx?: PrismaClientOrTransaction; }) { - const prisma = tx ?? this.$.prisma; - + // Co-location invariant: a DATETIME wait waitpoint lives on the same run-ops DB as the run that + // blocks on it (so the block edge's local `Waitpoint` join resolves and completion/resume stay + // local). The minted waitpoint id is always a cuid, so without `coLocateWithRunId` the upsert + // would always route to LEGACY and a ksuid run on NEW would hang. The (env,idempotencyKey) dedup + // is within the owning run/tree (co-resident on one DB), so the dedup probe + rotation target the + // SAME store. With no run id (a standalone token has no owning run yet) the lookup falls back to + // a cross-DB NEW-then-LEGACY scan and the upsert routes by id-shape. A caller-supplied tx pins a + // client (same physical DB as the control-plane tx → LEGACY), so it stays on direct prisma. + const colocate = runId ? { coLocateWithRunId: runId } : undefined; const existingWaitpoint = idempotencyKey - ? await prisma.waitpoint.findFirst({ - where: { - environmentId, - idempotencyKey, - }, - }) + ? tx + ? 
await tx.waitpoint.findFirst({ + where: { + environmentId, + idempotencyKey, + }, + }) + : await this.$.runStore.findWaitpoint( + { + where: { + environmentId, + idempotencyKey, + }, + }, + undefined, + colocate + ) : undefined; if (existingWaitpoint) { @@ -210,7 +262,7 @@ export class WaitpointSystem { ) { //the idempotency key has expired //remove the waitpoint idempotencyKey - await prisma.waitpoint.update({ + const rotateArgs = { where: { id: existingWaitpoint.id, }, @@ -218,7 +270,12 @@ export class WaitpointSystem { idempotencyKey: nanoid(24), inactiveIdempotencyKey: existingWaitpoint.idempotencyKey, }, - }); + }; + if (tx) { + await tx.waitpoint.update(rotateArgs); + } else { + await this.$.runStore.updateWaitpoint(rotateArgs, undefined, colocate); + } //let it fall through to create a new waitpoint } else { @@ -226,7 +283,7 @@ export class WaitpointSystem { } } - const waitpoint = await prisma.waitpoint.upsert({ + const upsertArgs = { where: { environmentId_idempotencyKey: { environmentId, @@ -235,7 +292,7 @@ export class WaitpointSystem { }, create: { ...WaitpointId.generate(), - type: "DATETIME", + type: "DATETIME" as const, idempotencyKey: idempotencyKey ?? nanoid(24), idempotencyKeyExpiresAt, userProvidedIdempotencyKey: !!idempotencyKey, @@ -244,7 +301,10 @@ export class WaitpointSystem { completedAfter, }, update: {}, - }); + }; + const waitpoint = tx + ? await tx.waitpoint.upsert(upsertArgs) + : await this.$.runStore.upsertWaitpoint(upsertArgs, undefined, colocate); await this.$.worker.enqueue({ id: `finishWaitpoint.${waitpoint.id}`, @@ -260,6 +320,7 @@ export class WaitpointSystem { * If you pass an `idempotencyKey` and it already exists, it will return the existing waitpoint. 
*/ async createManualWaitpoint({ + runId, environmentId, projectId, idempotencyKey, @@ -267,6 +328,7 @@ export class WaitpointSystem { timeout, tags, }: { + runId?: string; environmentId: string; projectId: string; idempotencyKey?: string; @@ -274,13 +336,23 @@ export class WaitpointSystem { timeout?: Date; tags?: string[]; }): Promise<{ waitpoint: Waitpoint; isCached: boolean }> { + // Co-location invariant (see createDateTimeWaitpoint): when a `runId` is supplied the waitpoint + // co-locates with that run's DB and the (env,idempotencyKey) dedup is per-run (co-resident). A + // standalone token (api.v1.waitpoints.tokens.ts) passes no run id — it is created without an + // owner, blocked later by whichever run waits on it (possibly cross-DB, resolved by the + // run-co-resident block edge + completion fan-out), so it routes by id-shape and dedups cross-DB. No tx here. + const colocate = runId ? { coLocateWithRunId: runId } : undefined; const existingWaitpoint = idempotencyKey - ? await this.$.prisma.waitpoint.findFirst({ - where: { - environmentId, - idempotencyKey, + ? 
await this.$.runStore.findWaitpoint( + { + where: { + environmentId, + idempotencyKey, + }, }, - }) + undefined, + colocate + ) : undefined; if (existingWaitpoint) { @@ -290,15 +362,19 @@ export class WaitpointSystem { ) { //the idempotency key has expired //remove the waitpoint idempotencyKey - await this.$.prisma.waitpoint.update({ - where: { - id: existingWaitpoint.id, - }, - data: { - idempotencyKey: nanoid(24), - inactiveIdempotencyKey: existingWaitpoint.idempotencyKey, + await this.$.runStore.updateWaitpoint( + { + where: { + id: existingWaitpoint.id, + }, + data: { + idempotencyKey: nanoid(24), + inactiveIdempotencyKey: existingWaitpoint.idempotencyKey, + }, }, - }); + undefined, + colocate + ); //let it fall through to create a new waitpoint } else { @@ -311,26 +387,30 @@ export class WaitpointSystem { while (attempts < maxRetries) { try { - const waitpoint = await this.$.prisma.waitpoint.upsert({ - where: { - environmentId_idempotencyKey: { - environmentId, + const waitpoint = await this.$.runStore.upsertWaitpoint( + { + where: { + environmentId_idempotencyKey: { + environmentId, + idempotencyKey: idempotencyKey ?? nanoid(24), + }, + }, + create: { + ...WaitpointId.generate(), + type: "MANUAL", idempotencyKey: idempotencyKey ?? nanoid(24), + idempotencyKeyExpiresAt, + userProvidedIdempotencyKey: !!idempotencyKey, + environmentId, + projectId, + completedAfter: timeout, + tags, }, + update: {}, }, - create: { - ...WaitpointId.generate(), - type: "MANUAL", - idempotencyKey: idempotencyKey ?? nanoid(24), - idempotencyKeyExpiresAt, - userProvidedIdempotencyKey: !!idempotencyKey, - environmentId, - projectId, - completedAfter: timeout, - tags, - }, - update: {}, - }); + undefined, + colocate + ); //schedule the timeout if (timeout) { @@ -367,21 +447,20 @@ export class WaitpointSystem { /** * Prevents a run from continuing until the waitpoint is completed. * - * This method uses two separate SQL statements intentionally: - * - * 1. 
A CTE that INSERTs TaskRunWaitpoint rows (blocking connections) and - * _WaitpointRunConnections rows (historical connections). + * The block edge is written via the run-ops store, routed by the owning run id so it co-resides + * with the run (`blockRunWithWaitpointEdges`). It is NOT pinned to the caller's control-plane tx: + * doing so joined `Waitpoint` on the wrong DB for a run whose waitpoint lives on the run-ops DB, + * wrote 0 edges, and silently never suspended the parent. Like `blockRunWithCreatedBatch`, this is + * a routed, run-co-resident write rather than part of the control-plane trigger tx — there is no + * cross-DB transaction. The edge write is idempotent (ON CONFLICT DO NOTHING) and the snapshot + * transition is re-derivable, so a crash between the two leaves no corruption: a retry re-writes + * the same edge and re-checks the pending count. * - * 2. A separate SELECT that checks if any of the requested waitpoints are still PENDING. - * - * These MUST be separate statements because of PostgreSQL MVCC in READ COMMITTED isolation: - * each statement gets its own snapshot. If a concurrent `completeWaitpoint` commits between - * the CTE starting and finishing, the CTE's snapshot won't see the COMPLETED status. By using - * a separate SELECT, we get a fresh snapshot that reflects the latest committed state. - * - * The pending check queries ALL requested waitpoint IDs (not just the ones actually inserted - * by the CTE). This is intentional: if a TaskRunWaitpoint row already existed (ON CONFLICT - * DO NOTHING skipped the insert), a still-PENDING waitpoint should still count as blocking. + * The pending check is a SEPARATE store call (not folded into the edge write) on purpose: under + * PostgreSQL READ COMMITTED each statement gets its own snapshot, so if a concurrent + * `completeWaitpoint` commits between the edge write and the check, this fresh query still sees the + * COMPLETED status. 
It queries ALL requested waitpoint IDs (not just the ones inserted): a row + * that already existed (ON CONFLICT skipped the insert) but is still PENDING must still block. */ async blockRunWithWaitpoint({ runId, @@ -413,51 +492,29 @@ export class WaitpointSystem { let $waitpoints = typeof waitpoints === "string" ? [waitpoints] : waitpoints; return await this.$.runLock.lock("blockRunWithWaitpoint", [runId], async () => { - let snapshot: TaskRunExecutionSnapshot = await getLatestExecutionSnapshot(prisma, runId); - - // Insert the blocking connections and the historical run connections. - // We use a CTE to do both inserts atomically. Data-modifying CTEs are - // always executed regardless of whether they're referenced in the outer query. - await prisma.$queryRaw` - WITH inserted AS ( - INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex") - SELECT - gen_random_uuid(), - ${runId}, - w.id, - ${projectId}, - NOW(), - NOW(), - ${spanIdToComplete ?? null}, - ${batch?.id ?? null}, - ${batch?.index ?? null} - FROM "Waitpoint" w - WHERE w.id IN (${Prisma.join($waitpoints)}) - ON CONFLICT DO NOTHING - RETURNING "waitpointId" - ), - connected_runs AS ( - INSERT INTO "_WaitpointRunConnections" ("A", "B") - SELECT ${runId}, w.id - FROM "Waitpoint" w - WHERE w.id IN (${Prisma.join($waitpoints)}) - ON CONFLICT DO NOTHING - ) - SELECT COUNT(*) FROM inserted`; - - // Check if the run is actually blocked using a separate query. - // This MUST be a separate statement from the CTE above because in READ COMMITTED - // isolation, each statement gets its own snapshot. The CTE's snapshot is taken when - // it starts, so if a concurrent completeWaitpoint commits during the CTE, the CTE - // won't see it. This fresh query gets a new snapshot that reflects the latest commits. 
- const pendingCheck = await prisma.$queryRaw<{ pending_count: bigint }[]>` - SELECT COUNT(*) as pending_count - FROM "Waitpoint" - WHERE id IN (${Prisma.join($waitpoints)}) - AND status = 'PENDING' - `; - - const isRunBlocked = Number(pendingCheck.at(0)?.pending_count ?? 0) > 0; + let snapshot: TaskRunExecutionSnapshot = await getLatestExecutionSnapshot( + prisma, + runId, + this.$.runStore + ); + + // Insert the blocking + historical connections via the run-ops store, routed by the owning + // run id so the edge co-resides with the run. Never pinned to the caller's control-plane tx: + // that joined `Waitpoint` on the wrong DB and wrote 0 edges. The pending check stays a + // SEPARATE store call so it gets its own READ COMMITTED snapshot (see the doc comment above). + await this.$.runStore.blockRunWithWaitpointEdges({ + runId, + waitpointIds: $waitpoints, + projectId, + spanIdToComplete, + batchId: batch?.id, + batchIndex: batch?.index, + }); + + // Check if the run is actually blocked using a separate query (see above). + const pendingCount = await this.$.runStore.countPendingWaitpoints($waitpoints); + + const isRunBlocked = pendingCount > 0; let newStatus: TaskRunExecutionStatus = "SUSPENDED"; if ( @@ -544,7 +601,6 @@ export class WaitpointSystem { timeout, spanIdToComplete, batch, - tx, }: { runId: string; waitpoints: string | string[]; @@ -552,41 +608,20 @@ export class WaitpointSystem { timeout?: Date; spanIdToComplete?: string; batch: { id: string; index?: number }; - tx?: PrismaClientOrTransaction; }): Promise { - const prisma = tx ?? this.$.prisma; const $waitpoints = typeof waitpoints === "string" ? [waitpoints] : waitpoints; - // Insert the blocking connections and the historical run connections. - // No lock needed: ON CONFLICT DO NOTHING makes concurrent inserts safe, - // and the parent snapshot is already EXECUTING_WITH_WAITPOINTS from - // blockRunWithCreatedBatch. 
- await prisma.$queryRaw` - WITH inserted AS ( - INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex") - SELECT - gen_random_uuid(), - ${runId}, - w.id, - ${projectId}, - NOW(), - NOW(), - ${spanIdToComplete ?? null}, - ${batch.id}, - ${batch.index ?? null} - FROM "Waitpoint" w - WHERE w.id IN (${Prisma.join($waitpoints)}) - ON CONFLICT DO NOTHING - RETURNING "waitpointId" - ), - connected_runs AS ( - INSERT INTO "_WaitpointRunConnections" ("A", "B") - SELECT ${runId}, w.id - FROM "Waitpoint" w - WHERE w.id IN (${Prisma.join($waitpoints)}) - ON CONFLICT DO NOTHING - ) - SELECT COUNT(*) FROM inserted`; + // Same routed edge write as blockRunWithWaitpoint, routed by the owning run id. No lock + // needed: ON CONFLICT DO NOTHING makes concurrent inserts safe, and the parent snapshot is + // already EXECUTING_WITH_WAITPOINTS from blockRunWithCreatedBatch. + await this.$.runStore.blockRunWithWaitpointEdges({ + runId, + waitpointIds: $waitpoints, + projectId, + spanIdToComplete, + batchId: batch.id, + batchIndex: batch.index, + }); // Schedule timeout jobs if needed if (timeout) { @@ -653,17 +688,20 @@ export class WaitpointSystem { return await this.$.runLock.lock("continueRunIfUnblocked", [runId], async () => { // 1. Get the any blocking waitpoints - const blockingWaitpoints = await this.$.prisma.taskRunWaitpoint.findMany({ - where: { taskRunId: runId }, - select: { - id: true, - batchId: true, - batchIndex: true, - waitpoint: { - select: { id: true, status: true, type: true, completedAfter: true }, + const blockingWaitpoints = await this.$.runStore.findManyTaskRunWaitpoints( + { + where: { taskRunId: runId }, + select: { + id: true, + batchId: true, + batchIndex: true, + waitpoint: { + select: { id: true, status: true, type: true, completedAfter: true }, + }, }, }, - }); + this.$.prisma + ); // 2. 
There are blockers still, so do nothing if (blockingWaitpoints.some((w) => w.waitpoint.status !== "COMPLETED")) { @@ -678,25 +716,12 @@ export class WaitpointSystem { }; } - // 3. Get the run with environment + // 3. Get the run (run-ops scalars) + resolve its environment via the control-plane resolver, + // so the run-ops DB can split without a cross-provider join. const run = await this.$.runStore.findRun( { id: runId, }, - { - include: { - runtimeEnvironment: { - select: { - id: true, - type: true, - maximumConcurrencyLimit: true, - concurrencyLimitBurstFactor: true, - project: { select: { id: true } }, - organization: { select: { id: true } }, - }, - }, - }, - }, this.$.prisma ); @@ -707,8 +732,20 @@ export class WaitpointSystem { throw new Error(`continueRunIfUnblocked: run not found: ${runId}`); } + const env = await this.$.controlPlaneResolver.resolveEnv(run.runtimeEnvironmentId); + + if (!env) { + this.$.logger.error(`continueRunIfUnblocked: environment not found`, { + runId, + runtimeEnvironmentId: run.runtimeEnvironmentId, + }); + throw new Error( + `continueRunIfUnblocked: environment not found: ${run.runtimeEnvironmentId}` + ); + } + //4. Continue the run whether it's executing or not - const snapshot = await getLatestExecutionSnapshot(this.$.prisma, runId); + const snapshot = await getLatestExecutionSnapshot(this.$.prisma, runId, this.$.runStore); switch (snapshot.executionStatus) { case "RUN_CREATED": { @@ -867,7 +904,7 @@ export class WaitpointSystem { //this prioritizes dequeuing waiting runs over new runs const newSnapshot = await this.enqueueSystem.enqueueRun({ run, - env: run.runtimeEnvironment, + env, snapshot: { status: "QUEUED", description: "Run was QUEUED, because all waitpoints are completed", @@ -895,7 +932,7 @@ export class WaitpointSystem { if (blockingWaitpoints.length > 0) { //5. 
Remove the blocking waitpoints - await this.$.prisma.taskRunWaitpoint.deleteMany({ + await this.$.runStore.deleteManyTaskRunWaitpoints({ where: { taskRunId: runId, id: { in: blockingWaitpoints.map((b) => b.id) }, @@ -1009,12 +1046,13 @@ export class WaitpointSystem { } // Operational decision: use latest execution snapshot, not TaskRun status - const snapshot = await getLatestExecutionSnapshot(prisma, runId); + const snapshot = await getLatestExecutionSnapshot(prisma, runId, this.$.runStore); // Create waitpoint and link to run atomically const waitpointData = this.buildRunAssociatedWaitpoint({ projectId, environmentId }); - const waitpoint = await prisma.waitpoint.create({ + // RUN-type within-tree waitpoint that belongs to runId; routes by owning run id. + const waitpoint = await this.$.runStore.createWaitpoint({ data: { ...waitpointData, completedByTaskRunId: runId, diff --git a/internal-packages/run-engine/src/engine/tests/blockEdgeResidency.test.ts b/internal-packages/run-engine/src/engine/tests/blockEdgeResidency.test.ts new file mode 100644 index 00000000000..f4d64a844e0 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/blockEdgeResidency.test.ts @@ -0,0 +1,311 @@ +// Block-edge write goes to the wrong DB so the parent never suspends. Two-physical-DB topology with +// the real dedicated run-ops schema on #new (prisma17). RED before the fix: the control-plane tx +// threaded by RunEngine.trigger forces the raw CTE to join `Waitpoint` on #legacy, where the ksuid +// waitpoint does not exist, so 0 edges are written and the parent stays EXECUTING. GREEN after: the +// block path always routes through the store, landing the edge + WaitpointRunConnection on #new and +// suspending the parent. (Snapshot reads/writes route by run id regardless of tx.) 
+ +import { + heteroRunOpsPostgresTest, + network, + redisContainer, + redisOptions, +} from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { PostgresRunStore, RoutingRunStore, type CreateRunInput } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; + +// Compose the two-physical-DB run-ops fixture (prisma14 = full control-plane DB, +// prisma17 = dedicated run-ops subset DB) with a per-test redis the RunEngine needs. +const twoDbEngineTest = heteroRunOpsPostgresTest.extend<{ + redisContainer: any; + redisOptions: any; +}>({ + network, + redisContainer, + redisOptions, +}); + +// ksuid (27-char internal id) → classified NEW → routed to the run-ops (#new) store. +const KSUID_A = "k".repeat(27); +const KSUID_B = "m".repeat(27); + +function baseEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +// On the dedicated subset there are no Organization/Project/RuntimeEnvironment models — the run-ops +// rows carry FK-free scalar owning ids. On legacy (control-plane) we seed the real env the engine's +// resolver / enqueue path reads (maxConc etc.). 
+async function seedControlPlaneEnv(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 10, + }, + }); + return { organization, project, environment }; +} + +function buildCreateRunInput(params: { + runId: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; +}): CreateRunInput { + return { + data: { + id: params.runId, + engine: "V2", + status: "EXECUTING", + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "PRODUCTION", + organizationId: params.organizationId, + projectId: params.projectId, + taskIdentifier: "parent-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `trace_${params.runId}`, + spanId: `span_${params.runId}`, + runTags: [], + queue: "task/parent-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: params.runtimeEnvironmentId, + environmentType: "PRODUCTION", + projectId: params.projectId, + organizationId: params.organizationId, + }, + }; +} + +// Seed an EXECUTING ksuid parent run on #new (prisma17) via the routed store, then a ksuid PENDING +// RUN waitpoint co-resident on #new. 
Returns the env + ids the block path needs. +async function seedExecutingKsuidParent( + prisma14: PrismaClient, + prisma17: RunOpsPrismaClient, + router: RoutingRunStore, + parentRunId: string, + waitpointId: string, + suffix: string +) { + const env = await seedControlPlaneEnv(prisma14, suffix); + + await router.createRun( + buildCreateRunInput({ + runId: parentRunId, + friendlyId: `run_${suffix}_parent`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + }) + ); + + // Move the parent to EXECUTING (so blockRunWithWaitpoint transitions it to + // EXECUTING_WITH_WAITPOINTS rather than SUSPENDED) — written via the routed store onto #new. + const created = await router.findLatestExecutionSnapshot(parentRunId); + await router.createExecutionSnapshot( + { + run: { id: parentRunId, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING", description: "parent executing" }, + previousSnapshotId: created!.id, + environmentId: env.environment.id, + environmentType: "PRODUCTION", + projectId: env.project.id, + organizationId: env.organization.id, + }, + prisma14 + ); + + // The associated waitpoint lives on #new (co-resident with the ksuid run). 
+ await prisma17.waitpoint.create({ + data: { + id: waitpointId, + friendlyId: `wp_${suffix}`, + type: "RUN", + status: "PENDING", + idempotencyKey: `idem_${waitpointId}`, + userProvidedIdempotencyKey: false, + projectId: env.project.id, + environmentId: env.environment.id, + }, + }); + + return env; +} + +function makeRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const newStore = new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); + const legacyStore = new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); + return new RoutingRunStore({ new: newStore, legacy: legacyStore }); +} + +describe("RunEngine block-edge residency (two physical DBs, dedicated #new)", () => { + // RED before fix / GREEN after: a ksuid parent blocked by a #new-resident waitpoint, with the + // control-plane tx threaded exactly as RunEngine.trigger does, ends EXECUTING_WITH_WAITPOINTS with + // the edge + WaitpointRunConnection physically on #new. + twoDbEngineTest( + "blockRunWithWaitpoint suspends a ksuid parent with the edge on #new (control-plane tx threaded)", + async ({ prisma14, prisma17, redisOptions }) => { + const router = makeRouter(prisma14 as unknown as PrismaClient, prisma17); + const engine = new RunEngine({ + store: router, + ...baseEngineOptions(redisOptions, prisma14), + }); + + try { + const parentRunId = `run_${KSUID_A}`; + const waitpointId = `waitpoint_${KSUID_A}`; + const env = await seedExecutingKsuidParent( + prisma14 as unknown as PrismaClient, + prisma17, + router, + parentRunId, + waitpointId, + "blockedge-a" + ); + + // RunEngine.trigger threads the control-plane client as `tx` — the wrong-DB trigger. 
+ await engine.blockRunWithWaitpoint({ + runId: parentRunId, + waitpoints: waitpointId, + projectId: env.project.id, + organizationId: env.organization.id, + tx: prisma14 as unknown as PrismaClient, + }); + + const edgesOnNew = await prisma17.taskRunWaitpoint.count({ + where: { taskRunId: parentRunId }, + }); + const connectionsOnNew = await prisma17.waitpointRunConnection.count({ + where: { taskRunId: parentRunId, waitpointId }, + }); + const edgesOnLegacy = await (prisma14 as unknown as PrismaClient).taskRunWaitpoint.count({ + where: { taskRunId: parentRunId }, + }); + + expect(edgesOnNew).toBe(1); // RED: 0 (CTE on #legacy found no waitpoint → no edge) + expect(connectionsOnNew).toBe(1); // the explicit join replacing legacy _WaitpointRunConnections + expect(edgesOnLegacy).toBe(0); // never written to the wrong DB + + // And the engine actually suspended the parent. + const data = await engine.getRunExecutionData({ runId: parentRunId }); + expect(data?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + } finally { + await engine.quit(); + } + } + ); + + // The lockless (batch-item) path likewise must write the edge on #new. The lockless method does + // not transition the snapshot (the parent is already EXECUTING_WITH_WAITPOINTS from + // blockRunWithCreatedBatch), so we assert the edge + connection land on #new. 
+ twoDbEngineTest( + "blockRunWithWaitpointLockless writes the edge on #new (control-plane tx threaded)", + async ({ prisma14, prisma17, redisOptions }) => { + const router = makeRouter(prisma14 as unknown as PrismaClient, prisma17); + const engine = new RunEngine({ + store: router, + ...baseEngineOptions(redisOptions, prisma14), + }); + + try { + const parentRunId = `run_${KSUID_B}`; + const waitpointId = `waitpoint_${KSUID_B}`; + const env = await seedExecutingKsuidParent( + prisma14 as unknown as PrismaClient, + prisma17, + router, + parentRunId, + waitpointId, + "blockedge-b" + ); + + // The lockless method lives on the waitpoint system; reach it via the engine instance. + await (engine as any).waitpointSystem.blockRunWithWaitpointLockless({ + runId: parentRunId, + waitpoints: waitpointId, + projectId: env.project.id, + batch: { id: `batch_${KSUID_B}`, index: 0 }, + tx: prisma14 as unknown as PrismaClient, + }); + + const edgesOnNew = await prisma17.taskRunWaitpoint.count({ + where: { taskRunId: parentRunId }, + }); + const connectionsOnNew = await prisma17.waitpointRunConnection.count({ + where: { taskRunId: parentRunId, waitpointId }, + }); + const edgesOnLegacy = await (prisma14 as unknown as PrismaClient).taskRunWaitpoint.count({ + where: { taskRunId: parentRunId }, + }); + + expect(edgesOnNew).toBe(1); // RED: 0 (lockless CTE on #legacy found no waitpoint) + expect(connectionsOnNew).toBe(1); + expect(edgesOnLegacy).toBe(0); + + // countPendingWaitpoints fans out and sees the #new PENDING waitpoint as a live blocker. 
+ expect(await router.countPendingWaitpoints([waitpointId])).toBe(1); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/checkpointSystem.controlPlaneResolver.test.ts b/internal-packages/run-engine/src/engine/tests/checkpointSystem.controlPlaneResolver.test.ts new file mode 100644 index 00000000000..5fe12c4426c --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/checkpointSystem.controlPlaneResolver.test.ts @@ -0,0 +1,214 @@ +// Cross-DB inversion proof for the checkpoint env include (suspendForCheckpoint + createCheckpoint). +// Cloud topology: run-ops = new DB (PG17, cross-seam FKs DROPPED), control-plane = legacy DB (PG14). +// The env/project/organization live on PG14; the run-ops scalar row on PG17. The +// PassthroughControlPlaneResolver over PG14 resolves the env half (used for the runStatusChanged +// emit, the TaskRunCheckpoint data, and enqueueRun) while the run scalars come from PG17 — no +// cross-DB join. The DB is never mocked. A single-DB passthrough case proves createCheckpoint +// stamps the resolved env onto the checkpoint row byte-identically. 
+import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import type { PrismaClient } from "@trigger.dev/database"; +import { setTimeout } from "node:timers/promises"; +import { expect } from "vitest"; +import { PassthroughControlPlaneResolver } from "../controlPlaneResolver.js"; +import { PostgresRunStore } from "@internal/run-store"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const constraint of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe( + `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"` + ); + } +} + +async function seedControlPlaneEnv(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await 
prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +describe("CheckpointSystem controlPlaneResolver (hetero cross-DB)", () => { + heteroPostgresTest( + "env resolves from PG14 (control-plane) while the run scalars live on PG17 (no cross-DB join)", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedControlPlaneEnv(prisma14 as unknown as PrismaClient, "cpcp"); + + const runId = "run_cpcp_pg17"; + await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "EXECUTING", + friendlyId: "run_friendly_cpcp", + runtimeEnvironmentId: cp.environment.id, + organizationId: cp.organization.id, + projectId: cp.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + queue: "task/my-task", + traceId: "trace_cpcp", + spanId: "span_cpcp", + }, + }); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + // suspendForCheckpoint with an empty include returns the run-ops scalars and flips status. + const run = await runStore.suspendForCheckpoint(runId, { include: {} }); + assertNonNullable(run); + expect(run.status).toBe("WAITING_TO_RESUME"); + expect(run.runtimeEnvironmentId).toBe(cp.environment.id); + + // The control-plane env (project/organization) resolves from PG14; these are exactly the + // fields checkpoint stamps onto the runStatusChanged emit, the TaskRunCheckpoint data, and + // enqueueRun — all without touching the run-ops DB. 
+ const env = await resolver.resolveEnv(run.runtimeEnvironmentId); + assertNonNullable(env); + expect(env.id).toBe(cp.environment.id); + expect(env.projectId).toBe(cp.project.id); + expect(env.organizationId).toBe(cp.organization.id); + + // Inversion: the run-ops DB (PG17) holds no env row; a co-located join would resolve null. + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + } + ); +}); + +describe("CheckpointSystem controlPlaneResolver (single-DB passthrough)", () => { + containerTest( + "createCheckpoint stamps the resolved env onto the checkpoint row byte-identically", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_cpcppassthru1", + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t-cpcp", + spanId: "s-cpcp", + workerQueue: "main", + queue: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); + assertNonNullable(dequeued[0]); + + await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + const waitpoint = await engine.createManualWaitpoint({ + environmentId: environment.id, + projectId: environment.projectId, + }); + const blocked = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint.waitpoint.id, + projectId: environment.projectId, + organizationId: environment.organizationId, + }); + + const checkpointResult = await engine.createCheckpoint({ + runId: run.id, + snapshotId: 
blocked.id, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + expect(checkpointResult.ok).toBe(true); + + const persisted = await prisma.taskRunCheckpoint.findFirst({ + where: { executionSnapshot: { some: { runId: run.id } } }, + }); + assertNonNullable(persisted); + // The resolved env's projectId + runtimeEnvironmentId were stamped onto the checkpoint. + expect(persisted.projectId).toBe(environment.projectId); + expect(persisted.runtimeEnvironmentId).toBe(environment.id); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/checkpointSystemStore.test.ts b/internal-packages/run-engine/src/engine/tests/checkpointSystemStore.test.ts new file mode 100644 index 00000000000..c9a4d636015 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/checkpointSystemStore.test.ts @@ -0,0 +1,495 @@ +import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import { trace } from "@internal/tracing"; +import { CheckpointId, SnapshotId } from "@trigger.dev/core/v3/isomorphic"; +import type { Prisma, PrismaClient } from "@trigger.dev/database"; +import { setTimeout } from "node:timers/promises"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { getLatestExecutionSnapshot } from "../systems/executionSnapshotSystem.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any, store?: PostgresRunStore) { + return { + prisma, + ...(store ? 
{ store } : {}), + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** + * A real PostgresRunStore subclass that counts the checkpoint + snapshot write methods this unit + * routes through, so the routing can be observed over real containers without ever mocking prisma. + * super.* runs the genuine store implementation. + */ +class CountingPostgresRunStore extends PostgresRunStore { + public checkpointCreates = 0; + public snapshotCreates = 0; + public latestReads = 0; + + override async createTaskRunCheckpoint( + args: Prisma.SelectSubset, + tx?: any + ): Promise> { + this.checkpointCreates++; + return super.createTaskRunCheckpoint(args, tx); + } + + override async createExecutionSnapshot( + input: any, + tx?: any + ): ReturnType { + this.snapshotCreates++; + return super.createExecutionSnapshot(input, tx); + } + + override async findLatestExecutionSnapshot( + runId: string, + client?: any + ): ReturnType { + this.latestReads++; + return super.findLatestExecutionSnapshot(runId, client); + } +} + +/** + * Drives a freshly triggered run to a checkpointable (EXECUTING_WITH_WAITPOINTS) state, returning + * the run + the blocking snapshot id + the waitpoint id. Mirrors the existing checkpoints.test.ts + * "Create checkpoint and continue execution" preamble. 
+ */ +async function driveToCheckpointable(engine: RunEngine, prisma: PrismaClient, friendlyId: string) { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId, + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); + expect(dequeued.length).toBe(1); + assertNonNullable(dequeued[0]); + + await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + const waitpointResult = await engine.createManualWaitpoint({ + environmentId: environment.id, + projectId: environment.projectId, + }); + + const blockedResult = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpointResult.waitpoint.id, + projectId: environment.projectId, + organizationId: environment.organizationId, + }); + + return { + environment, + run, + blockingSnapshotId: blockedResult.id, + waitpointId: waitpointResult.waitpoint.id, + }; +} + +describe("CheckpointSystem store routing (single-DB passthrough)", () => { + // createCheckpoint routes the TaskRunCheckpoint write + the SUSPENDED snapshot write + // through the store, both resolved by owning run id. 
+ containerTest( + "checkpoint create routes the TaskRunCheckpoint write + SUSPENDED snapshot through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const { run, blockingSnapshotId } = await driveToCheckpointable( + engine, + prisma, + "run_cpcreate1" + ); + + const checkpointsBefore = countingStore.checkpointCreates; + const snapshotsBefore = countingStore.snapshotCreates; + + const checkpointResult = await engine.createCheckpoint({ + runId: run.id, + snapshotId: blockingSnapshotId, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + + expect(checkpointResult.ok).toBe(true); + + // (1) a TaskRunCheckpoint row exists for the run (joined via the SUSPENDED snapshot). + const persistedCheckpoint = await prisma.taskRunCheckpoint.findFirst({ + where: { executionSnapshot: { some: { runId: run.id } } }, + }); + expect(persistedCheckpoint).not.toBeNull(); + expect(persistedCheckpoint?.type).toBe("DOCKER"); + expect(persistedCheckpoint?.reason).toBe("TEST_CHECKPOINT"); + + // (2) the latest snapshot is SUSPENDED with checkpointId set to that row. + const latest = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId: run.id, isValid: true }, + orderBy: { createdAt: "desc" }, + }); + expect(latest?.executionStatus).toBe("SUSPENDED"); + expect(latest?.checkpointId).toBe(persistedCheckpoint!.id); + + // (3) the checkpoint create + the snapshot create went through the store. 
+ expect(countingStore.checkpointCreates).toBeGreaterThan(checkpointsBefore); + expect(countingStore.snapshotCreates).toBeGreaterThan(snapshotsBefore); + } finally { + await engine.quit(); + } + } + ); + + // A full checkpoint create + restore round-trip through continueRunExecution; the latest + // snapshot becomes EXECUTING and the read through the store returns it. + containerTest( + "restore round-trip via continueRunExecution reads + writes through the store", + async ({ prisma, redisOptions }) => { + const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma, countingStore)); + + try { + const { run, blockingSnapshotId, waitpointId } = await driveToCheckpointable( + engine, + prisma, + "run_cprestore1" + ); + + // Suspend (create checkpoint). + const checkpointResult = await engine.createCheckpoint({ + runId: run.id, + snapshotId: blockingSnapshotId, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + expect(checkpointResult.ok).toBe(true); + + // Unblock + re-dequeue to reach a QUEUED_WITH_CHECKPOINT/pending-executing state. + await engine.completeWaitpoint({ id: waitpointId }); + await setTimeout(500); + + const dequeuedAgain = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); + expect(dequeuedAgain.length).toBe(1); + assertNonNullable(dequeuedAgain[0]); + + const continueResult = await engine.continueRunExecution({ + runId: run.id, + snapshotId: dequeuedAgain[0].snapshot.id, + }); + + // The latest snapshot becomes EXECUTING. + expect(continueResult.snapshot.executionStatus).toBe("EXECUTING"); + expect(continueResult.run.status).toBe("EXECUTING"); + + // The store read returns exactly that EXECUTING snapshot, routed by run id. 
+ const latest = await getLatestExecutionSnapshot(prisma, run.id, countingStore); + expect(latest.runId).toBe(run.id); + expect(latest.executionStatus).toBe("EXECUTING"); + expect(latest.id).toBe(continueResult.snapshot.id); + // friendlyId is a valid SnapshotId derived from the cuid (route by owning run id). + expect(latest.friendlyId).toMatch(/^snapshot_/); + expect(SnapshotId.fromFriendlyId(latest.friendlyId)).toBe(latest.id); + expect(countingStore.latestReads).toBeGreaterThan(0); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB binds one client (passthrough) — proven by behavior, not by reaching into a + // private prisma member. The default-store engine round-trips create+restore on the one client. + containerTest( + "single-DB binds one client (passthrough) — create + restore round-trip on one client", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const { run, blockingSnapshotId, waitpointId } = await driveToCheckpointable( + engine, + prisma, + "run_cppassthru" + ); + + const checkpointResult = await engine.createCheckpoint({ + runId: run.id, + snapshotId: blockingSnapshotId, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + expect(checkpointResult.ok).toBe(true); + + // The SUSPENDED snapshot just written is exactly what the store reads back on one client. 
+ const suspended = await getLatestExecutionSnapshot(prisma, run.id, engine.runStore); + const persistedSuspended = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId: run.id, isValid: true }, + orderBy: { createdAt: "desc" }, + }); + expect(suspended.executionStatus).toBe("SUSPENDED"); + expect(suspended.id).toBe(persistedSuspended!.id); + + await engine.completeWaitpoint({ id: waitpointId }); + await setTimeout(500); + + const dequeuedAgain = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); + assertNonNullable(dequeuedAgain[0]); + + const continueResult = await engine.continueRunExecution({ + runId: run.id, + snapshotId: dequeuedAgain[0].snapshot.id, + }); + expect(continueResult.snapshot.executionStatus).toBe("EXECUTING"); + + // The EXECUTING snapshot read resolves on the same single client to exactly the row written. + const executing = await getLatestExecutionSnapshot(prisma, run.id, engine.runStore); + expect(executing.id).toBe(continueResult.snapshot.id); + expect(executing.runId).toBe(run.id); + } finally { + await engine.quit(); + } + } + ); +}); + +// --- Checkpoint-family FK-drop app-integrity (Tests D/E): FK retained (self-host) + FK dropped (cloud) --- + +const CHECKPOINT_FAMILY_CROSS_SEAM_FKS = [ + ["TaskRunCheckpoint", "TaskRunCheckpoint_projectId_fkey"], + ["TaskRunCheckpoint", "TaskRunCheckpoint_runtimeEnvironmentId_fkey"], + ["Checkpoint", "Checkpoint_projectId_fkey"], + ["Checkpoint", "Checkpoint_runtimeEnvironmentId_fkey"], + ["CheckpointRestoreEvent", "CheckpointRestoreEvent_projectId_fkey"], + ["CheckpointRestoreEvent", "CheckpointRestoreEvent_runtimeEnvironmentId_fkey"], +] as const; + +/** Model the cloud-only physical drop of the checkpoint-family cross-seam Cascade FKs. 
*/ +async function dropCheckpointFamilyCrossSeamFks(prisma: PrismaClient) { + for (const [table, constraint] of CHECKPOINT_FAMILY_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe( + `ALTER TABLE "${table}" DROP CONSTRAINT IF EXISTS "${constraint}"` + ); + } +} + +async function seedEnvironment(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${suffix}`, + pkApiKey: `pk_dev_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +describe("CheckpointSystem checkpoint-family FK-drop app-integrity (both modes)", () => { + // createTaskRunCheckpoint succeeds with the cross-seam FK retained (self-host) and + // dropped (cloud). The fixture must provision BOTH versions (no silent single-DB no-op). + heteroPostgresTest( + "checkpoint create succeeds with the cross-seam FK retained (self-host) and dropped (cloud)", + async ({ prisma14, prisma17, pinnedCollation }) => { + // Assert the hetero fixture actually provisioned both clients on the pinned collation — a + // hetero test that silently no-ops on a single DB would be a false green. 
+ expect(pinnedCollation).toBe("und-x-icu"); + const v14 = ( + await prisma14.$queryRawUnsafe<{ server_version: string }[]>(`SHOW server_version`) + )[0]!.server_version; + const v17 = ( + await prisma17.$queryRawUnsafe<{ server_version: string }[]>(`SHOW server_version`) + )[0]!.server_version; + expect(parseInt(v14, 10)).toBe(14); + expect(parseInt(v17, 10)).toBe(17); + + // Cloud shape: drop the checkpoint-family cross-seam Cascade FKs on the cloud DB only. + await dropCheckpointFamilyCrossSeamFks(prisma17 as unknown as PrismaClient); + + const store14 = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const store17 = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + + // Self-host shape (FK retained): insert against present env/project rows succeeds. + const env14 = await seedEnvironment(prisma14 as unknown as PrismaClient, "fkd_14"); + const cp14 = await store14.createTaskRunCheckpoint({ + data: { + ...CheckpointId.generate(), + type: "DOCKER", + location: "loc-14", + reason: "TEST", + runtimeEnvironmentId: env14.environment.id, + projectId: env14.project.id, + }, + }); + const persisted14 = await prisma14.taskRunCheckpoint.findUnique({ where: { id: cp14.id } }); + expect(persisted14).not.toBeNull(); + + // Cloud shape (FK dropped): insert succeeds with present env/project rows... + const env17 = await seedEnvironment(prisma17 as unknown as PrismaClient, "fkd_17"); + const cp17 = await store17.createTaskRunCheckpoint({ + data: { + ...CheckpointId.generate(), + type: "DOCKER", + location: "loc-17", + reason: "TEST", + runtimeEnvironmentId: env17.environment.id, + projectId: env17.project.id, + }, + }); + const persisted17 = await prisma17.taskRunCheckpoint.findUnique({ where: { id: cp17.id } }); + expect(persisted17).not.toBeNull(); + + // ...and, because the constraint is gone on the cloud shape, also succeeds with no + // control-plane row required at insert (the defining property of the dropped FK). 
+ const orphanId = CheckpointId.generate(); + const orphan = await store17.createTaskRunCheckpoint({ + data: { + ...orphanId, + type: "DOCKER", + location: "loc-17-orphan", + reason: "TEST", + runtimeEnvironmentId: "env_does_not_exist", + projectId: "proj_does_not_exist", + }, + }); + const persistedOrphan = await prisma17.taskRunCheckpoint.findUnique({ + where: { id: orphan.id }, + }); + expect(persistedOrphan).not.toBeNull(); + } + ); + + // Env-delete parity on this unit's write surface (TaskRunCheckpoint, the only + // checkpoint-family row the V2 engine creates — Checkpoint/CheckpointRestoreEvent are V1-residual + // and require a full run+attempt graph, out of this unit's write scope). After deleting the owning + // env, the TaskRunCheckpoint count is deep-equal across the self-host shape (the retained Cascade + // FK fires) and the cloud shape (the app-level deleteMany-by-env cleanup contract fires). The webapp cleanup service + // is not importable from a run-engine test, so we exercise the same deleteMany-by-env contract + // over the real two clients (no mocks). 
+ heteroPostgresTest( + "env-delete leaves no TaskRunCheckpoint orphan on the FK-dropped DB (parity with FK-retained)", + async ({ prisma14, prisma17 }) => { + await dropCheckpointFamilyCrossSeamFks(prisma17 as unknown as PrismaClient); + + async function seedCheckpoint(prisma: PrismaClient, suffix: string) { + const { environment, project } = await seedEnvironment(prisma, suffix); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + await store.createTaskRunCheckpoint({ + data: { + ...CheckpointId.generate(), + type: "DOCKER", + location: "loc", + reason: "TEST", + runtimeEnvironmentId: environment.id, + projectId: project.id, + }, + }); + return { environment, project }; + } + + const seed14 = await seedCheckpoint(prisma14 as unknown as PrismaClient, "edel_14"); + const seed17 = await seedCheckpoint(prisma17 as unknown as PrismaClient, "edel_17"); + + // Self-host shape: deleting the env fires the retained Cascade FK. + await prisma14.runtimeEnvironment.delete({ where: { id: seed14.environment.id } }); + + // Cloud shape: the FK is gone, so the app-level cleanup contract (delete checkpoint by env, + // before deleting the env) must run. + const envId17 = seed17.environment.id; + await prisma17.taskRunCheckpoint.deleteMany({ where: { runtimeEnvironmentId: envId17 } }); + await prisma17.runtimeEnvironment.delete({ where: { id: envId17 } }); + + const count14 = await prisma14.taskRunCheckpoint.count(); + const count17 = await prisma17.taskRunCheckpoint.count(); + + // Parity: no orphan TaskRunCheckpoint left on either DB after the owning env is deleted. 
+ expect(count17).toEqual(count14); + expect(count14).toBe(0); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/completeWaitpointCrossSeamGuard.test.ts b/internal-packages/run-engine/src/engine/tests/completeWaitpointCrossSeamGuard.test.ts new file mode 100644 index 00000000000..b9bd714951b --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/completeWaitpointCrossSeamGuard.test.ts @@ -0,0 +1,140 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import type { CrossSeamGuardHook } from "../types.js"; +import { setupAuthenticatedEnvironment } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function engineOptions(redisOptions: any, prisma: any, crossSeamGuard?: CrossSeamGuardHook) { + return { + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + ...(crossSeamGuard ? 
{ crossSeamGuard } : {}), + }; +} + +describe("RunEngine completeWaitpoint cross-seam guard", () => { + containerTest( + "consults the crossSeamGuard first (RESUME_TOKEN), then delegates", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const seen: Array<{ waitpointId: string; routeKind: string }> = []; + const engine = new RunEngine( + engineOptions(redisOptions, prisma, async ({ waitpointId, routeKind }) => { + seen.push({ waitpointId, routeKind }); + // Single-store / split-OFF returns the single ("legacy") store; the engine + // delegates regardless of decision.store (routing lives below, in waitpointSystem). + return { store: "legacy", residency: "LEGACY", routeKind }; + }) + ); + + try { + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + expect(waitpoint.status).toBe("PENDING"); + + await engine.completeWaitpoint({ + id: waitpoint.id, + output: { value: "{}", isError: false }, + }); + + // (A) the guard was consulted first, with the right id + RESUME_TOKEN route kind. + expect(seen).toEqual([{ waitpointId: waitpoint.id, routeKind: "RESUME_TOKEN" }]); + + // (B) the completion was then applied via delegation (single-store path). 
+ const after = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(after?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "propagates a guard throw and does NOT apply the completion (loud)", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine( + engineOptions(redisOptions, prisma, async () => { + throw new Error("UnclassifiableRunId"); + }) + ); + + try { + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + expect(waitpoint.status).toBe("PENDING"); + + await expect( + engine.completeWaitpoint({ id: waitpoint.id, output: { value: "{}", isError: false } }) + ).rejects.toThrow(); + + // (C) the throw short-circuited before delegation — no silent local apply. + const after = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(after?.status).toBe("PENDING"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "with no crossSeamGuard behaves exactly as today", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine(engineOptions(redisOptions, prisma)); + + try { + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + expect(waitpoint.status).toBe("PENDING"); + + await engine.completeWaitpoint({ + id: waitpoint.id, + output: { value: "{}", isError: false }, + }); + + // (D) unwired path applies exactly as today. 
+ const after = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(after?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/completeWaitpointReadResidency.test.ts b/internal-packages/run-engine/src/engine/tests/completeWaitpointReadResidency.test.ts new file mode 100644 index 00000000000..dc1d7e3509b --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/completeWaitpointReadResidency.test.ts @@ -0,0 +1,433 @@ +// completeWaitpoint re-read residency — the COMPLETED-waitpoint re-read inside +// WaitpointSystem.completeWaitpoint must use the RESOLVED store's OWN client, not the +// control-plane client. Two-physical-DB topology with the real dedicated run-ops schema on +// #new (prisma17), modelled on the block-edge residency test. +// +// RED before the fix: completeWaitpoint resolved the #new store (where the ksuid RUN waitpoint +// lives), marked it COMPLETED there, then re-read it via `store.findWaitpoint({where:{id}}, this.$.prisma)`. +// A resolved PostgresRunStore HONORS the passed client, so the re-read hit the control-plane DB +// (#legacy / prisma14), found nothing, and threw "Waitpoint not found" BEFORE enqueueing +// continueRunIfUnblocked → the blocked parent never resumed. +// +// GREEN after: the re-read drops the control-plane client, reads #new's own client, finds the +// COMPLETED waitpoint, and the edge fan-out enqueues continueRunIfUnblocked for the parent, which +// then transitions out of EXECUTING_WITH_WAITPOINTS. 
+ +import { + heteroRunOpsPostgresTest, + network, + redisContainer, + redisOptions, +} from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { PostgresRunStore, RoutingRunStore, type CreateRunInput } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { expect, vi } from "vitest"; +import { RunEngine } from "../index.js"; + +const twoDbEngineTest = heteroRunOpsPostgresTest.extend<{ + redisContainer: any; + redisOptions: any; +}>({ + network, + redisContainer, + redisOptions, +}); + +// ksuid (27-char internal id) → classified NEW → routed to the run-ops (#new) store. +const KSUID_A = "n".repeat(27); +// A second ksuid run for the cross-DB (NEW-run → LEGACY-token) case. +const KSUID_X = "x".repeat(27); +// cuid (25-char) → classified LEGACY → a standalone token resident on #legacy (prisma14). +const CUID_25 = "c".repeat(25); + +function baseEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +async function seedControlPlaneEnv(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await 
prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 10, + }, + }); + return { organization, project, environment }; +} + +function buildCreateRunInput(params: { + runId: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; +}): CreateRunInput { + return { + data: { + id: params.runId, + engine: "V2", + status: "EXECUTING", + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "PRODUCTION", + organizationId: params.organizationId, + projectId: params.projectId, + taskIdentifier: "parent-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `trace_${params.runId}`, + spanId: `span_${params.runId}`, + runTags: [], + queue: "task/parent-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: params.runtimeEnvironmentId, + environmentType: "PRODUCTION", + projectId: params.projectId, + organizationId: params.organizationId, + }, + }; +} + +// Seed an EXECUTING ksuid parent on #new (prisma17) via the routed store, plus a ksuid PENDING RUN +// waitpoint co-resident on #new. Returns the seeded env (the run/waitpoint ids are caller-supplied). 
+async function seedExecutingKsuidParent( + prisma14: PrismaClient, + prisma17: RunOpsPrismaClient, + router: RoutingRunStore, + parentRunId: string, + waitpointId: string, + suffix: string +) { + const env = await seedControlPlaneEnv(prisma14, suffix); + + await router.createRun( + buildCreateRunInput({ + runId: parentRunId, + friendlyId: `run_${suffix}_parent`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + }) + ); + + const created = await router.findLatestExecutionSnapshot(parentRunId); + await router.createExecutionSnapshot( + { + run: { id: parentRunId, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING", description: "parent executing" }, + previousSnapshotId: created!.id, + environmentId: env.environment.id, + environmentType: "PRODUCTION", + projectId: env.project.id, + organizationId: env.organization.id, + }, + prisma14 + ); + + // The RUN waitpoint lives on #new, co-resident with the ksuid run, and is completed-by that run. + await prisma17.waitpoint.create({ + data: { + id: waitpointId, + friendlyId: `wp_${suffix}`, + type: "RUN", + status: "PENDING", + completedByTaskRunId: parentRunId, + idempotencyKey: `idem_${waitpointId}`, + userProvidedIdempotencyKey: false, + projectId: env.project.id, + environmentId: env.environment.id, + }, + }); + + return env; +} + +// Seed an EXECUTING ksuid parent on #new (prisma17) AND a standalone MANUAL token resident on +// #legacy (prisma14, cuid) — the tolerated NEW-run → LEGACY-token cross-DB direction (standalone +// tokens are minted on LEGACY). The token is NOT created on #new. Returns the seeded env. 
+async function seedKsuidParentAndLegacyToken( + prisma14: PrismaClient, + prisma17: RunOpsPrismaClient, + router: RoutingRunStore, + parentRunId: string, + waitpointId: string, + suffix: string +) { + const env = await seedControlPlaneEnv(prisma14, suffix); + + await router.createRun( + buildCreateRunInput({ + runId: parentRunId, + friendlyId: `run_${suffix}_parent`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + }) + ); + + const created = await router.findLatestExecutionSnapshot(parentRunId); + await router.createExecutionSnapshot( + { + run: { id: parentRunId, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING", description: "parent executing" }, + previousSnapshotId: created!.id, + environmentId: env.environment.id, + environmentType: "PRODUCTION", + projectId: env.project.id, + organizationId: env.organization.id, + }, + prisma14 + ); + + // The standalone MANUAL token lives on #legacy ONLY (cuid id, no owning run) — NOT on #new. 
+ await prisma14.waitpoint.create({ + data: { + id: waitpointId, + friendlyId: `wp_${suffix}`, + type: "MANUAL", + status: "PENDING", + idempotencyKey: `idem_${waitpointId}`, + userProvidedIdempotencyKey: false, + projectId: env.project.id, + environmentId: env.environment.id, + }, + }); + + return env; +} + +function makeRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const newStore = new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); + const legacyStore = new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); + return new RoutingRunStore({ new: newStore, legacy: legacyStore }); +} + +describe("RunEngine completeWaitpoint re-read residency (two physical DBs, dedicated #new)", () => { + twoDbEngineTest( + "completeWaitpoint finds the #new-resident RUN waitpoint and unblocks the parent", + async ({ prisma14, prisma17, redisOptions }) => { + const router = makeRouter(prisma14 as unknown as PrismaClient, prisma17); + const engine = new RunEngine({ + store: router, + ...baseEngineOptions(redisOptions, prisma14), + }); + + try { + const parentRunId = `run_${KSUID_A}`; + const waitpointId = `waitpoint_${KSUID_A}`; + const env = await seedExecutingKsuidParent( + prisma14 as unknown as PrismaClient, + prisma17, + router, + parentRunId, + waitpointId, + "wpread" + ); + + // Block the parent on the #new waitpoint (the edge routes onto #new by owning run id). The + // parent transitions to EXECUTING_WITH_WAITPOINTS. + await engine.blockRunWithWaitpoint({ + runId: parentRunId, + waitpoints: waitpointId, + projectId: env.project.id, + organizationId: env.organization.id, + tx: prisma14 as unknown as PrismaClient, + }); + + const blocked = await engine.getRunExecutionData({ runId: parentRunId }); + expect(blocked?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Capture the unblock enqueue. 
RED: completeWaitpoint throws "Waitpoint not found" before + // we ever reach this enqueue. GREEN: it enqueues continueRunIfUnblocked for the parent. + const enqueueSpy = vi.spyOn((engine as any).worker, "enqueue"); + + // RED before fix: rejects with "Waitpoint not found" (re-read hit the control-plane DB). + // GREEN after fix: completes, returning the COMPLETED waitpoint. + const completed = await engine.completeWaitpoint({ + id: waitpointId, + output: { value: '{"ok":true}', isError: false }, + }); + + expect(completed.id).toBe(waitpointId); + expect(completed.status).toBe("COMPLETED"); + + // The waitpoint is COMPLETED on its OWN DB (#new), never on the control-plane DB. + const onNew = await prisma17.waitpoint.findFirst({ where: { id: waitpointId } }); + expect(onNew?.status).toBe("COMPLETED"); + const onLegacy = await (prisma14 as unknown as PrismaClient).waitpoint.findFirst({ + where: { id: waitpointId }, + }); + expect(onLegacy).toBeNull(); + + // The unblock path ran: a continueRunIfUnblocked job was enqueued for the blocked parent. + const continueEnqueued = enqueueSpy.mock.calls.some( + ([arg]) => + (arg as any)?.job === "continueRunIfUnblocked" && + (arg as any)?.payload?.runId === parentRunId + ); + expect(continueEnqueued).toBe(true); + + // Drive the enqueued job's body to prove the parent actually resumes (no longer blocked). + const result = await (engine as any).waitpointSystem.continueRunIfUnblocked({ + runId: parentRunId, + }); + expect(result.status).toBe("unblocked"); + + const after = await engine.getRunExecutionData({ runId: parentRunId }); + expect(after?.snapshot.executionStatus).not.toBe("EXECUTING_WITH_WAITPOINTS"); + } finally { + await engine.quit(); + } + } + ); + + // End-to-end cross-DB gate: a ksuid run on #new blocked on a standalone MANUAL token resident on + // #legacy (the tolerated NEW-run → LEGACY-token direction — standalone tokens are minted on + // LEGACY). 
RED before the writer fix: blockRunWithWaitpointEdges' dedicated branch joined + // `FROM "Waitpoint" w`, which matched 0 rows on #new (the token is on #legacy) → 0 edges → the run + // stays EXECUTING_WITH_WAITPOINTS forever and completing the token finds no edge to resume. + // GREEN after: the edge is written on #new from the waitpointId directly; completing the LEGACY + // token (the completion fan-out discovers the #new edge and resolves its COMPLETED status across + // both DBs) resumes the NEW run. + twoDbEngineTest( + "completeWaitpoint on a LEGACY-resident token unblocks a ksuid run whose edge lives on #new", + async ({ prisma14, prisma17, redisOptions }) => { + const router = makeRouter(prisma14 as unknown as PrismaClient, prisma17); + const engine = new RunEngine({ + store: router, + ...baseEngineOptions(redisOptions, prisma14), + }); + + try { + const parentRunId = `run_${KSUID_X}`; // ksuid run → #new + const waitpointId = `waitpoint_${CUID_25}`; // cuid standalone token → #legacy + const env = await seedKsuidParentAndLegacyToken( + prisma14 as unknown as PrismaClient, + prisma17, + router, + parentRunId, + waitpointId, + "xdbtok" + ); + + // Block the NEW run on the LEGACY token. The edge must land on #new (FK-free), NOT require the + // token to be local. RED: 0 edges (the wrong-DB Waitpoint join), so the run never suspends. + await engine.blockRunWithWaitpoint({ + runId: parentRunId, + waitpoints: waitpointId, + projectId: env.project.id, + organizationId: env.organization.id, + tx: prisma14 as unknown as PrismaClient, + }); + + // The block edge is physically on #new; #legacy holds none for the ksuid run (safety invariant). 
+ expect(await prisma17.taskRunWaitpoint.count({ where: { taskRunId: parentRunId } })).toBe( + 1 + ); + expect( + await (prisma14 as unknown as PrismaClient).taskRunWaitpoint.count({ + where: { taskRunId: parentRunId }, + }) + ).toBe(0); + + const blocked = await engine.getRunExecutionData({ runId: parentRunId }); + expect(blocked?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + const enqueueSpy = vi.spyOn((engine as any).worker, "enqueue"); + + // Complete the LEGACY token via the engine path. completeWaitpoint resolves the token's own + // store (#legacy), marks it COMPLETED there, then fans the waitpointId edge read across BOTH + // DBs → discovers the #new-resident edge → enqueues continueRunIfUnblocked. + const completed = await engine.completeWaitpoint({ + id: waitpointId, + output: { value: '{"resumed":"cross-db"}', isError: false }, + }); + expect(completed.status).toBe("COMPLETED"); + + // Token COMPLETED on #legacy only. + expect( + ( + await (prisma14 as unknown as PrismaClient).waitpoint.findFirst({ + where: { id: waitpointId }, + }) + )?.status + ).toBe("COMPLETED"); + expect(await prisma17.waitpoint.findFirst({ where: { id: waitpointId } })).toBeNull(); + + // The fan-out enqueued the unblock for the NEW run. + const continueEnqueued = enqueueSpy.mock.calls.some( + ([arg]) => + (arg as any)?.job === "continueRunIfUnblocked" && + (arg as any)?.payload?.runId === parentRunId + ); + expect(continueEnqueued).toBe(true); + + // Driving the unblock body resolves the LEGACY token's COMPLETED status across both DBs + // and resumes the NEW run. 
+ const result = await (engine as any).waitpointSystem.continueRunIfUnblocked({ + runId: parentRunId, + }); + expect(result.status).toBe("unblocked"); + + const after = await engine.getRunExecutionData({ runId: parentRunId }); + expect(after?.snapshot.executionStatus).not.toBe("EXECUTING_WITH_WAITPOINTS"); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/controlPlaneResolverInjectability.test.ts b/internal-packages/run-engine/src/engine/tests/controlPlaneResolverInjectability.test.ts new file mode 100644 index 00000000000..bfe6c3a85bb --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/controlPlaneResolverInjectability.test.ts @@ -0,0 +1,191 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Prisma } from "@trigger.dev/database"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { + PassthroughControlPlaneResolver, + type ControlPlaneResolver, + type ResolvedAuthenticatedEnv, + type ResolvedEngineEnv, + type ResolvedWorkerVersion, +} from "../controlPlaneResolver.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +describe("RunEngine controlPlaneResolver injectability", () => { + containerTest( + "defaults to a 
PassthroughControlPlaneResolver when none is injected", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + expect(engine.controlPlaneResolver).toBeDefined(); + expect(engine.controlPlaneResolver).toBeInstanceOf(PassthroughControlPlaneResolver); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "the default passthrough resolves env, worker version, and env existence", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const setup = await setupBackgroundWorker(engine, environment, "test-task"); + + // resolveEnv returns the seeded env with the flat + nested + concurrency fields. + const env = await engine.controlPlaneResolver.resolveEnv(environment.id); + expect(env).not.toBeNull(); + expect(env!.id).toBe(environment.id); + expect(env!.type).toBe("PRODUCTION"); + expect(env!.projectId).toBe(environment.projectId); + expect(env!.organizationId).toBe(environment.organizationId); + expect(env!.project.id).toBe(environment.projectId); + expect(env!.organization.id).toBe(environment.organizationId); + expect(env!.maximumConcurrencyLimit).toBe(10); + expect(env!.concurrencyLimitBurstFactor.toNumber()).toBe(2); + expect(env!.archivedAt).toBeNull(); + + // resolveWorkerVersion (no workerId, deployed env) returns the promoted deployment's worker. + const version = await engine.controlPlaneResolver.resolveWorkerVersion({ + environmentId: environment.id, + type: "PRODUCTION", + }); + expect(version).not.toBeNull(); + expect(version!.worker.id).toBe(setup.worker.id); + expect(version!.tasks.map((t) => t.slug)).toContain("test-task"); + expect(version!.queues.length).toBeGreaterThan(0); + expect(version!.deployment?.id).toBe( + "deployment" in setup ? 
setup.deployment.id : undefined + ); + + // assertEnvExists resolves for a present env and rejects for a missing one. + await expect( + engine.controlPlaneResolver.assertEnvExists(environment.id) + ).resolves.toBeUndefined(); + await expect( + engine.controlPlaneResolver.assertEnvExists("env_does_not_exist") + ).rejects.toThrow(); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "the default passthrough resolveAuthenticatedEnv returns the slim env + git, null for a missing id", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const env = await engine.controlPlaneResolver.resolveAuthenticatedEnv(environment.id); + expect(env).not.toBeNull(); + expect(env!.id).toBe(environment.id); + expect(env!.slug).toBe(environment.slug); + expect(env!.type).toBe("PRODUCTION"); + expect(env!.organizationId).toBe(environment.organizationId); + expect(env!.projectId).toBe(environment.projectId); + expect(env!.branchName).toBeNull(); + expect(env!.git).toBeNull(); + expect(env!.project.id).toBe(environment.projectId); + expect(env!.project.organizationId).toBe(environment.organizationId); + expect(env!.organization.id).toBe(environment.organizationId); + // concurrencyLimitBurstFactor is coerced to a plain number by the mapping. 
+ expect(typeof env!.concurrencyLimitBurstFactor).toBe("number"); + + const missing = await engine.controlPlaneResolver.resolveAuthenticatedEnv("env_nope"); + expect(missing).toBeNull(); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "uses an explicitly injected resolver as-is, visible to systems via this.$", + async ({ prisma, redisOptions }) => { + const sentinelEnv: ResolvedEngineEnv = { + id: "env_sentinel", + type: "PRODUCTION", + archivedAt: null, + maximumConcurrencyLimit: 7, + concurrencyLimitBurstFactor: new Prisma.Decimal(3), + projectId: "proj_sentinel", + organizationId: "org_sentinel", + project: { id: "proj_sentinel" }, + organization: { id: "org_sentinel" }, + }; + + const sentinel: ControlPlaneResolver = { + async resolveEnv(): Promise<ResolvedEngineEnv | null> { + return sentinelEnv; + }, + async resolveAuthenticatedEnv(): Promise<ResolvedAuthenticatedEnv | null> { + return null; + }, + async resolveWorkerVersion(): Promise<ResolvedWorkerVersion | null> { + return null; + }, + async assertEnvExists(): Promise<void> {}, + }; + + const engine = new RunEngine({ + ...createEngineOptions(redisOptions, prisma), + controlPlaneResolver: sentinel, + }); + + try { + // The engine holds exactly the injected instance... + expect(engine.controlPlaneResolver).toBe(sentinel); + // ...and the systems received it via the shared SystemResources (this.$). 
+ expect((engine.dequeueSystem as any).$.controlPlaneResolver).toBe(sentinel); + expect((engine.waitpointSystem as any).$.controlPlaneResolver).toBe(sentinel); + + const resolved = await engine.controlPlaneResolver.resolveEnv("anything"); + expect(resolved).toBe(sentinelEnv); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/datetimeWaitpointColocation.test.ts b/internal-packages/run-engine/src/engine/tests/datetimeWaitpointColocation.test.ts new file mode 100644 index 00000000000..04680161881 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/datetimeWaitpointColocation.test.ts @@ -0,0 +1,402 @@ +// DATETIME / MANUAL waitpoint co-location with the owning run (run-ops split). +// +// The bug: `wait.for`/`wait.until` (DATETIME) and wait-token (MANUAL) waitpoints over the ~5s +// checkpoint threshold hang a ksuid run forever. `createDateTimeWaitpoint`/`createManualWaitpoint` +// mint an ALWAYS-cuid WaitpointId, and the routing store routed the upsert by that id → #legacy, +// even though the owning ksuid run lives on #new. `blockRunWithWaitpoint` then writes its block edge +// on #new (routed by run id), but the CTE joins `Waitpoint` LOCALLY on #new — where the +// waitpoint does not exist — so it writes 0 edges and the run is never actually blocked nor resumed. +// +// The fix: thread the owning `runId` into `createDateTimeWaitpoint`/`createManualWaitpoint` and route +// the waitpoint upsert by that run id, co-locating the waitpoint with its run on #new, exactly like +// RUN waitpoints already co-locate via `completedByTaskRunId` and the block edge co-locates via the +// run id. RED before the fix (waitpoint on #legacy, 0 edges, never resumes); GREEN after (waitpoint +// on #new, edge resolves, run resumes after completion). 
+ +import { + heteroRunOpsPostgresTest, + network, + redisContainer, + redisOptions, +} from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { PostgresRunStore, RoutingRunStore, type CreateRunInput } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { expect, vi } from "vitest"; +import { RunEngine } from "../index.js"; + +const twoDbEngineTest = heteroRunOpsPostgresTest.extend<{ + redisContainer: any; + redisOptions: any; +}>({ + network, + redisContainer, + redisOptions, +}); + +// ksuid (27-char internal id) → classified NEW → routed to the run-ops (#new) store. +const KSUID_A = "k".repeat(27); +const KSUID_B = "m".repeat(27); +const KSUID_C = "n".repeat(27); +const KSUID_D = "p".repeat(27); + +function baseEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +async function seedControlPlaneEnv(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + 
apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 10, + }, + }); + return { organization, project, environment }; +} + +function buildCreateRunInput(params: { + runId: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; +}): CreateRunInput { + return { + data: { + id: params.runId, + engine: "V2", + status: "EXECUTING", + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "PRODUCTION", + organizationId: params.organizationId, + projectId: params.projectId, + taskIdentifier: "parent-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: `trace_${params.runId}`, + spanId: `span_${params.runId}`, + runTags: [], + queue: "task/parent-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: params.runtimeEnvironmentId, + environmentType: "PRODUCTION", + projectId: params.projectId, + organizationId: params.organizationId, + }, + }; +} + +// Seed an EXECUTING ksuid run on #new (prisma17) via the routed store. Returns the seeded env. 
+async function seedExecutingKsuidRun( + prisma14: PrismaClient, + router: RoutingRunStore, + runId: string, + suffix: string +) { + const env = await seedControlPlaneEnv(prisma14, suffix); + + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_${suffix}`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + }) + ); + + const created = await router.findLatestExecutionSnapshot(runId); + await router.createExecutionSnapshot( + { + run: { id: runId, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING", description: "run executing" }, + previousSnapshotId: created!.id, + environmentId: env.environment.id, + environmentType: "PRODUCTION", + projectId: env.project.id, + organizationId: env.organization.id, + }, + prisma14 + ); + + return env; +} + +function makeRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const newStore = new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); + const legacyStore = new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); + return new RoutingRunStore({ new: newStore, legacy: legacyStore }); +} + +describe("DATETIME/MANUAL waitpoint co-location with the owning run (two physical DBs)", () => { + // RED before fix: the DATETIME waitpoint created for a ksuid run lands on #legacy (routed by its + // own cuid id), so the block edge (on #new) finds no local waitpoint and the run never blocks/resumes. + // GREEN after: the waitpoint co-locates on #new, the edge resolves, and the run resumes once the + // datetime waitpoint completes via the engine's finishWaitpoint timer. 
+ twoDbEngineTest( + "createDateTimeWaitpoint co-locates the waitpoint on #new and the run resumes after completion", + async ({ prisma14, prisma17, redisOptions }) => { + const p14 = prisma14 as unknown as PrismaClient; + const router = makeRouter(p14, prisma17); + const engine = new RunEngine({ store: router, ...baseEngineOptions(redisOptions, prisma14) }); + + try { + const runId = `run_${KSUID_A}`; + const env = await seedExecutingKsuidRun(p14, router, runId, "dta"); + + // ~600ms out so the finishWaitpoint timer fires within the test window. + const date = new Date(Date.now() + 600); + const { waitpoint } = await engine.createDateTimeWaitpoint({ + runId, + projectId: env.project.id, + environmentId: env.environment.id, + completedAfter: date, + }); + + // CO-LOCATION: the waitpoint must live on #new next to the run. + const onNew = await prisma17.waitpoint.findUnique({ where: { id: waitpoint.id } }); + const onLegacy = await p14.waitpoint.findUnique({ where: { id: waitpoint.id } }); + expect(onNew).not.toBeNull(); // RED: null (routed to #legacy by cuid id-shape) + expect(onLegacy).toBeNull(); // RED: the waitpoint is here instead + + // Block the run on it — the edge co-locates on #new and the CTE joins the local waitpoint. + await engine.blockRunWithWaitpoint({ + runId, + waitpoints: waitpoint.id, + projectId: env.project.id, + organizationId: env.organization.id, + }); + + const edgesOnNew = await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } }); + expect(edgesOnNew).toBe(1); // RED: 0 (no local waitpoint to join) + + const blocked = await engine.getRunExecutionData({ runId }); + expect(blocked?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // The finishWaitpoint timer completes the waitpoint and the run resumes to EXECUTING. 
+ await vi.waitFor( + async () => { + const ed = await engine.getRunExecutionData({ runId }); + expect(ed?.snapshot.executionStatus).toBe("EXECUTING"); + }, + { timeout: 10_000, interval: 100 } + ); + + const completed = await prisma17.waitpoint.findUnique({ where: { id: waitpoint.id } }); + expect(completed?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // MANUAL (wait-token) analog: the waitpoint co-locates with the owning run on #new, the run blocks, + // and an explicit engine.completeWaitpoint resumes it. + twoDbEngineTest( + "createManualWaitpoint co-locates the token on #new and the run resumes after completeWaitpoint", + async ({ prisma14, prisma17, redisOptions }) => { + const p14 = prisma14 as unknown as PrismaClient; + const router = makeRouter(p14, prisma17); + const engine = new RunEngine({ store: router, ...baseEngineOptions(redisOptions, prisma14) }); + + try { + const runId = `run_${KSUID_B}`; + const env = await seedExecutingKsuidRun(p14, router, runId, "mna"); + + const { waitpoint } = await engine.createManualWaitpoint({ + runId, + environmentId: env.environment.id, + projectId: env.project.id, + }); + + const onNew = await prisma17.waitpoint.findUnique({ where: { id: waitpoint.id } }); + const onLegacy = await p14.waitpoint.findUnique({ where: { id: waitpoint.id } }); + expect(onNew).not.toBeNull(); // RED: null + expect(onLegacy).toBeNull(); + + await engine.blockRunWithWaitpoint({ + runId, + waitpoints: waitpoint.id, + projectId: env.project.id, + organizationId: env.organization.id, + }); + + expect(await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } })).toBe(1); + expect((await engine.getRunExecutionData({ runId }))?.snapshot.executionStatus).toBe( + "EXECUTING_WITH_WAITPOINTS" + ); + + await engine.completeWaitpoint({ + id: waitpoint.id, + output: { value: '{"ok":true}', type: "application/json", isError: false }, + }); + + await vi.waitFor( + async () => { + const ed = await 
engine.getRunExecutionData({ runId }); + expect(ed?.snapshot.executionStatus).toBe("EXECUTING"); + }, + { timeout: 10_000, interval: 100 } + ); + } finally { + await engine.quit(); + } + } + ); + + // Idempotency-keyed path (no deferral). A DATETIME waitpoint created twice with the same + // (env, idempotencyKey) for the SAME run dedups within the run's own store on #new — the second + // call returns the cached #new-resident waitpoint, never a phantom #legacy row. + twoDbEngineTest( + "idempotency-keyed createDateTimeWaitpoint dedups within the owning run's store on #new", + async ({ prisma14, prisma17, redisOptions }) => { + const p14 = prisma14 as unknown as PrismaClient; + const router = makeRouter(p14, prisma17); + const engine = new RunEngine({ store: router, ...baseEngineOptions(redisOptions, prisma14) }); + + try { + const runId = `run_${KSUID_C}`; + const env = await seedExecutingKsuidRun(p14, router, runId, "idem"); + const idempotencyKey = "dedup-key-1"; + const date = new Date(Date.now() + 60_000); + + const first = await engine.createDateTimeWaitpoint({ + runId, + projectId: env.project.id, + environmentId: env.environment.id, + completedAfter: date, + idempotencyKey, + }); + expect(first.isCached).toBe(false); + + const second = await engine.createDateTimeWaitpoint({ + runId, + projectId: env.project.id, + environmentId: env.environment.id, + completedAfter: date, + idempotencyKey, + }); + expect(second.isCached).toBe(true); + expect(second.waitpoint.id).toBe(first.waitpoint.id); + + // Both the dedup probe and the create must target #new — exactly one row, and it is on #new. 
+        expect(
+          await prisma17.waitpoint.findUnique({ where: { id: first.waitpoint.id } })
+        ).not.toBeNull();
+        expect(await p14.waitpoint.findUnique({ where: { id: first.waitpoint.id } })).toBeNull();
+        expect(
+          await prisma17.waitpoint.count({
+            where: { environmentId: env.environment.id, idempotencyKey },
+          })
+        ).toBe(1);
+        expect(
+          await p14.waitpoint.count({
+            where: { environmentId: env.environment.id, idempotencyKey },
+          })
+        ).toBe(0);
+      } finally {
+        await engine.quit();
+      }
+    }
+  );
+
+  // Idempotency-keyed MANUAL analog with the same per-run-DB dedup invariant.
+  twoDbEngineTest(
+    "idempotency-keyed createManualWaitpoint dedups within the owning run's store on #new",
+    async ({ prisma14, prisma17, redisOptions }) => {
+      const p14 = prisma14 as unknown as PrismaClient;
+      const router = makeRouter(p14, prisma17);
+      const engine = new RunEngine({ store: router, ...baseEngineOptions(redisOptions, prisma14) });
+
+      try {
+        const runId = `run_${KSUID_D}`;
+        const env = await seedExecutingKsuidRun(p14, router, runId, "idemm");
+        const idempotencyKey = "dedup-key-2";
+
+        const first = await engine.createManualWaitpoint({
+          runId,
+          environmentId: env.environment.id,
+          projectId: env.project.id,
+          idempotencyKey,
+        });
+        expect(first.isCached).toBe(false);
+
+        const second = await engine.createManualWaitpoint({
+          runId,
+          environmentId: env.environment.id,
+          projectId: env.project.id,
+          idempotencyKey,
+        });
+        expect(second.isCached).toBe(true);
+        expect(second.waitpoint.id).toBe(first.waitpoint.id);
+
+        // Exactly one row for this key, and it is on #new.
+        expect(
+          await prisma17.waitpoint.findUnique({ where: { id: first.waitpoint.id } })
+        ).not.toBeNull();
+        expect(await p14.waitpoint.findUnique({ where: { id: first.waitpoint.id } })).toBeNull();
+        expect(
+          await prisma17.waitpoint.count({
+            where: { environmentId: env.environment.id, idempotencyKey },
+          })
+        ).toBe(1);
+        // Mirror the DATETIME case: a row under a different id for the same (env, key) on #legacy
+        // would slip past the findUnique-by-id check above, so count by key as well.
+        expect(
+          await p14.waitpoint.count({
+            where: { environmentId: env.environment.id, idempotencyKey },
+          })
+        ).toBe(0);
+      } finally {
+        await engine.quit();
+      }
+    }
+  );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/debounce.test.ts
b/internal-packages/run-engine/src/engine/tests/debounce.test.ts index 089756c5bcd..d926ddecb34 100644 --- a/internal-packages/run-engine/src/engine/tests/debounce.test.ts +++ b/internal-packages/run-engine/src/engine/tests/debounce.test.ts @@ -1995,11 +1995,11 @@ describe("RunEngine debounce", () => { // Construct the Redis key (same format as DebounceSystem.getDebounceRedisKey) const redisKey = `${environmentId}:${taskIdentifier}:${debounceKey}`; - // Step 1: Server A claims the key with claimId-A + // Server A claims the key with claimId-A const claimIdA = "claim-server-A"; await simulatedServerRedis.set(redisKey, `pending:${claimIdA}`, "PX", 60_000); - // Step 2 & 3: Simulate Server B claiming and registering (after A's claim "expires") + // Simulate Server B claiming and registering (after A's claim "expires") // In reality, this simulates the race where B's claim overwrites A's pending claim const runIdB = "run_server_B"; await simulatedServerRedis.set(redisKey, runIdB, "PX", 60_000); @@ -2008,7 +2008,7 @@ describe("RunEngine debounce", () => { const valueAfterB = await simulatedServerRedis.get(redisKey); expect(valueAfterB).toBe(runIdB); - // Step 4: Server A attempts to register with its stale claimId-A + // Server A attempts to register with its stale claimId-A // This should FAIL because the key no longer contains "pending:claim-server-A" const runIdA = "run_server_A"; const registered = await engine.debounceSystem.registerDebouncedRun({ @@ -2020,10 +2020,10 @@ describe("RunEngine debounce", () => { claimId: claimIdA, // Stale claim ID }); - // Step 5: Verify Server A's registration failed + // Verify Server A's registration failed expect(registered).toBe(false); - // Step 6: Verify Redis still contains runId-B (not overwritten by Server A) + // Verify Redis still contains runId-B (not overwritten by Server A) const finalValue = await simulatedServerRedis.get(redisKey); expect(finalValue).toBe(runIdB); } finally { @@ -2097,14 +2097,14 @@ 
describe("RunEngine debounce", () => { // Construct the Redis key (same format as DebounceSystem.getDebounceRedisKey) const redisKey = `${environmentId}:${taskIdentifier}:${debounceKey}`; - // Step 1: Server A claims the key with a pending claim + // Server A claims the key with a pending claim const claimIdA = "claim-server-A"; await simulatedServerRedis.set(redisKey, `pending:${claimIdA}`, "PX", 60_000); - // Step 2: Delete the key to simulate Server A's claim expiring + // Delete the key to simulate Server A's claim expiring await simulatedServerRedis.del(redisKey); - // Step 3: Server B calls handleDebounce - since key is gone, it should atomically claim + // Server B calls handleDebounce - since key is gone, it should atomically claim const debounceResult = await engine.debounceSystem.handleDebounce({ environmentId, taskIdentifier, @@ -2114,18 +2114,18 @@ describe("RunEngine debounce", () => { }, }); - // Step 4: Verify result is { status: "new" } WITH a claimId + // Verify result is { status: "new" } WITH a claimId expect(debounceResult.status).toBe("new"); if (debounceResult.status === "new") { expect(debounceResult.claimId).toBeDefined(); expect(typeof debounceResult.claimId).toBe("string"); expect(debounceResult.claimId!.length).toBeGreaterThan(0); - // Step 5: Verify the key now contains Server B's pending claim + // Verify the key now contains Server B's pending claim const valueAfterB = await simulatedServerRedis.get(redisKey); expect(valueAfterB).toBe(`pending:${debounceResult.claimId}`); - // Step 6: Server C tries to claim the same key - should fail + // Server C tries to claim the same key - should fail const claimIdC = "claim-server-C"; const claimResultC = await simulatedServerRedis.set( redisKey, @@ -2136,7 +2136,7 @@ describe("RunEngine debounce", () => { ); expect(claimResultC).toBeNull(); // NX fails because key exists - // Step 7: Server B registers its run using its claimId + // Server B registers its run using its claimId const runIdB = 
"run_server_B"; const delayUntil = new Date(Date.now() + 60_000); const registered = await engine.debounceSystem.registerDebouncedRun({ @@ -2148,10 +2148,10 @@ describe("RunEngine debounce", () => { claimId: debounceResult.claimId, }); - // Step 8: Verify Server B's registration succeeded + // Verify Server B's registration succeeded expect(registered).toBe(true); - // Step 9: Verify Redis contains Server B's run ID + // Verify Redis contains Server B's run ID const finalValue = await simulatedServerRedis.get(redisKey); expect(finalValue).toBe(runIdB); } diff --git a/internal-packages/run-engine/src/engine/tests/delayedRunSystem.controlPlaneResolver.test.ts b/internal-packages/run-engine/src/engine/tests/delayedRunSystem.controlPlaneResolver.test.ts new file mode 100644 index 00000000000..2d35d37db27 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/delayedRunSystem.controlPlaneResolver.test.ts @@ -0,0 +1,175 @@ +// Cross-DB inversion proof for the delayTTL env include (#enqueueDelayedRun). +// Cloud topology: run-ops = new DB (PG17, cross-seam FKs DROPPED), control-plane = legacy DB (PG14). +// The env/project/organization live on PG14; the run-ops scalar row on PG17. The +// PassthroughControlPlaneResolver over PG14 resolves the env half (used for enqueueRun + the +// runEnqueuedAfterDelay emit) while the run scalars come from PG17 — no cross-DB join. The DB is +// never mocked. A single-DB passthrough case proves a delayed run becomes QUEUED byte-identically. 
+import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers";
+import { trace } from "@internal/tracing";
+import type { PrismaClient } from "@trigger.dev/database";
+import { setTimeout } from "node:timers/promises";
+import { expect } from "vitest";
+import { PassthroughControlPlaneResolver } from "../controlPlaneResolver.js";
+import { PostgresRunStore } from "@internal/run-store";
+import { RunEngine } from "../index.js";
+import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js";
+
+vi.setConfig({ testTimeout: 60_000 });
+
+/**
+ * Builds the options object passed to `new RunEngine(...)` in the single-DB test below.
+ * The same Redis options back the worker, the queue and the run lock; the worker polls every
+ * 100ms; a single "small-1x" machine preset is configured; a throwaway "test" tracer is used.
+ */
+function createEngineOptions(redisOptions: any, prisma: any) {
+  return {
+    prisma,
+    worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 },
+    queue: { redis: redisOptions },
+    runLock: { redis: redisOptions },
+    machines: {
+      defaultMachine: "small-1x" as const,
+      machines: {
+        "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 },
+      },
+      baseCostInCents: 0.0001,
+    },
+    tracer: trace.getTracer("test", "0.0.0"),
+  };
+}
+
+/** Names of the TaskRun foreign-key constraints that `dropTaskRunCrossSeamFks` removes. */
+const TASK_RUN_CROSS_SEAM_FKS = [
+  "TaskRun_runtimeEnvironmentId_fkey",
+  "TaskRun_projectId_fkey",
+  "TaskRun_organizationId_fkey",
+] as const;
+
+/**
+ * Drops each constraint in TASK_RUN_CROSS_SEAM_FKS from the "TaskRun" table on the given client,
+ * one statement at a time. `IF EXISTS` makes a repeated call a no-op. The interpolated names come
+ * only from the constant list above, never from external input.
+ */
+async function dropTaskRunCrossSeamFks(prisma: PrismaClient) {
+  for (const constraint of TASK_RUN_CROSS_SEAM_FKS) {
+    await prisma.$executeRawUnsafe(
+      `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"`
+    );
+  }
+}
+
+/**
+ * Creates an organization, a project in it, and a PRODUCTION runtime environment in that project
+ * directly on the given client (in that order, since each row references the previous one).
+ *
+ * @param suffix Appended to every slug / key / shortcode so separate seeds do not collide.
+ * @returns The three created rows as `{ organization, project, environment }`.
+ */
+async function seedControlPlaneEnv(prisma: PrismaClient, suffix: string) {
+  const organization = await prisma.organization.create({
+    data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
+  });
+  const project = await prisma.project.create({
+    data: {
+      name: `Project ${suffix}`,
+      slug: `project-${suffix}`,
+      externalRef: `proj_${suffix}`,
+      organizationId: organization.id,
+    },
+  });
+  const environment = await prisma.runtimeEnvironment.create({
+    data: {
+      type: "PRODUCTION",
+      slug: `prod-${suffix}`,
+      projectId: project.id,
+      organizationId: organization.id,
+      apiKey: `tr_prod_${suffix}`,
+      pkApiKey: `pk_prod_${suffix}`,
+      shortcode: `short_${suffix}`,
+    },
+  });
+  return { organization, project, environment };
+}
+
+describe("DelayedRunSystem controlPlaneResolver (hetero cross-DB)", () => {
+  heteroPostgresTest(
+    "env resolves from PG14 (control-plane) while the run scalars live on PG17 (no cross-DB join)",
+    async ({ prisma14, prisma17 }) => {
+      await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient);
+      const cp = await seedControlPlaneEnv(prisma14 as unknown as PrismaClient, "cpdl");
+
+      const runId = "run_cpdl_pg17";
+      await (prisma17 as unknown as PrismaClient).taskRun.create({
+        data: {
+          id: runId,
+          engine: "V2",
+          status: "DELAYED",
+          friendlyId: "run_friendly_cpdl",
+          runtimeEnvironmentId: cp.environment.id,
+          organizationId: cp.organization.id,
+          projectId: cp.project.id,
+          taskIdentifier: "my-task",
+          payload: "{}",
+          payloadType: "application/json",
+          queue: "task/my-task",
+          traceId: "trace_cpdl",
+          spanId: "span_cpdl",
+        },
+      });
+
+      const runStore = new PostgresRunStore({
+        prisma: prisma17 as unknown as PrismaClient,
+        readOnlyPrisma: prisma17 as unknown as PrismaClient,
+      });
+      const resolver = new PassthroughControlPlaneResolver({
+        prisma: prisma14 as unknown as PrismaClient,
+      });
+
+      const run = await runStore.findRun({ id: runId });
+      assertNonNullable(run);
+      expect(run.runtimeEnvironmentId).toBe(cp.environment.id);
+
+      // The env resolves from PG14 — exactly the fields #enqueueDelayedRun reads (type for the DEV
+      // TTL branch; organizationId/projectId for the runEnqueuedAfterDelay emit; the env object
+      // for enqueueRun).
+      const env = await resolver.resolveEnv(run.runtimeEnvironmentId);
+      assertNonNullable(env);
+      expect(env.type).toBe("PRODUCTION");
+      expect(env.organizationId).toBe(cp.organization.id);
+      expect(env.projectId).toBe(cp.project.id);
+
+      // Inversion: the run-ops DB (PG17) holds no env row; a co-located join would resolve null.
+      expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0);
+    }
+  );
+});
+
+describe("DelayedRunSystem controlPlaneResolver (single-DB passthrough)", () => {
+  containerTest(
+    "a delayed run becomes QUEUED byte-identically through the resolved env",
+    async ({ prisma, redisOptions }) => {
+      const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine(createEngineOptions(redisOptions, prisma));
+
+      try {
+        const taskIdentifier = "test-task";
+        await setupBackgroundWorker(engine, environment, taskIdentifier);
+
+        const run = await engine.trigger(
+          {
+            number: 1,
+            friendlyId: "run_cpdlpassthru1",
+            environment,
+            taskIdentifier,
+            payload: "{}",
+            payloadType: "application/json",
+            context: {},
+            traceContext: {},
+            traceId: "t-cpdl",
+            spanId: "s-cpdl",
+            workerQueue: "main",
+            queue: "task/test-task",
+            isTest: false,
+            tags: [],
+            delayUntil: new Date(Date.now() + 500),
+          },
+          prisma
+        );
+
+        const delayed = await engine.getRunExecutionData({ runId: run.id });
+        assertNonNullable(delayed);
+        expect(delayed.snapshot.executionStatus).toBe("DELAYED");
+
+        // Poll (bounded) instead of sleeping a fixed 1s: the run is due ~500ms after trigger, but
+        // the enqueue happens asynchronously on the engine worker, so a fixed sleep is a race on a
+        // slow machine. The final assertion still fails if QUEUED is never reached.
+        const deadline = Date.now() + 10_000;
+        let queued = await engine.getRunExecutionData({ runId: run.id });
+        while (queued?.snapshot.executionStatus !== "QUEUED" && Date.now() < deadline) {
+          await setTimeout(100);
+          queued = await engine.getRunExecutionData({ runId: run.id });
+        }
+
+        assertNonNullable(queued);
+        expect(queued.snapshot.executionStatus).toBe("QUEUED");
+      } finally {
+        await engine.quit();
+      }
+    }
+  );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/dequeueSystem.controlPlaneResolver.test.ts b/internal-packages/run-engine/src/engine/tests/dequeueSystem.controlPlaneResolver.test.ts
new file mode 100644
index 00000000000..ee0b4147aa7
--- /dev/null
+++
b/internal-packages/run-engine/src/engine/tests/dequeueSystem.controlPlaneResolver.test.ts @@ -0,0 +1,540 @@ +// Cross-DB inversion proof for the dequeue control-plane join (#getRunWithBackgroundWorkerTasks). +// Cloud topology: run-ops = the new DB (PG17, cross-seam FKs DROPPED), control-plane = the legacy +// DB (PG14, FKs retained). The env + worker version (deployment/tasks/queues) live on PG14; the +// run-ops scalar row lives on PG17 with no env/worker present. The PassthroughControlPlaneResolver +// over PG14 resolves the control-plane half while the PostgresRunStore over PG17 resolves the run +// scalars — proving the two halves resolve from separate providers with NO cross-DB join. The DB +// is never mocked. A single-DB passthrough case proves the engine dequeue is byte-identical. +import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { DequeuedMessage } from "@trigger.dev/core/v3"; +import { + CURRENT_DEPLOYMENT_LABEL, + generateFriendlyId, + sanitizeQueueName, +} from "@trigger.dev/core/v3/isomorphic"; +import type { PrismaClient } from "@trigger.dev/database"; +import { setTimeout } from "node:timers/promises"; +import { expect } from "vitest"; +import { PassthroughControlPlaneResolver } from "../controlPlaneResolver.js"; +import { PostgresRunStore } from "@internal/run-store"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + 
name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** + * Seed a control-plane env + a promoted MANAGED deployment with worker/tasks/queues directly on a + * client (no engine), so the control-plane half can live on a DISTINCT provider from the run-ops + * row. Mirrors setup.ts's PRODUCTION deployment path. + */ +async function seedControlPlane(prisma: PrismaClient, suffix: string, taskSlug: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 10, + }, + }); + + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: generateFriendlyId("worker"), + contentHash: "hash", + projectId: project.id, + runtimeEnvironmentId: environment.id, + version: "20240101.1", + metadata: {}, + engine: "V2", + }, + }); + + const task = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: generateFriendlyId("task"), + slug: taskSlug, + filePath: `/trigger/${taskSlug}.ts`, + exportName: taskSlug, + workerId: worker.id, + runtimeEnvironmentId: environment.id, + projectId: project.id, + retryConfig: { maxAttempts: 3, factor: 1, minTimeoutInMs: 100, maxTimeoutInMs: 100 }, + }, + }); + + const queueName = sanitizeQueueName(`task/${taskSlug}`); + const queue = await prisma.taskQueue.create({ + data: { + friendlyId: generateFriendlyId("queue"), + name: queueName, + 
concurrencyLimit: 10, + runtimeEnvironmentId: environment.id, + projectId: project.id, + type: "VIRTUAL", + workers: { connect: { id: worker.id } }, + tasks: { connect: { id: task.id } }, + }, + }); + + const deployment = await prisma.workerDeployment.create({ + data: { + friendlyId: generateFriendlyId("deployment"), + contentHash: worker.contentHash, + version: worker.version, + shortCode: `short_code_${worker.version}`, + imageReference: `trigger/${project.externalRef}:${worker.version}.${environment.slug}`, + status: "DEPLOYED", + projectId: project.id, + environmentId: environment.id, + workerId: worker.id, + type: "MANAGED", + }, + }); + + await prisma.workerDeploymentPromotion.create({ + data: { + deploymentId: deployment.id, + environmentId: environment.id, + label: CURRENT_DEPLOYMENT_LABEL, + }, + }); + + return { organization, project, environment, worker, task, queue, deployment, queueName }; +} + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const constraint of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe( + `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"` + ); + } +} + +describe("DequeueSystem controlPlaneResolver (hetero cross-DB)", () => { + heteroPostgresTest( + "env + worker version resolve from PG14 while the run scalars resolve from PG17 (no cross-DB join)", + async ({ prisma14, prisma17 }) => { + const taskSlug = "test-task"; + // Cloud shape: drop the run-ops -> control-plane Cascade FKs on the run-ops (new) DB only. 
+ await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + + const cp = await seedControlPlane(prisma14 as unknown as PrismaClient, "cpdq", taskSlug); + + // The run-ops row lives ONLY on PG17, which holds NO env/worker/deployment rows, so any + // in-DB join against PG17 would resolve null — the resolver against PG14 is the only path. + const runId = "run_cpdq_pg17"; + await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: `run_friendly_cpdq`, + runtimeEnvironmentId: cp.environment.id, + organizationId: cp.organization.id, + projectId: cp.project.id, + taskIdentifier: taskSlug, + payload: "{}", + payloadType: "application/json", + queue: cp.queueName, + traceId: "trace_cpdq", + spanId: "span_cpdq", + }, + }); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + // Run-ops scalars resolve from PG17. + const run = await runStore.findRun( + { id: runId }, + { + select: { + id: true, + taskIdentifier: true, + runtimeEnvironmentId: true, + queue: true, + }, + } + ); + assertNonNullable(run); + expect(run.id).toBe(runId); + expect(run.runtimeEnvironmentId).toBe(cp.environment.id); + + // The control-plane env resolves from PG14. + const env = await resolver.resolveEnv(run.runtimeEnvironmentId); + assertNonNullable(env); + expect(env.id).toBe(cp.environment.id); + expect(env.type).toBe("PRODUCTION"); + + // The worker version (promoted MANAGED deployment + tasks + queues) resolves from PG14. 
+ const version = await resolver.resolveWorkerVersion({ + environmentId: run.runtimeEnvironmentId, + type: env.type, + }); + assertNonNullable(version); + expect(version.worker.id).toBe(cp.worker.id); + expect(version.deployment?.id).toBe(cp.deployment.id); + expect(version.tasks.find((t) => t.slug === run.taskIdentifier)?.id).toBe(cp.task.id); + expect(version.queues.find((q) => q.name === run.queue)?.id).toBe(cp.queue.id); + + // Proof of inversion: the run-ops DB (PG17) has no env/worker rows; a co-located join on + // PG17 would have resolved null. The run row is absent from the control-plane DB (PG14). + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma17 as unknown as PrismaClient).backgroundWorker.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); + +describe("DequeueSystem controlPlaneResolver (latest-v2 fallback + workerId branches)", () => { + // Deployed + no workerId, where the CURRENT-promoted deployment is NOT MANAGED. + // #getManagedWorkerFromCurrentlyPromotedDeployment must fall back to the latest MANAGED + // WorkerDeployment for the env (controlPlaneResolver.ts ~line 244). Every other test promotes a + // MANAGED deployment, so this fallback branch was previously unexercised. 
+ containerTest( + "resolveWorkerVersion (deployed, no workerId) falls back to the latest MANAGED deployment when the promoted one is not MANAGED", + async ({ prisma }) => { + const taskSlug = "test-task"; + + const organization = await prisma.organization.create({ + data: { title: "Org fallback", slug: "org-fallback" }, + }); + const project = await prisma.project.create({ + data: { + name: "Project fallback", + slug: "project-fallback", + externalRef: "proj_fallback", + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: "prod-fallback", + projectId: project.id, + organizationId: organization.id, + apiKey: "tr_prod_fallback", + pkApiKey: "pk_prod_fallback", + shortcode: "short_fallback", + maximumConcurrencyLimit: 10, + }, + }); + + // The CURRENT-promoted deployment is a NON-MANAGED (V1) deployment with its own worker. + const promotedWorker = await prisma.backgroundWorker.create({ + data: { + friendlyId: generateFriendlyId("worker"), + contentHash: "hash-v1", + projectId: project.id, + runtimeEnvironmentId: environment.id, + version: "20240101.1", + metadata: {}, + engine: "V1", + }, + }); + const promotedDeployment = await prisma.workerDeployment.create({ + data: { + friendlyId: generateFriendlyId("deployment"), + contentHash: promotedWorker.contentHash, + version: promotedWorker.version, + shortCode: "short_code_v1", + imageReference: `trigger/${project.externalRef}:v1.${environment.slug}`, + status: "DEPLOYED", + projectId: project.id, + environmentId: environment.id, + workerId: promotedWorker.id, + type: "V1", + }, + }); + await prisma.workerDeploymentPromotion.create({ + data: { + deploymentId: promotedDeployment.id, + environmentId: environment.id, + label: CURRENT_DEPLOYMENT_LABEL, + }, + }); + + // A SEPARATE, later (higher id) MANAGED deployment + worker with tasks/queues. This is the + // latest-v2 deployment the fallback must select. 
+ const managedWorker = await prisma.backgroundWorker.create({ + data: { + friendlyId: generateFriendlyId("worker"), + contentHash: "hash-managed", + projectId: project.id, + runtimeEnvironmentId: environment.id, + version: "20240101.2", + metadata: {}, + engine: "V2", + }, + }); + const managedTask = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: generateFriendlyId("task"), + slug: taskSlug, + filePath: `/trigger/${taskSlug}.ts`, + exportName: taskSlug, + workerId: managedWorker.id, + runtimeEnvironmentId: environment.id, + projectId: project.id, + retryConfig: { maxAttempts: 3, factor: 1, minTimeoutInMs: 100, maxTimeoutInMs: 100 }, + }, + }); + const managedQueueName = sanitizeQueueName(`task/${taskSlug}`); + const managedQueue = await prisma.taskQueue.create({ + data: { + friendlyId: generateFriendlyId("queue"), + name: managedQueueName, + concurrencyLimit: 10, + runtimeEnvironmentId: environment.id, + projectId: project.id, + type: "VIRTUAL", + workers: { connect: { id: managedWorker.id } }, + tasks: { connect: { id: managedTask.id } }, + }, + }); + const managedDeployment = await prisma.workerDeployment.create({ + data: { + friendlyId: generateFriendlyId("deployment"), + contentHash: managedWorker.contentHash, + version: managedWorker.version, + shortCode: "short_code_managed", + imageReference: `trigger/${project.externalRef}:managed.${environment.slug}`, + status: "DEPLOYED", + projectId: project.id, + environmentId: environment.id, + workerId: managedWorker.id, + type: "MANAGED", + }, + }); + + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma as unknown as PrismaClient, + }); + + const version = await resolver.resolveWorkerVersion({ + environmentId: environment.id, + type: "PRODUCTION", + }); + + assertNonNullable(version); + // The fallback selects the MANAGED deployment/worker, NOT the promoted non-MANAGED one. 
+ expect(version.worker.id).toBe(managedWorker.id); + expect(version.worker.id).not.toBe(promotedWorker.id); + expect(version.deployment?.id).toBe(managedDeployment.id); + expect(version.deployment?.id).not.toBe(promotedDeployment.id); + // Tasks/queues come from the MANAGED worker. + expect(version.tasks.find((t) => t.slug === taskSlug)?.id).toBe(managedTask.id); + expect(version.queues.find((q) => q.name === managedQueueName)?.id).toBe(managedQueue.id); + } + ); + + // The dequeue hot path computes `workerId = run.lockedToVersionId ?? backgroundWorkerId` + // and passes it to resolveWorkerVersion. A locked-to-version run exercises the workerId branches, + // which no other test covers. + containerTest( + "resolveWorkerVersion (deployed, with workerId) returns that exact worker + deployment", + async ({ prisma }) => { + const taskSlug = "test-task"; + const cp = await seedControlPlane(prisma as unknown as PrismaClient, "wid", taskSlug); + + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma as unknown as PrismaClient, + }); + + // Covers #getWorkerDeploymentFromWorker. 
+ const version = await resolver.resolveWorkerVersion({ + environmentId: cp.environment.id, + type: "PRODUCTION", + workerId: cp.worker.id, + }); + + assertNonNullable(version); + expect(version.worker.id).toBe(cp.worker.id); + expect(version.deployment?.id).toBe(cp.deployment.id); + expect(version.tasks.find((t) => t.slug === taskSlug)?.id).toBe(cp.task.id); + expect(version.queues.find((q) => q.name === cp.queueName)?.id).toBe(cp.queue.id); + } + ); + + containerTest( + "resolveWorkerVersion (DEVELOPMENT, with workerId) returns that worker with deployment populated", + async ({ prisma }) => { + const organization = await prisma.organization.create({ + data: { title: "Org dev wid", slug: "org-dev-wid" }, + }); + const project = await prisma.project.create({ + data: { + name: "Project dev wid", + slug: "project-dev-wid", + externalRef: "proj_dev_wid", + organizationId: organization.id, + }, + }); + const devEnv = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev-wid", + projectId: project.id, + organizationId: organization.id, + apiKey: "tr_dev_wid", + pkApiKey: "pk_dev_wid", + shortcode: "short_dev_wid", + maximumConcurrencyLimit: 10, + }, + }); + + const devWorker = await prisma.backgroundWorker.create({ + data: { + friendlyId: generateFriendlyId("worker"), + contentHash: "hash-dev", + projectId: project.id, + runtimeEnvironmentId: devEnv.id, + version: "20240101.1", + metadata: {}, + engine: "V2", + }, + }); + const devDeployment = await prisma.workerDeployment.create({ + data: { + friendlyId: generateFriendlyId("deployment"), + contentHash: devWorker.contentHash, + version: devWorker.version, + shortCode: "short_code_dev", + imageReference: `trigger/${project.externalRef}:dev.${devEnv.slug}`, + status: "DEPLOYED", + projectId: project.id, + environmentId: devEnv.id, + workerId: devWorker.id, + type: "MANAGED", + }, + }); + + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma as unknown as PrismaClient, + 
}); + + // Covers #getWorkerById (which includes `deployment: true`). + const version = await resolver.resolveWorkerVersion({ + environmentId: devEnv.id, + type: "DEVELOPMENT", + workerId: devWorker.id, + }); + + assertNonNullable(version); + expect(version.worker.id).toBe(devWorker.id); + expect(version.deployment?.id).toBe(devDeployment.id); + } + ); +}); + +describe("DequeueSystem controlPlaneResolver (single-DB passthrough)", () => { + containerTest( + "default passthrough dequeue is byte-identical (resolves env + worker version end-to-end)", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + await engine.trigger( + { + number: 1, + friendlyId: "run_cpdqpassthru1", + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t-cpdq", + spanId: "s-cpdq", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + await setTimeout(500); + const dequeued: DequeuedMessage[] = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); + + expect(dequeued.length).toBe(1); + assertNonNullable(dequeued[0]); + // The resolved env + worker version flow into the message exactly as before. 
+ expect(dequeued[0].environment.id).toBe(environment.id); + expect(dequeued[0].environment.type).toBe("PRODUCTION"); + expect(dequeued[0].run.id).toBeDefined(); + expect(dequeued[0].backgroundWorker.id).toBeDefined(); + expect(dequeued[0].image).toBeDefined(); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/dequeueSystem.recovery.controlPlaneResolver.test.ts b/internal-packages/run-engine/src/engine/tests/dequeueSystem.recovery.controlPlaneResolver.test.ts new file mode 100644 index 00000000000..c4e058e7299 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/dequeueSystem.recovery.controlPlaneResolver.test.ts @@ -0,0 +1,104 @@ +import { assertNonNullable, heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import { sanitizeQueueName } from "@trigger.dev/core/v3/isomorphic"; +import type { PrismaClient } from "@trigger.dev/database"; +import { expect } from "vitest"; +import { PassthroughControlPlaneResolver } from "../controlPlaneResolver.js"; +import { PostgresRunStore } from "@internal/run-store"; + +vi.setConfig({ testTimeout: 60_000 }); + +/** Test fixture: on the given client, creates one organization, one project inside it and one PRODUCTION runtime environment (maximumConcurrencyLimit 10), deriving every name, slug and key from `suffix`. It also computes `queueName` by passing `task/<taskSlug>` through sanitizeQueueName; no queue row is written here. Returns { organization, project, environment, queueName }. NOTE(review): slugs and API keys are presumably unique, so use a distinct `suffix` per call against the same database - confirm against the schema. */ async function seedControlPlane(prisma: PrismaClient, suffix: string, taskSlug: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 10, + }, + }); + const queueName = sanitizeQueueName(`task/${taskSlug}`); + return { organization, project, environment, queueName }; +} 
+ +describe("DequeueSystem recovery controlPlaneResolver (hetero cross-DB, dedicated run-ops client)", () => { + heteroRunOpsPostgresTest( + "the nack/requeue recovery read resolves env via the resolver and never reads a null run.runtimeEnvironment relation", + async ({ prisma14, prisma17 }) => { + const taskSlug = "test-task"; + + // The dedicated run-ops schema has NO control-plane tables and NO cross-seam FKs, so there is + // nothing to drop on prisma17. Control-plane rows are seeded on PG14 only. + const cp = await seedControlPlane(prisma14 as unknown as PrismaClient, "recov", taskSlug); + + const runId = "run_recov_pg17"; + await prisma17.taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: "run_friendly_recov", + // scalar control-plane FK ids — no control-plane row exists on the dedicated DB. + runtimeEnvironmentId: cp.environment.id, + organizationId: cp.organization.id, + projectId: cp.project.id, + taskIdentifier: taskSlug, + payload: "{}", + payloadType: "application/json", + queue: cp.queueName, + traceId: "trace_recov", + spanId: "span_recov", + }, + }); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + schemaVariant: "dedicated", + }); + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + // Regression guard: an include-based read of the control-plane `runtimeEnvironment` relation + // is invalid on the dedicated subset client (the relation does not exist there), so it throws. + await expect( + runStore.findRun({ id: runId }, { include: { runtimeEnvironment: true } }) + ).rejects.toThrow(); + + // Fixed shape: scalars-only select resolved from the dedicated run-ops client + resolveEnv + // against PG14. 
+ const run = await runStore.findRun( + { id: runId }, + { select: { id: true, runtimeEnvironmentId: true, projectId: true } } + ); + assertNonNullable(run); + expect(run.id).toBe(runId); + expect(run.runtimeEnvironmentId).toBe(cp.environment.id); + expect(run.projectId).toBe(cp.project.id); + + const env = await resolver.resolveEnv(run.runtimeEnvironmentId); + assertNonNullable(env); + expect(env.id).toBe(cp.environment.id); + expect(env.type).toBe("PRODUCTION"); + expect(run.projectId).toBe(env.projectId); + + // Inversion proof: no run on PG14 (control-plane). + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/engineResidualInversions.controlPlaneResolver.test.ts b/internal-packages/run-engine/src/engine/tests/engineResidualInversions.controlPlaneResolver.test.ts new file mode 100644 index 00000000000..e448b96aa9e --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/engineResidualInversions.controlPlaneResolver.test.ts @@ -0,0 +1,100 @@ +import { assertNonNullable, heteroPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import { expect } from "vitest"; +import { PassthroughControlPlaneResolver } from "../controlPlaneResolver.js"; +import { PostgresRunStore } from "@internal/run-store"; + +vi.setConfig({ testTimeout: 60_000 }); + +/** Names of the TaskRun foreign-key constraints on runtimeEnvironmentId, projectId and organizationId. The test below drops them on the run-ops database so a run row can carry control-plane ids whose rows exist only in the other database. */ const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +/** Drops every constraint listed in TASK_RUN_CROSS_SEAM_FKS from "TaskRun", issuing one ALTER TABLE per name in sequence. `IF EXISTS` turns an already-missing constraint into a no-op, so the helper can be re-run. The names are interpolated into $executeRawUnsafe, which is tolerable only because they are the hard-coded constants above - never pass external input through this helper. */ async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const constraint of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe( + `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"` + ); + } +} + +/** Test fixture: on the given client, creates an organization, a project inside it and a PRODUCTION runtime environment (maximumConcurrencyLimit 10), deriving names, slugs and keys from `suffix`, and returns the three created rows as { organization, project, environment }. */ async function seedEnv(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + 
const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${suffix}`, + pkApiKey: `pk_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 10, + }, + }); + return { organization, project, environment }; +} + +describe("engine residual inversions controlPlaneResolver (hetero cross-DB)", () => { + heteroPostgresTest( + "resolveEnv covers ttl + parkPendingVersion env reads from the control-plane DB while runs live on the run-ops DB", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedEnv(prisma14 as unknown as PrismaClient, "resid"); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: "run_resid", + engine: "V2", + status: "PENDING", + friendlyId: "run_friendly_resid", + runtimeEnvironmentId: cp.environment.id, + organizationId: cp.organization.id, + projectId: cp.project.id, + taskIdentifier: "resid-task", + payload: "{}", + payloadType: "application/json", + queue: "task/resid-task", + traceId: "trace_resid", + spanId: "span_resid", + }, + }); + + const run = await runStore.findRun( + { id: "run_resid" }, + { select: { id: true, runtimeEnvironmentId: true } } + ); + assertNonNullable(run); + const env = await resolver.resolveEnv(run.runtimeEnvironmentId); + assertNonNullable(env); + // ttl reads organizationId/projectId/id; parkPendingVersion reads 
id/type/projectId/project.organizationId. + expect(env.id).toBe(cp.environment.id); + expect(env.type).toBe("PRODUCTION"); + expect(env.organizationId).toBe(cp.organization.id); + expect(env.projectId).toBe(cp.project.id); + + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/lifecycleRouter.test.ts b/internal-packages/run-engine/src/engine/tests/lifecycleRouter.test.ts new file mode 100644 index 00000000000..5bcbfac8ce3 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/lifecycleRouter.test.ts @@ -0,0 +1,658 @@ +import { containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { PostgresRunStore, RoutingRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; +import { setTimeout } from "node:timers/promises"; +import { describe, expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 120_000 }); + +/** Shared RunEngine constructor options for the tests in this file. It deliberately omits `prisma` and `store`: each test supplies those itself and spreads this object after them. The worker, queue and runLock sections all receive the same `redisOptions`; the machine preset table holds a single "small-1x" entry, which is also the default. */ function baseEngineOptions(redisOptions: any) { + return { + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +/** Builds the argument object passed to `engine.trigger` in these tests. Only `friendlyId`, `environment` and `taskIdentifier` vary; the queue name is derived as `task/<taskIdentifier>` and every other field is a fixed placeholder. */ const baseTriggerParams = (friendlyId: string, environment: any, taskIdentifier: string) => ({ + number: 1, + 
friendlyId, + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [] as string[], +}); + +/** + * A real `PostgresRunStore` that records the snapshot-read and findRuns calls this + * unit's lifecycle sites route through. NOT a mock — every override still issues the + * real query via `super.*`; it only counts and records the forwarded read client so + * the tests can prove the engine threaded `this.runStore` (and which client a read + * was directed at). There is no `PassthroughRunStore` class to subclass — the single + * `PostgresRunStore` IS the single-DB passthrough behavior. + */ +class CountingRunStore extends PostgresRunStore { + /** Free-form tag for telling store instances apart (the routing tests pass "new" and "legacy"); defaults to "counting". */ label: string; + /** Number of findLatestExecutionSnapshot calls seen so far. */ latestSnapshotReads = 0; + /** First positional argument (the run id) of each findLatestExecutionSnapshot call, in call order. */ latestSnapshotRunIds: string[] = []; + // The forwarded read client (positional arg index 1) for each snapshot read. Lets + // the tests prove a routed read stayed on the primary and never fell to the replica. + latestSnapshotClients: unknown[] = []; + executionSnapshotReads = 0; + executionSnapshotClients: unknown[] = []; + manyExecutionSnapshotReads = 0; + manyExecutionSnapshotClients: unknown[] = []; + completedWaitpointReads = 0; + /** One entry per findRuns call; `client` is positional arg index 1 and is undefined when the caller passed none. */ findRunsCalls: Array<{ client: unknown }> = []; + + /** Forwards `prisma` and `readOnlyPrisma` to PostgresRunStore unchanged and stores the optional label. */ constructor(opts: { prisma: any; readOnlyPrisma: any; label?: string }) { + super({ prisma: opts.prisma, readOnlyPrisma: opts.readOnlyPrisma }); + this.label = opts.label ?? 
"counting"; + } + + /** Counts the call, records args[0] (the run id) and args[1] (the forwarded client), then delegates to the real store with the arguments untouched. */ override findLatestExecutionSnapshot(...args: any[]) { + this.latestSnapshotReads++; + this.latestSnapshotRunIds.push(args[0]); + this.latestSnapshotClients.push(args[1]); + return (super.findLatestExecutionSnapshot as any)(...args); + } + + /** Counts the call and records args[1] (the forwarded client), then delegates unchanged. */ override findExecutionSnapshot(...args: any[]) { + this.executionSnapshotReads++; + this.executionSnapshotClients.push(args[1]); + return (super.findExecutionSnapshot as any)(...args); + } + + /** Counts the call and records args[1] (the forwarded client), then delegates unchanged. */ override findManyExecutionSnapshots(...args: any[]) { + this.manyExecutionSnapshotReads++; + this.manyExecutionSnapshotClients.push(args[1]); + return (super.findManyExecutionSnapshots as any)(...args); + } + + /** Counts the call only (no client is recorded), then delegates unchanged. */ override findSnapshotCompletedWaitpointIds(...args: any[]) { + this.completedWaitpointReads++; + return (super.findSnapshotCompletedWaitpointIds as any)(...args); + } + + /** Records args[1] (the forwarded client, undefined when none was passed), then delegates unchanged. */ override findRuns(...args: any[]) { + this.findRunsCalls.push({ client: args[1] }); + return (super.findRuns as any)(...args); + } +} + +describe("RunEngine lifecycle read routing (single-DB)", () => { + // getRunExecutionData routes its latest-snapshot read through this.runStore + // (the threaded getLatestExecutionSnapshot(prisma, runId, this.runStore) call). 
+ containerTest( + "getRunExecutionData reads the latest snapshot through the store", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine({ prisma, store, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const friendlyId = generateFriendlyId("run"); + const run = await engine.trigger( + baseTriggerParams(friendlyId, environment, taskIdentifier), + prisma + ); + + const readsBefore = store.latestSnapshotReads; + const data = await engine.getRunExecutionData({ runId: run.id }); + + expect(data).not.toBeNull(); + expect(data!.run.id).toBe(run.id); + expect(store.latestSnapshotReads).toBeGreaterThan(readsBefore); + // Routed by owning run id (snapshots never route by snapshot id). + expect(store.latestSnapshotRunIds).toContain(run.id); + } finally { + await engine.quit(); + } + } + ); + + // getSnapshotsSince routes through the store's snapshot read methods (the + // since-marker lookup, the page read, and the latest snapshot's waitpoint hydrate). 
+ containerTest("getSnapshotsSince reads through the store", async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine({ prisma, store, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + baseTriggerParams(generateFriendlyId("run"), environment, taskIdentifier), + prisma + ); + + await setTimeout(500); + await engine.dequeueFromWorkerQueue({ consumerId: "test_since", workerQueue: "main" }); + + const allSnapshots = await prisma.taskRunExecutionSnapshot.findMany({ + where: { runId: run.id, isValid: true }, + orderBy: { createdAt: "asc" }, + }); + expect(allSnapshots.length).toBeGreaterThan(1); + + const executionReadsBefore = store.executionSnapshotReads; + const manyReadsBefore = store.manyExecutionSnapshotReads; + + const result = await engine.getSnapshotsSince({ + runId: run.id, + snapshotId: allSnapshots[0].id, + }); + + expect(result).not.toBeNull(); + expect(result!.length).toBeGreaterThan(0); + // The since-marker lookup + the page read both went through the store. + expect(store.executionSnapshotReads).toBeGreaterThan(executionReadsBefore); + expect(store.manyExecutionSnapshotReads).toBeGreaterThan(manyReadsBefore); + } finally { + await engine.quit(); + } + }); + + // With the replica-off default (readReplicaSnapshotsSinceEnabled unset), + // getSnapshotsSince reads on the PRIMARY client. Distinct primary/replica-Proxy setup + // proves both the since-marker (findExecutionSnapshot) and page (findManyExecutionSnapshots) + // reads carried the primary handle and the replica was never touched. 
+ containerTest( + "getSnapshotsSince reads on the primary client when the replica flag is off", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const primary = prisma; + let replicaReads = 0; + const replicaProxy = new Proxy(prisma, { + get(target, prop, receiver) { + if (prop === "taskRunExecutionSnapshot") { + replicaReads++; + } + return Reflect.get(target, prop, receiver); + }, + }) as unknown as typeof prisma; + + const store = new CountingRunStore({ prisma: primary, readOnlyPrisma: replicaProxy }); + const engine = new RunEngine({ prisma: primary, store, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + baseTriggerParams(generateFriendlyId("run"), environment, taskIdentifier), + primary + ); + + await setTimeout(500); + await engine.dequeueFromWorkerQueue({ consumerId: "test_since_b2", workerQueue: "main" }); + + const allSnapshots = await primary.taskRunExecutionSnapshot.findMany({ + where: { runId: run.id, isValid: true }, + orderBy: { createdAt: "asc" }, + }); + expect(allSnapshots.length).toBeGreaterThan(1); + + const result = await engine.getSnapshotsSince({ + runId: run.id, + snapshotId: allSnapshots[0].id, + }); + + expect(result).not.toBeNull(); + // Both the since-marker and the page read carried the primary handle (default off)... + expect(store.executionSnapshotClients.at(-1)).toBe(primary); + expect(store.manyExecutionSnapshotClients.at(-1)).toBe(primary); + // ...and the read-only (replica) handle was never accessed. + expect(replicaReads).toBe(0); + } finally { + await engine.quit(); + } + } + ); + + // The concurrency sweeper read goes through this.runStore.findRuns (already + // routed on the baseline). 
The store's default findRuns read targets the read-only + // client, so the sweeper scan stays off the primary without an explicit client arg. + containerTest( + "the sweeper reads finished runs through the store (default read client)", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine({ prisma, store, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + baseTriggerParams(generateFriendlyId("run"), environment, taskIdentifier), + prisma + ); + + // Make it look like a run that finished more than the sweeper's offset ago. + await prisma.taskRun.update({ + where: { id: run.id }, + data: { + status: "COMPLETED_SUCCESSFULLY", + completedAt: new Date(Date.now() - 1000 * 60 * 20), + }, + }); + + const callsBefore = store.findRunsCalls.length; + // Drive the private sweeper callback through the run-queue wiring it is bound into. + const callback = (engine as any).runQueue.options.concurrencySweeper.callback as ( + runIds: string[] + ) => Promise>; + const found = await callback([run.id]); + + expect(store.findRunsCalls.length).toBeGreaterThan(callsBefore); + expect(found).toEqual([{ id: run.id, orgId: environment.organization.id }]); + // The default findRuns read carries no explicit client — it resolves to the + // store's read-only client (the replica in a split deployment). + expect(store.findRunsCalls.at(-1)!.client).toBeUndefined(); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB binds one client (the `passthrough` field), proven BY BEHAVIOR. + // A round-trip through the default-store engine returns exactly the snapshot just + // written on the one configured client — no second DB/connection is configured. 
We + // do NOT assert store.prisma === engine.prisma (the store exposes no such member). + containerTest( + "single-DB passthrough round-trip returns the snapshot just written", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + // No `store` injected → the engine defaults to a single PostgresRunStore over + // the one prisma client (the passthrough single-DB behavior). + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + baseTriggerParams(generateFriendlyId("run"), environment, taskIdentifier), + prisma + ); + + const data = await engine.getRunExecutionData({ runId: run.id }); + expect(data).not.toBeNull(); + + // The read returns exactly the latest snapshot persisted on the single client. + const latest = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId: run.id, isValid: true }, + orderBy: { createdAt: "desc" }, + }); + expect(latest).not.toBeNull(); + expect(data!.snapshot.id).toBe(latest!.id); + } finally { + await engine.quit(); + } + } + ); + + // getRunExecutionData's latest-snapshot read stays on the PRIMARY client. + // The store resolves a routed read as `client ?? readOnlyPrisma`, so the only thing + // keeping the engine off the replica is that it threads `this.prisma`. We give the + // store distinct primary vs read-only handles (the read-only one a Proxy that counts + // any `taskRunExecutionSnapshot` access, mirroring the read-through proof) and prove the read landed + // on the primary and the replica was never touched. 
+ containerTest( + "getRunExecutionData reads the latest snapshot on the primary client", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const primary = prisma; + let replicaReads = 0; + const replicaProxy = new Proxy(prisma, { + get(target, prop, receiver) { + if (prop === "taskRunExecutionSnapshot") { + replicaReads++; + } + return Reflect.get(target, prop, receiver); + }, + }) as unknown as typeof prisma; + + const store = new CountingRunStore({ prisma: primary, readOnlyPrisma: replicaProxy }); + const engine = new RunEngine({ prisma: primary, store, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + baseTriggerParams(generateFriendlyId("run"), environment, taskIdentifier), + primary + ); + + const data = await engine.getRunExecutionData({ runId: run.id }); + + expect(data).not.toBeNull(); + expect(data!.run.id).toBe(run.id); + // The routed latest-snapshot read carried the primary handle... + expect(store.latestSnapshotClients.at(-1)).toBe(primary); + // ...and the read-only (replica) handle was never accessed. + expect(replicaReads).toBe(0); + } finally { + await engine.quit(); + } + } + ); + + // The recovery path (#repairRun, driven via the public repairEnvironment) + // also reads the latest snapshot on the PRIMARY client, never the replica. Same + // distinct primary/replica-Proxy setup as the getRunExecutionData primary-read proof. A dequeued run holds environment + // concurrency, so repairEnvironment's getCurrentConcurrencyOfEnvironment returns it; + // dryRun=true keeps the path deterministic and enqueues no worker job. 
+ containerTest( + "repairEnvironment reads the latest snapshot on the primary client", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const primary = prisma; + let replicaReads = 0; + const replicaProxy = new Proxy(prisma, { + get(target, prop, receiver) { + if (prop === "taskRunExecutionSnapshot") { + replicaReads++; + } + return Reflect.get(target, prop, receiver); + }, + }) as unknown as typeof prisma; + + const store = new CountingRunStore({ prisma: primary, readOnlyPrisma: replicaProxy }); + const engine = new RunEngine({ prisma: primary, store, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + baseTriggerParams(generateFriendlyId("run"), environment, taskIdentifier), + primary + ); + + // Dequeue so the run holds environment concurrency (otherwise the repair scan + // finds no runIds to repair). + await setTimeout(500); + await engine.dequeueFromWorkerQueue({ consumerId: "test_repair", workerQueue: "main" }); + + const concurrency = await (engine as any).runQueue.getCurrentConcurrencyOfEnvironment( + environment + ); + expect(concurrency).toContain(run.id); + + const readsBefore = store.latestSnapshotReads; + await engine.repairEnvironment(environment, /* dryRun */ true); + + expect(store.latestSnapshotReads).toBeGreaterThan(readsBefore); + expect(store.latestSnapshotRunIds).toContain(run.id); + // The repair-path latest-snapshot read carried the primary handle... + expect(store.latestSnapshotClients.at(-1)).toBe(primary); + // ...and the read-only (replica) handle was never accessed. 
+ expect(replicaReads).toBe(0); + } finally { + await engine.quit(); + } + } + ); + + // Two further latest-snapshot read sites — #handleStalledSnapshot (heartbeat-timeout) + // and #handleRepairSnapshot (deferred repair job) — route through the same + // primary-threaded getLatestExecutionSnapshot(this.prisma, ...) path proven above. + // They are driven only by redis-worker timeout/repair jobs, so they are left + // un-unit-covered here to avoid timing-dependent flakiness; the primary-routing + // guarantee they share is established by the getRunExecutionData and repairEnvironment primary-read proofs above. +}); + +// --------------------------------------------------------------------------- +// Read-through / cross-version proofs (PG14 legacy <-> PG17 run-ops). These test +// the routing layer the engine's threaded reads delegate to: a real RoutingRunStore +// over two real PostgresRunStores on two real containers (NEVER mocked). A new run +// (ksuid id, born on PG17) resolves from the run-ops store; an old in-retention run +// (cuid id, on PG14) reads THROUGH the legacy store's read-only (replica) client. +// --------------------------------------------------------------------------- + +// A cuid-length (25-char) internal id → classifies LEGACY; a ksuid-length (27-char) +// internal id → classifies NEW. The `run_` prefix is stripped before classification. 
+/** Returns `run_` followed by exactly 25 characters: `suffix` right-padded with "0" and then truncated to 25. A suffix longer than 25 characters is cut, so two long suffixes sharing a 25-character prefix collide. */ const legacyRunId = (suffix: string) => `run_${suffix.padEnd(25, "0").slice(0, 25)}`; +/** Returns `run_` followed by exactly 27 characters: `suffix` right-padded with "0" and then truncated to 27. */ const newRunId = (suffix: string) => `run_${suffix.padEnd(27, "0").slice(0, 27)}`; + +/** Test fixture: on the given client, creates an organization, a project and a PRODUCTION runtime environment (all named from `suffix`), then a TaskRun with id `runId` in status EXECUTING, then one valid EXECUTING execution snapshot for that run. The writes are sequential and not wrapped in a transaction. Returns the id of the created snapshot. */ async function seedRunWithSnapshot( + prisma: PrismaClient, + runId: string, + suffix: string +): Promise<{ snapshotId: string }> { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 10, + }, + }); + + await prisma.taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "EXECUTING", + friendlyId: `friendly_${suffix}`, + runtimeEnvironmentId: environment.id, + organizationId: organization.id, + projectId: project.id, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + queue: "task/test-task", + traceId: `trace_${suffix}`, + spanId: `span_${suffix}`, + }, + }); + + const snapshot = await prisma.taskRunExecutionSnapshot.create({ + data: { + engine: "V2", + executionStatus: "EXECUTING", + description: `snapshot ${suffix}`, + isValid: true, + runId, + runStatus: "EXECUTING", + environmentId: environment.id, + environmentType: "PRODUCTION", + projectId: project.id, + organizationId: organization.id, + }, + }); + + return { snapshotId: snapshot.id }; +} + +describe("RunEngine lifecycle read-through routing (PG14/PG17)", () => { + // A NEW run (ksuid id) seeded only on the run-ops (PG17/new) store resolves + // its latest snapshot from that store, and the legacy store is never touched. 
+ heteroPostgresTest( + "a new run resolves its latest snapshot from the run-ops store", + async ({ prisma14, prisma17 }) => { + const newReadClient = prisma17 as unknown as PrismaClient; + const legacyReadClient = prisma14 as unknown as PrismaClient; + + const newStore = new CountingRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: newReadClient, + label: "new", + }); + const legacyStore = new CountingRunStore({ + prisma: prisma14 as unknown as PrismaClient, + readOnlyPrisma: legacyReadClient, + label: "legacy", + }); + const routing = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const runId = newRunId("new_run_e"); + const { snapshotId } = await seedRunWithSnapshot( + prisma17 as unknown as PrismaClient, + runId, + "new_e" + ); + + const snapshot = await routing.findLatestExecutionSnapshot(runId); + + expect(snapshot).not.toBeNull(); + expect(snapshot!.id).toBe(snapshotId); + // Resolved from the run-ops (new) store, never the legacy store. + expect(newStore.latestSnapshotReads).toBe(1); + expect(legacyStore.latestSnapshotReads).toBe(0); + } + ); + + // An OLD run (cuid id) seeded only on the legacy (PG14) store reads through + // the legacy store's read-only (replica) client — never the primary. + heteroPostgresTest( + "an old run reads through the legacy store's replica client", + async ({ prisma14, prisma17 }) => { + // Distinct primary vs read-only handles on the legacy side so we can prove the + // read was directed at the read-only (replica) client, not the primary. 
+ const legacyPrimary = prisma14 as unknown as PrismaClient; + let legacyReplicaReads = 0; + const legacyReplica = new Proxy(prisma14 as unknown as PrismaClient, { + get(target, prop, receiver) { + if (prop === "taskRunExecutionSnapshot") { + legacyReplicaReads++; + } + return Reflect.get(target, prop, receiver); + }, + }) as unknown as PrismaClient; + + const newStore = new CountingRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + label: "new", + }); + const legacyStore = new CountingRunStore({ + prisma: legacyPrimary, + readOnlyPrisma: legacyReplica, + label: "legacy", + }); + const routing = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const runId = legacyRunId("old_run_f"); + const { snapshotId } = await seedRunWithSnapshot(legacyPrimary, runId, "old_f"); + + const snapshot = await routing.findLatestExecutionSnapshot(runId); + + expect(snapshot).not.toBeNull(); + expect(snapshot!.id).toBe(snapshotId); + // Read-through resolved on the legacy store... + expect(legacyStore.latestSnapshotReads).toBe(1); + expect(newStore.latestSnapshotReads).toBe(0); + // ...via its read-only (replica) client, never the primary. + expect(legacyReplicaReads).toBeGreaterThan(0); + } + ); + + // The sweeper's findRuns scan across the routing store. The routing store's + // findRuns ships the single-store (new) delegate today (the mixed-residency fan-out + // is owned by the downstream routing-wire unit); this asserts the live behavior: the + // scan reads through the run-ops (new) store's read-only client, off the primary. 
+ heteroPostgresTest( + "the sweeper findRuns scan reads through the run-ops store", + async ({ prisma14, prisma17 }) => { + const newStore = new CountingRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + label: "new", + }); + const legacyStore = new CountingRunStore({ + prisma: prisma14 as unknown as PrismaClient, + readOnlyPrisma: prisma14 as unknown as PrismaClient, + label: "legacy", + }); + const routing = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const newId = newRunId("new_run_g"); + await seedRunWithSnapshot(prisma17 as unknown as PrismaClient, newId, "new_g"); + await (prisma17 as unknown as PrismaClient).taskRun.update({ + where: { id: newId }, + data: { + status: "COMPLETED_SUCCESSFULLY", + completedAt: new Date(Date.now() - 1000 * 60 * 20), + }, + }); + + const runs = await routing.findRuns({ + where: { + id: { in: [newId] }, + completedAt: { lte: new Date(Date.now() - 1000 * 60 * 10) }, + organizationId: { not: null }, + status: { in: ["COMPLETED_SUCCESSFULLY"] }, + }, + select: { id: true, status: true, organizationId: true }, + }); + + expect(runs.map((r) => r.id)).toContain(newId); + // The scan went through the run-ops (new) store's read-only client (no explicit + // client passed → resolves to the store's read replica). + expect(newStore.findRunsCalls.length).toBe(1); + expect(newStore.findRunsCalls[0].client).toBeUndefined(); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/runAttemptSystem.controlPlaneResolver.test.ts b/internal-packages/run-engine/src/engine/tests/runAttemptSystem.controlPlaneResolver.test.ts new file mode 100644 index 00000000000..3f686a5f5cd --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/runAttemptSystem.controlPlaneResolver.test.ts @@ -0,0 +1,540 @@ +// Cross-DB inversion proof for runAttemptSystem.resolveTaskRunContext. 
// run-ops scalars live on the run-ops DB (cross-seam FKs dropped); the env (slug/branchName/git/
// project/org) lives on the control-plane DB. resolveAuthenticatedEnv over the control-plane DB resolves the env half
// while PostgresRunStore over the run-ops DB resolves the run scalars — proving no cross-DB join.
// The DB is never mocked. A single-DB case drives the real resolveTaskRunContext end-to-end.
import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers";
import { trace } from "@internal/tracing";
import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic";
import type { PrismaClient } from "@trigger.dev/database";
import { expect } from "vitest";
import {
  PassthroughControlPlaneResolver,
  type ControlPlaneResolver,
} from "../controlPlaneResolver.js";
import { ServiceValidationError } from "../errors.js";
import { PostgresRunStore } from "@internal/run-store";
import { RunEngine } from "../index.js";
import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js";

vi.setConfig({ testTimeout: 60_000 });

/**
 * Builds the options object handed to `new RunEngine(...)` by the tests in this file.
 *
 * @param redisOptions Redis connection options, reused for the worker, the queue and the run lock.
 * @param prisma Prisma client the engine is constructed with.
 * @param overrides Optional extra option keys (e.g. `controlPlaneResolver`). They are spread
 *   last, so any key present here replaces the default of the same name.
 * @returns The merged engine options.
 */
function createEngineOptions(
  redisOptions: any,
  prisma: any,
  // `Record` needs both type arguments to compile; values stay `unknown` because each
  // test decides which option it overrides.
  overrides?: Record<string, unknown>
) {
  return {
    prisma,
    worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 },
    queue: {
      redis: redisOptions,
      masterQueueConsumersDisabled: true,
      processWorkerQueueDebounceMs: 50,
    },
    runLock: { redis: redisOptions },
    machines: {
      defaultMachine: "small-1x" as const,
      machines: {
        "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 },
      },
      baseCostInCents: 0.0005,
    },
    tracer: trace.getTracer("test", "0.0.0"),
    // Spreading `undefined` is a no-op, so callers that pass no overrides get the defaults.
    ...overrides,
  };
}

// TaskRun foreign keys that point at rows (environment, project, organization) which the
// hetero tests below seed on the other database.
const TASK_RUN_CROSS_SEAM_FKS = [
  "TaskRun_runtimeEnvironmentId_fkey",
  "TaskRun_projectId_fkey",
  "TaskRun_organizationId_fkey",
] as const;

/**
 * Drops every constraint named in `TASK_RUN_CROSS_SEAM_FKS` from the `TaskRun` table.
 *
 * Uses `DROP CONSTRAINT IF EXISTS`, so a constraint that is already absent is skipped
 * rather than raising. The constraint names are fixed constants from this file, not
 * external input.
 *
 * @param prisma Client for the database whose `TaskRun` table should lose the constraints.
 */
async function dropTaskRunCrossSeamFks(prisma: PrismaClient) {
  for (const constraint of TASK_RUN_CROSS_SEAM_FKS) {
    await prisma.$executeRawUnsafe(
      `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"`
    );
  }
}

/**
 * Seeds an organization, a project and a PRODUCTION runtime environment on `prisma`.
 *
 * Every slug/key is derived from `suffix`, so tests can assert on predictable values
 * (for example `git.commitSha` is `sha_<suffix>`).
 *
 * @param prisma Client for the database that should hold the rows.
 * @param suffix Token interpolated into every generated name, slug and key.
 * @returns The three created rows.
 */
async function seedRichEnv(prisma: PrismaClient, suffix: string) {
  const organization = await prisma.organization.create({
    data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
  });
  const project = await prisma.project.create({
    data: {
      name: `Project ${suffix}`,
      slug: `project-${suffix}`,
      externalRef: `proj_${suffix}`,
      organizationId: organization.id,
    },
  });
  const environment = await prisma.runtimeEnvironment.create({
    data: {
      type: "PRODUCTION",
      slug: `prod-${suffix}`,
      branchName: `feature-${suffix}`,
      git: { commitSha: `sha_${suffix}` },
      projectId: project.id,
      organizationId: organization.id,
      apiKey: `tr_prod_${suffix}`,
      pkApiKey: `pk_prod_${suffix}`,
      shortcode: `short_${suffix}`,
      maximumConcurrencyLimit: 10,
    },
  });
  return { organization, project, environment };
}

describe("runAttemptSystem.resolveTaskRunContext controlPlaneResolver (hetero cross-DB)", () => {
  heteroPostgresTest(
    "env (slug/branchName/git) resolves from the control-plane DB while run scalars resolve from the run-ops DB (no cross-DB join)",
    async ({ prisma14, prisma17 }) => {
      await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient);
      const cp = await seedRichEnv(prisma14 as unknown as PrismaClient, "rtc");

      const runId = "run_rtc_runops";
      await (prisma17 as unknown as PrismaClient).taskRun.create({
        data: {
          id: runId,
          engine: "V2",
          status: "PENDING",
          friendlyId: generateFriendlyId("run"),
          runtimeEnvironmentId: cp.environment.id,
          organizationId: cp.organization.id,
          projectId: cp.project.id,
          taskIdentifier: "rtc-task",
          payload: "{}",
          payloadType: "application/json",
          queue: "task/rtc-task",
          traceId: "trace_rtc",
          spanId: "span_rtc",
          workerQueue: "main",
        },
      });

      const runStore = new PostgresRunStore({
        prisma: prisma17 as unknown as PrismaClient,
        readOnlyPrisma: prisma17 as
unknown as PrismaClient, + }); + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + // Run-ops scalars (incl. runtimeEnvironmentId, the resolver key) come from the run-ops DB. + const run = await runStore.findRun( + { id: runId }, + { select: { id: true, runtimeEnvironmentId: true, workerQueue: true } } + ); + assertNonNullable(run); + expect(run.id).toBe(runId); + expect(run.runtimeEnvironmentId).toBe(cp.environment.id); + + // The env half — exactly the fields resolveTaskRunContext reads — comes from the control-plane DB. + const env = await resolver.resolveAuthenticatedEnv(run.runtimeEnvironmentId); + assertNonNullable(env); + expect(env.id).toBe(cp.environment.id); + expect(env.slug).toBe(cp.environment.slug); + expect(env.type).toBe("PRODUCTION"); + expect(env.branchName).toBe(cp.environment.branchName); + expect(env.organizationId).toBe(cp.organization.id); + expect(env.git).toEqual({ commitSha: "sha_rtc" }); + + // Proof of inversion: the run-ops DB holds no env rows; the control-plane DB holds no run rows. 
+ expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); + + heteroPostgresTest( + "startRunAttempt env resolves from the control-plane DB with run scalars on the run-ops DB (no cross-DB join)", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedRichEnv(prisma14 as unknown as PrismaClient, "sra"); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const controlPlaneResolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + const runId = "run_sra_runops"; + await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "DEQUEUED", + attemptNumber: 0, + friendlyId: generateFriendlyId("run"), + runtimeEnvironmentId: cp.environment.id, + organizationId: cp.organization.id, + projectId: cp.project.id, + taskIdentifier: "sra-task", + payload: "{}", + payloadType: "application/json", + queue: "task/sra-task", + traceId: "trace_sra", + spanId: "span_sra", + workerQueue: "main", + }, + }); + + // startAttempt reads run scalars from the run-ops DB and resolves env from the control-plane DB. 
+ const updatedRun = await runStore.startAttempt( + runId, + { attemptNumber: 1, executedAt: new Date(), isWarmStart: false }, + { select: { id: true, runtimeEnvironmentId: true, attemptNumber: true } }, + prisma17 as unknown as PrismaClient + ); + const env = await controlPlaneResolver.resolveAuthenticatedEnv( + updatedRun.runtimeEnvironmentId + ); + assertNonNullable(env); + expect(env.id).toBe(cp.environment.id); + expect(env.git).toEqual({ commitSha: "sha_sra" }); + + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); + + heteroPostgresTest( + "recordRetryOutcome run scalars resolve from the run-ops DB, env (org + project) from the control-plane DB (no cross-DB join, no orgMember)", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedRichEnv(prisma14 as unknown as PrismaClient, "rro"); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const controlPlaneResolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + const runId = "run_rro_runops"; + await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "DEQUEUED", + attemptNumber: 1, + friendlyId: generateFriendlyId("run"), + runtimeEnvironmentId: cp.environment.id, + organizationId: cp.organization.id, + projectId: cp.project.id, + taskIdentifier: "rro-task", + payload: "{}", + payloadType: "application/json", + queue: "task/rro-task", + traceId: "trace_rro", + spanId: "span_rro", + workerQueue: "main", + }, + }); + + const run = await runStore.recordRetryOutcome( + runId, + { machinePreset: "small-1x", usageDurationMs: 1, costInCents: 1 }, + { select: { id: true, runtimeEnvironmentId: true, status: true } }, + 
prisma17 as unknown as PrismaClient + ); + const env = await controlPlaneResolver.resolveAuthenticatedEnv(run.runtimeEnvironmentId); + assertNonNullable(env); + expect(env.organizationId).toBe(cp.organization.id); + expect(env.project.id).toBe(cp.project.id); + expect(env.id).toBe(cp.environment.id); + + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); + + heteroPostgresTest( + "failRunPermanently run scalars resolve from the run-ops DB, env ids from the control-plane DB via resolveEnv (no cross-DB join)", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedRichEnv(prisma14 as unknown as PrismaClient, "frp"); + + const controlPlaneResolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + // resolveEnv supplies the env half; the store supplies run scalars. + const env = await controlPlaneResolver.resolveEnv(cp.environment.id); + assertNonNullable(env); + expect(env.id).toBe(cp.environment.id); + expect(env.type).toBe("PRODUCTION"); + expect(env.organizationId).toBe(cp.organization.id); + expect(env.projectId).toBe(cp.project.id); + expect(env.project.id).toBe(cp.project.id); + + // The run-ops DB holds no env; the control-plane DB holds no run. 
+ expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); + +describe("runAttemptSystem.resolveTaskRunContext controlPlaneResolver (single-DB passthrough)", () => { + containerTest( + "default passthrough resolveTaskRunContext is byte-identical (env + git resolve end-to-end)", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const runId = "run_rtc_passthru"; + await prisma.taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: generateFriendlyId("run"), + runtimeEnvironmentId: environment.id, + organizationId: environment.organizationId, + projectId: environment.projectId, + taskIdentifier: "rtc-task", + payload: "{}", + payloadType: "application/json", + queue: "task/rtc-task", + traceId: "trace_rtc2", + spanId: "span_rtc2", + workerQueue: "main", + }, + }); + + const context = await engine.runAttemptSystem.resolveTaskRunContext(runId); + expect(context.environment.id).toBe(environment.id); + expect(context.environment.slug).toBe(environment.slug); + expect(context.environment.type).toBe("PRODUCTION"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "resolveTaskRunContext surfaces a clean 404 when the env has vanished (resolveAuthenticatedEnv null)", + async ({ prisma, redisOptions }) => { + // A deleted/vanished env must surface a clean 404 ServiceValidationError, + // not a "Cannot read properties of null" crash. We inject a resolver whose + // resolveAuthenticatedEnv returns null (the run row still exists on the + // run-ops side), while every other method delegates to the real passthrough. 
+ const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const passthrough = new PassthroughControlPlaneResolver({ + prisma, + }); + const resolver: ControlPlaneResolver = { + resolveEnv: passthrough.resolveEnv.bind(passthrough), + resolveWorkerVersion: passthrough.resolveWorkerVersion.bind(passthrough), + assertEnvExists: passthrough.assertEnvExists.bind(passthrough), + async resolveAuthenticatedEnv() { + return null; + }, + }; + + const engine = new RunEngine( + createEngineOptions(redisOptions, prisma, { controlPlaneResolver: resolver }) + ); + + try { + const runId = "run_rtc_nullenv"; + await prisma.taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: generateFriendlyId("run"), + runtimeEnvironmentId: environment.id, + organizationId: environment.organizationId, + projectId: environment.projectId, + taskIdentifier: "rtc-task", + payload: "{}", + payloadType: "application/json", + queue: "task/rtc-task", + traceId: "trace_rtc_nullenv", + spanId: "span_rtc_nullenv", + workerQueue: "main", + }, + }); + + let caught: unknown; + try { + await engine.runAttemptSystem.resolveTaskRunContext(runId); + } catch (error) { + caught = error; + } + + expect(caught).toBeInstanceOf(ServiceValidationError); + const validationError = caught as ServiceValidationError; + expect(validationError.status).toBe(404); + expect(validationError.message).toBe("Task run environment not found"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "default passthrough startRunAttempt resolves env + git into the execution payload", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const taskIdentifier = "sra-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: 
"run_sra1", + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t-sra", + spanId: "s-sra", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_sra", + workerQueue: "main", + }); + assertNonNullable(dequeued[0]); + + const { execution } = await engine.startRunAttempt({ + runId: run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(execution.environment.id).toBe(environment.id); + expect(execution.environment.slug).toBe(environment.slug); + expect(execution.environment.type).toBe("PRODUCTION"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "default passthrough completeAttemptSuccess acks against the resolved org and finishes the run", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const taskIdentifier = "cas-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_cas1", + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t-cas", + spanId: "s-cas", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_cas", + workerQueue: "main", + }); + assertNonNullable(dequeued[0]); + + const attemptResult = await engine.startRunAttempt({ + runId: run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + const result = await engine.completeRunAttempt({ + runId: run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: true, + id: run.id, + output: `{"foo":"bar"}`, + 
outputType: "application/json", + }, + }); + + expect(result.snapshot.executionStatus).toBe("FINISHED"); + expect(result.run.status).toBe("COMPLETED_SUCCESSFULLY"); + + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.run.status).toBe("COMPLETED_SUCCESSFULLY"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "default passthrough cancelRun acks against the resolved org and reaches a cancelled snapshot", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const taskIdentifier = "cancel-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_cancel1", + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t-cancel", + spanId: "s-cancel", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const result = await engine.cancelRun({ + runId: run.id, + completedAt: new Date(), + reason: "Cancelled by the user", + }); + + expect(result.snapshot.executionStatus).toBe("FINISHED"); + + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.run.status).toBe("CANCELED"); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/runStoreInjectability.test.ts b/internal-packages/run-engine/src/engine/tests/runStoreInjectability.test.ts new file mode 100644 index 00000000000..045ef9d32b9 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/runStoreInjectability.test.ts @@ -0,0 +1,135 @@ +import { containerTest } from "@internal/testcontainers"; +import { 
trace } from "@internal/tracing"; +import { PostgresRunStore } from "@internal/run-store"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +describe("RunEngine runStore injectability", () => { + containerTest("defaults the store when none is injected", async ({ prisma, redisOptions }) => { + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + expect(engine.runStore).toBeDefined(); + } finally { + await engine.quit(); + } + }); + + containerTest("uses an explicitly injected store as-is", async ({ prisma, redisOptions }) => { + const injectedStore = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const engine = new RunEngine({ + ...createEngineOptions(redisOptions, prisma), + store: injectedStore, + }); + + try { + expect(engine.runStore).toBe(injectedStore); + } finally { + await engine.quit(); + } + }); + + // The happy-path "Single run (success)" trigger slice, run once per store variant. + // Each variant runs in its own containerTest (fresh DB) so the two RunEngines never + // share state — proving the injected store path is behavior-identical to the default. 
+ async function assertTriggerLandsRun( + prisma: any, + redisOptions: any, + store: PostgresRunStore | undefined, + friendlyId: string + ) { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ + ...createEngineOptions(redisOptions, prisma), + ...(store ? { store } : {}), + }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId, + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + expect(run).toBeDefined(); + expect(run.friendlyId).toBe(friendlyId); + + const runFromDb = await prisma.taskRun.findUnique({ + where: { friendlyId }, + }); + expect(runFromDb).toBeDefined(); + expect(runFromDb?.id).toBe(run.id); + } finally { + await engine.quit(); + } + } + + containerTest( + "injected store path is behavior-identical to default (default store)", + async ({ prisma, redisOptions }) => { + await assertTriggerLandsRun(prisma, redisOptions, undefined, "run_default1234"); + } + ); + + containerTest( + "injected store path is behavior-identical to default (injected store)", + async ({ prisma, redisOptions }) => { + const injectedStore = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + await assertTriggerLandsRun(prisma, redisOptions, injectedStore, "run_injected5678"); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/triggerCreateRouting.test.ts b/internal-packages/run-engine/src/engine/tests/triggerCreateRouting.test.ts new file mode 100644 index 00000000000..f27ef6b8572 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/triggerCreateRouting.test.ts @@ -0,0 +1,598 @@ +import { containerTest } from "@internal/testcontainers"; 
import { trace } from "@internal/tracing";
import {
  PostgresRunStore,
  RoutingRunStore,
  type CreateCancelledRunInput,
  type CreateFailedRunInput,
  type CreateRunInput,
} from "@internal/run-store";
import { RunId, ownerEngine, generateKsuidId } from "@trigger.dev/core/v3/isomorphic";
import type { PrismaClientOrTransaction } from "@trigger.dev/database";
import { expect } from "vitest";
import { RunEngine } from "../index.js";
import type { ControlPlaneResolver } from "../controlPlaneResolver.js";
import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js";

vi.setConfig({ testTimeout: 60_000 });

/**
 * Engine options shared by every test in this file, minus `prisma` and `store`, which
 * each test supplies itself.
 *
 * @param redisOptions Redis connection options, reused for the worker, queue and run lock.
 */
function baseEngineOptions(redisOptions: any) {
  const smallMachine = {
    name: "small-1x" as const,
    cpu: 0.5,
    memory: 0.5,
    centsPerMs: 0.0001,
  };

  const worker = {
    redis: redisOptions,
    workers: 1,
    tasksPerWorker: 10,
    pollIntervalMs: 100,
  };

  const queue = {
    redis: redisOptions,
    masterQueueConsumersDisabled: true,
    processWorkerQueueDebounceMs: 50,
  };

  return {
    worker,
    queue,
    runLock: {
      redis: redisOptions,
    },
    machines: {
      defaultMachine: "small-1x" as const,
      machines: {
        "small-1x": smallMachine,
      },
      baseCostInCents: 0.0001,
    },
    tracer: trace.getTracer("test", "0.0.0"),
  };
}

/**
 * `PostgresRunStore` subclass that tallies calls instead of replacing them.
 *
 * Each overridden method bumps a counter, remembers the tx/client argument it was
 * handed (for the three create methods), and then delegates to the parent
 * implementation, so the real query still runs. Tests read the counters to see which
 * store handled a call and read the recorded args to see which client was forwarded.
 */
class CountingRunStore extends PostgresRunStore {
  label: string;
  createRunCalls = 0;
  createCancelledRunCalls = 0;
  createFailedRunCalls = 0;
  findRunCalls = 0;
  createRunTxArgs: (PrismaClientOrTransaction | undefined)[] = [];
  createCancelledRunTxArgs: (PrismaClientOrTransaction | undefined)[] = [];
  createFailedRunTxArgs: (PrismaClientOrTransaction | undefined)[] = [];

  constructor(opts: { prisma: any; readOnlyPrisma: any; label?: string }) {
    super({ prisma: opts.prisma, readOnlyPrisma: opts.readOnlyPrisma });
    const fallbackLabel = "counting";
    this.label = opts.label ?? fallbackLabel;
  }

  /** Counts the call, records `tx`, then runs the parent `createRun`. */
  override createRun(input: CreateRunInput, tx?: PrismaClientOrTransaction) {
    this.createRunCalls += 1;
    this.createRunTxArgs.push(tx);
    return super.createRun(input, tx);
  }

  /** Counts the call, records `tx`, then runs the parent `createCancelledRun`. */
  override createCancelledRun(input: CreateCancelledRunInput, tx?: PrismaClientOrTransaction) {
    this.createCancelledRunCalls += 1;
    this.createCancelledRunTxArgs.push(tx);
    return super.createCancelledRun(input, tx);
  }

  /** Counts the call, records `tx`, then runs the parent `createFailedRun`. */
  override createFailedRun(input: CreateFailedRunInput, tx?: PrismaClientOrTransaction) {
    this.createFailedRunCalls += 1;
    this.createFailedRunTxArgs.push(tx);
    return super.createFailedRun(input, tx);
  }

  // The parent declares findRun with overloads, so this override takes a rest
  // parameter and passes every argument straight through.
  override findRun(...forwarded: any[]) {
    this.findRunCalls += 1;
    return (super.findRun as any)(...forwarded);
  }
}

/** Returns the friendly id of a freshly generated `RunId`. */
function freshRunId() {
  const { friendlyId } = RunId.generate();
  return friendlyId;
}

/** Returns the friendly-id form of a freshly generated ksuid. */
function freshKsuidRunId() {
  const ksuid = generateKsuidId();
  return RunId.toFriendlyId(ksuid);
}

/** Trigger params used by the tests; only the id, environment and task vary. */
const baseTriggerParams = (friendlyId: string, environment: any, taskIdentifier: string) => {
  const tags: string[] = [];
  const queue = `task/${taskIdentifier}`;

  return {
    number: 1,
    friendlyId,
    environment,
    taskIdentifier,
    payload: "{}",
    payloadType: "application/json",
    context: {},
    traceContext: {},
    traceId: "t12345",
    spanId: "s12345",
    workerQueue: "main",
    queue,
    isTest: false,
    tags,
  };
};

/** Snapshot payload passed to `engine.createCancelledRun` by the tests. */
const cancelledSnapshot = (friendlyId: string, environment: any) => {
  const tags: string[] = [];

  return {
    friendlyId,
    environment,
    taskIdentifier: "test-task",
    payload: "{}",
    payloadType: "application/json",
    context: {},
    traceContext: {},
    traceId: "0000000000000000aaaa000000000000",
    spanId: "bbbb000000000000",
    queue: "task/test-task",
    isTest: false,
    tags,
  };
};

describe("RunEngine trigger/create routing", () => {
  // trigger create routes through runStore.createRun with the structured
  // DTO, and the persisted run + its nested first RUN_CREATED snapshot land via
  // the single create call.
+ containerTest( + "trigger routes createRun and lands run + first snapshot", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine({ prisma, store, ...baseEngineOptions(redisOptions) }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + const friendlyId = freshRunId(); + + const run = await engine.trigger( + baseTriggerParams(friendlyId, environment, taskIdentifier), + prisma + ); + + expect(store.createRunCalls).toBe(1); + expect(run.friendlyId).toBe(friendlyId); + + const stored = await prisma.taskRun.findFirst({ where: { friendlyId } }); + expect(stored).not.toBeNull(); + expect(stored!.id).toBe(run.id); + + const snapshot = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId: run.id }, + orderBy: { createdAt: "asc" }, + }); + expect(snapshot).not.toBeNull(); + expect(snapshot!.executionStatus).toBe("RUN_CREATED"); + } finally { + await engine.quit(); + } + } + ); + + // triggerAndWait persists the RUN-associated waitpoint via the single + // create — the associatedWaitpoint DTO field is nested by the store. 
+ containerTest( + "triggerAndWait persists the RUN-associated waitpoint via createRun", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine({ prisma, store, ...baseEngineOptions(redisOptions) }); + + try { + const parentTask = "parent-task"; + const childTask = "child-task"; + await setupBackgroundWorker(engine, environment, [parentTask, childTask]); + + const parentRun = await engine.trigger( + baseTriggerParams(freshRunId(), environment, parentTask), + prisma + ); + + await engine.dequeueFromWorkerQueue({ consumerId: "test", workerQueue: "main" }); + const parentData = await engine.getRunExecutionData({ runId: parentRun.id }); + await engine.startRunAttempt({ + runId: parentRun.id, + snapshotId: parentData!.snapshot.id, + }); + + const callsBefore = store.createRunCalls; + const childRun = await engine.trigger( + { + ...baseTriggerParams(freshRunId(), environment, childTask), + resumeParentOnCompletion: true, + parentTaskRunId: parentRun.id, + }, + prisma + ); + + expect(store.createRunCalls).toBe(callsBefore + 1); + + const waitpoint = await prisma.waitpoint.findFirst({ + where: { completedByTaskRunId: childRun.id }, + }); + expect(waitpoint).not.toBeNull(); + expect(waitpoint!.type).toBe("RUN"); + } finally { + await engine.quit(); + } + } + ); + + // createCancelledRun routes the create, and the P2002 double-pop + // fallback routes through findRun, returning the same CANCELED row. 
+ containerTest( + "createCancelledRun routes create + P2002 fallback find", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const store = new CountingRunStore({ prisma, readOnlyPrisma: prisma }); + const engine = new RunEngine({ prisma, store, ...baseEngineOptions(redisOptions) }); + + try { + const snapshot = cancelledSnapshot(freshRunId(), environment); + const cancelledAt = new Date(); + const cancelReason = "Test idempotent"; + + const first = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + expect(store.createCancelledRunCalls).toBe(1); + + const findCallsBefore = store.findRunCalls; + const second = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + + expect(second.id).toBe(first.id); + expect(second.status).toBe("CANCELED"); + expect(store.createCancelledRunCalls).toBe(2); + expect(store.findRunCalls).toBeGreaterThan(findCallsBefore); + } finally { + await engine.quit(); + } + } + ); + + // createFailedTaskRun routes the single create arm — no second engine + // arm exists (the keyless idempotency retry is internal to the store). 
containerTest(
  "createFailedTaskRun routes the single createFailedRun arm",
  async ({ prisma, redisOptions }) => {
    const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
    const countingStore = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
    const runEngine = new RunEngine({
      prisma,
      store: countingStore,
      ...baseEngineOptions(redisOptions),
    });

    try {
      const friendlyId = freshRunId();
      // Only the ids and type of the environment are handed to createFailedTaskRun.
      const minimalEnvironment = {
        id: env.id,
        type: env.type,
        project: { id: env.project.id },
        organization: { id: env.organization.id },
      };

      const failedRun = await runEngine.createFailedTaskRun({
        friendlyId,
        environment: minimalEnvironment,
        taskIdentifier: "test-task",
        error: { type: "STRING_ERROR", raw: "boom" },
      });

      expect(countingStore.createFailedRunCalls).toBe(1);
      expect(failedRun.status).toBe("SYSTEM_FAILURE");

      // The row must be readable straight from the database with the same status.
      const persisted = await prisma.taskRun.findFirst({ where: { friendlyId } });
      expect(persisted).not.toBeNull();
      expect(persisted!.status).toBe("SYSTEM_FAILURE");
    } finally {
      await runEngine.quit();
    }
  }
);

// Each create forwards the BARE caller tx
// (undefined on the default path), never the engine's resolved this.prisma, so
// an injected RoutingRunStore's residency selection is not overridden.
containerTest(
  "creates forward the bare caller tx, not the resolved client",
  async ({ prisma, redisOptions }) => {
    const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
    const countingStore = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
    const runEngine = new RunEngine({
      prisma,
      store: countingStore,
      ...baseEngineOptions(redisOptions),
    });

    try {
      const taskIdentifier = "test-task";
      await setupBackgroundWorker(runEngine, env, taskIdentifier);

      // trigger called with NO tx → store create must receive undefined.
      const triggerParams = baseTriggerParams(freshRunId(), env, taskIdentifier);
      await runEngine.trigger(triggerParams);
      expect(countingStore.createRunTxArgs).toHaveLength(1);

      const cancelledSnapshotForCheck = cancelledSnapshot(freshRunId(), env);
      await runEngine.createCancelledRun({
        snapshot: cancelledSnapshotForCheck,
        cancelledAt: new Date(),
        cancelReason: "tx-arg check",
      });
      expect(countingStore.createCancelledRunTxArgs).toHaveLength(1);

      await runEngine.createFailedTaskRun({
        friendlyId: freshRunId(),
        environment: {
          id: env.id,
          type: env.type,
          project: { id: env.project.id },
          organization: { id: env.organization.id },
        },
        taskIdentifier,
        error: { type: "STRING_ERROR", raw: "boom" },
      });
      expect(countingStore.createFailedRunTxArgs).toHaveLength(1);

      // Each create must forward the bare caller tx (undefined here), NOT the
      // engine's resolved client. Assert by identity to avoid a deep compare of
      // the (recursive) Prisma client object.
      const recordedTxArgs = [
        ...countingStore.createRunTxArgs,
        ...countingStore.createCancelledRunTxArgs,
        ...countingStore.createFailedRunTxArgs,
      ];
      recordedTxArgs.forEach((txArg) => {
        expect(txArg).toBeUndefined();
        expect(txArg === prisma).toBe(false);
      });
    } finally {
      await runEngine.quit();
    }
  }
);

// The inverse of the bare-tx case above. When the caller DOES pass a tx, the create
// call sites must forward THAT SAME tx to the store by identity — closing the
// gap a regression hardcoding `undefined` would slip through every other test.
containerTest(
  "a non-undefined caller tx is forwarded to the store by identity",
  async ({ prisma, redisOptions }) => {
    const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
    const countingStore = new CountingRunStore({ prisma, readOnlyPrisma: prisma });
    const runEngine = new RunEngine({
      prisma,
      store: countingStore,
      ...baseEngineOptions(redisOptions),
    });

    try {
      const taskIdentifier = "test-task";
      await setupBackgroundWorker(runEngine, env, taskIdentifier);

      // trigger called WITH the real prisma client as the explicit tx → the
      // store create must receive that exact same client by identity.
      const triggerParams = baseTriggerParams(freshRunId(), env, taskIdentifier);
      await runEngine.trigger(triggerParams, prisma);
      expect(countingStore.createRunTxArgs).toHaveLength(1);
      expect(countingStore.createRunTxArgs[0]).toBe(prisma);

      // Same identity requirement for the cancelled-run create.
      const cancelArgs = {
        snapshot: cancelledSnapshot(freshRunId(), env),
        cancelledAt: new Date(),
        cancelReason: "tx-arg identity check",
      };
      await runEngine.createCancelledRun(cancelArgs, prisma);
      expect(countingStore.createCancelledRunTxArgs).toHaveLength(1);
      expect(countingStore.createCancelledRunTxArgs[0]).toBe(prisma);
    } finally {
      await runEngine.quit();
    }
  }
);

// Split/two-store proof: with the ksuid mint enabled, a NEW-minted run id is
// classified NEW and a RoutingRunStore writes it to the run-ops (NEW) store,
// never the LEGACY store. Proves a new run is born on the run-ops store.
containerTest(
  "split proof: a NEW-minted run lands on the run-ops (NEW) store, not LEGACY",
  async ({ prisma, redisOptions }) => {
    const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
    // Both counting stores wrap the same client; only the per-store call counters
    // tell us which side the RoutingRunStore picked.
    const newStore = new CountingRunStore({ prisma, readOnlyPrisma: prisma, label: "new" });
    const legacyStore = new CountingRunStore({
      prisma,
      readOnlyPrisma: prisma,
      label: "legacy",
    });
    const routing = new RoutingRunStore({ new: newStore, legacy: legacyStore });
    const engine = new RunEngine({
      prisma,
      store: routing,
      ...baseEngineOptions(redisOptions),
    });

    try {
      const taskIdentifier = "test-task";
      await setupBackgroundWorker(engine, environment, taskIdentifier);

      const friendlyId = freshKsuidRunId();
      // Sanity: this id classifies NEW so RoutingRunStore must pick newStore.
      expect(ownerEngine(friendlyId)).toBe("NEW");

      const run = await engine.trigger(
        baseTriggerParams(friendlyId, environment, taskIdentifier)
      );

      expect(newStore.createRunCalls).toBe(1);
      expect(legacyStore.createRunCalls).toBe(0);

      const stored = await prisma.taskRun.findFirst({ where: { friendlyId } });
      // Guard first so a missing row is reported as an assertion failure rather than a
      // TypeError from dereferencing null (same pattern as the createFailedTaskRun test).
      expect(stored).not.toBeNull();
      expect(stored!.id).toBe(run.id);
    } finally {
      await engine.quit();
    }
  }
);

// A child triggered with the parent's residency persists to the
// SAME store the parent was written to (routing-by-run-id). Both parent and
// child mint NEW (ksuid) ids → both land on newStore.
containerTest("child inherits the parent's residency store", async ({ prisma, redisOptions }) => {
  const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
  const newStore = new CountingRunStore({ prisma, readOnlyPrisma: prisma, label: "new" });
  const legacyStore = new CountingRunStore({ prisma, readOnlyPrisma: prisma, label: "legacy" });
  const routingStore = new RoutingRunStore({ new: newStore, legacy: legacyStore });
  const runEngine = new RunEngine({
    prisma,
    store: routingStore,
    ...baseEngineOptions(redisOptions),
  });

  try {
    const parentTask = "parent-task";
    const childTask = "child-task";
    await setupBackgroundWorker(runEngine, env, [parentTask, childTask]);

    const parentParams = baseTriggerParams(freshKsuidRunId(), env, parentTask);
    const parentRun = await runEngine.trigger(parentParams);

    // Dequeue the parent and start its attempt before the child is triggered against it.
    await runEngine.dequeueFromWorkerQueue({ consumerId: "test", workerQueue: "main" });
    const parentExecution = await runEngine.getRunExecutionData({ runId: parentRun.id });
    await runEngine.startRunAttempt({
      runId: parentRun.id,
      snapshotId: parentExecution!.snapshot.id,
    });

    const childParams = {
      ...baseTriggerParams(freshKsuidRunId(), env, childTask),
      resumeParentOnCompletion: true,
      parentTaskRunId: parentRun.id,
      rootTaskRunId: parentRun.id,
    };
    const childRun = await runEngine.trigger(childParams);

    // Both ids are NEW → both routed to the run-ops (NEW) store, never LEGACY.
    expect(ownerEngine(parentRun.friendlyId)).toBe("NEW");
    expect(ownerEngine(childRun.friendlyId)).toBe("NEW");
    expect(newStore.createRunCalls).toBe(2);
    expect(legacyStore.createRunCalls).toBe(0);

    // The child is found on the same store, routed by its run id.
    const routedChild = await routingStore.findRun({ id: childRun.id });
    expect(routedChild?.id).toBe(childRun.id);
  } finally {
    await runEngine.quit();
  }
});

// Split-path env integrity / cross-DB control-plane resolution.
With a + // resolver whose assertEnvExists throws, the create is blocked and no row is + // written; with one that resolves, the create succeeds. + containerTest( + "split-path env-existence assertion blocks the create on a dangling env", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + class ThrowingResolver implements ControlPlaneResolver { + assertEnvCalls: string[] = []; + constructor(private readonly throws: boolean) {} + async assertEnvExists(environmentId: string): Promise { + this.assertEnvCalls.push(environmentId); + if (this.throws) { + throw new Error(`Environment not found: ${environmentId}`); + } + } + // Unused by the create path under test. + async resolveEnv(): Promise { + return null; + } + async resolveAuthenticatedEnv(): Promise { + return null; + } + async resolveWorkerVersion(): Promise { + return null; + } + } + + const throwingResolver = new ThrowingResolver(true); + const engine = new RunEngine({ + prisma, + controlPlaneResolver: throwingResolver, + ...baseEngineOptions(redisOptions), + }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + const friendlyId = freshRunId(); + + await expect( + engine.trigger(baseTriggerParams(friendlyId, environment, taskIdentifier)) + ).rejects.toThrow(/Environment not found/); + + // The assertion ran for the run's env, and NO row was written. + expect(throwingResolver.assertEnvCalls).toContain(environment.id); + const stored = await prisma.taskRun.findFirst({ where: { friendlyId } }); + expect(stored).toBeNull(); + } finally { + await engine.quit(); + } + + // With a resolving resolver, the create succeeds. 
+ const okResolver = new ThrowingResolver(false); + const engine2 = new RunEngine({ + prisma, + controlPlaneResolver: okResolver, + ...baseEngineOptions(redisOptions), + }); + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine2, environment, taskIdentifier); + const friendlyId = freshRunId(); + const run = await engine2.trigger( + baseTriggerParams(friendlyId, environment, taskIdentifier) + ); + expect(run.friendlyId).toBe(friendlyId); + expect(okResolver.assertEnvCalls).toContain(environment.id); + const stored = await prisma.taskRun.findFirst({ where: { friendlyId } }); + expect(stored).not.toBeNull(); + } finally { + await engine2.quit(); + } + } + ); + + // FK-drop app-integrity, single-DB arm: with NO resolver injected, the engine + // defaults to the passthrough resolver which runs the env check against the one + // DB. A dangling env (never created) is rejected by that passthrough check, so + // integrity holds in single-DB mode too. + containerTest( + "FK-drop integrity (single-DB): passthrough rejects a deleted env", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + + try { + const friendlyId = freshRunId(); + // A clearly non-existent env id of the right shape. 
+ const danglingEnv = { + ...environment, + id: "clxnonexistentenv0000000", + }; + + await expect( + engine.createFailedTaskRun({ + friendlyId, + environment: { + id: danglingEnv.id, + type: environment.type, + project: { id: environment.project.id }, + organization: { id: environment.organization.id }, + }, + taskIdentifier: "test-task", + error: { type: "STRING_ERROR", raw: "boom" }, + }) + ).rejects.toThrow(); + + const stored = await prisma.taskRun.findFirst({ where: { friendlyId } }); + expect(stored).toBeNull(); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/ttl.test.ts b/internal-packages/run-engine/src/engine/tests/ttl.test.ts index 52b26452070..6ba775670c8 100644 --- a/internal-packages/run-engine/src/engine/tests/ttl.test.ts +++ b/internal-packages/run-engine/src/engine/tests/ttl.test.ts @@ -5,13 +5,16 @@ import { Decimal } from "@trigger.dev/database"; import { RunEngine } from "../index.js"; import { setTimeout } from "timers/promises"; import { EventBusEventArgs } from "../eventBus.js"; +import { + PassthroughControlPlaneResolver, + type ControlPlaneResolver, +} from "../controlPlaneResolver.js"; import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; vi.setConfig({ testTimeout: 60_000 }); describe("RunEngine ttl", () => { containerTest("Run expiring (ttl)", async ({ prisma, redisOptions }) => { - //create environment const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); const engine = new RunEngine({ @@ -53,7 +56,6 @@ describe("RunEngine ttl", () => { try { const taskIdentifier = "test-task"; - //create background worker const backgroundWorker = await setupBackgroundWorker( engine, authenticatedEnvironment, @@ -68,7 +70,6 @@ describe("RunEngine ttl", () => { maximumConcurrencyLimit: 0, }); - //trigger the run const run = await engine.trigger( { number: 1, @@ -99,7 +100,6 @@ describe("RunEngine ttl", () => { 
expiredEventData = result; }); - //wait for 1 seconds await setTimeout(1_500); assertNonNullable(expiredEventData); @@ -1610,6 +1610,126 @@ describe("RunEngine ttl", () => { } ); + containerTest( + "expireRun completes the run even when env resolution is unavailable (resolveEnv null)", + async ({ prisma, redisOptions }) => { + // Contract: env resolution is NOT on the expire path — identity comes from + // the run's latest execution snapshot. So with resolveEnv returning null the + // run is still fully expired (message acked, waitpoint completed to unblock a + // parent, runExpired emitted), instead of silently dropped. + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const passthrough = new PassthroughControlPlaneResolver({ + prisma, + }); + const resolver: ControlPlaneResolver = { + resolveAuthenticatedEnv: passthrough.resolveAuthenticatedEnv.bind(passthrough), + resolveWorkerVersion: passthrough.resolveWorkerVersion.bind(passthrough), + assertEnvExists: passthrough.assertEnvExists.bind(passthrough), + async resolveEnv() { + return null; + }, + }; + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + processWorkerQueueDebounceMs: 50, + masterQueueConsumersDisabled: true, + // Disable the batch TTL path so it can't race the manual expireRun call. 
+ ttlSystem: { + disabled: true, + }, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + controlPlaneResolver: resolver, + }); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const expiredEvents: EventBusEventArgs<"runExpired">[0][] = []; + engine.eventBus.on("runExpired", (result) => { + expiredEvents.push(result); + }); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_nullenv1", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t_nullenv", + spanId: "s_nullenv", + workerQueue: "main", + queue: "task/test-task", + isTest: false, + tags: [], + ttl: "1s", + }, + prisma + ); + + // Run is queued waiting; the message is in the queue. + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("QUEUED"); + + // (a) no throw, (b) runExpired IS emitted from snapshot identity, + // (c) message IS acked off the queue, (d) run reaches EXPIRED. 
+ await expect(engine.ttlSystem.expireRun({ runId: run.id })).resolves.toBeUndefined(); + + expect(expiredEvents.length).toBe(1); + expect(expiredEvents[0]!.run.id).toBe(run.id); + expect(expiredEvents[0]!.run.status).toBe("EXPIRED"); + expect(expiredEvents[0]!.organization.id).toBe(authenticatedEnvironment.organization.id); + expect(expiredEvents[0]!.project.id).toBe(authenticatedEnvironment.project.id); + expect(expiredEvents[0]!.environment.id).toBe(authenticatedEnvironment.id); + + const messageExists = await engine.runQueue.messageExists( + authenticatedEnvironment.organization.id, + run.id + ); + expect(messageExists).toBe(0); + + const expiredRun = await prisma.taskRun.findUnique({ + where: { id: run.id }, + select: { status: true }, + }); + expect(expiredRun?.status).toBe("EXPIRED"); + } finally { + await engine.quit(); + } + } + ); + containerTest("expireRunsBatch handles empty array", async ({ prisma, redisOptions }) => { const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); diff --git a/internal-packages/run-engine/src/engine/tests/waitpointPublicRouter.test.ts b/internal-packages/run-engine/src/engine/tests/waitpointPublicRouter.test.ts new file mode 100644 index 00000000000..5011249dc5c --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/waitpointPublicRouter.test.ts @@ -0,0 +1,505 @@ +import { assertNonNullable, containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { PostgresRunStore } from "@internal/run-store"; +import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; +import { expect } from "vitest"; +import { setTimeout } from "node:timers/promises"; +import { RunEngine } from "../index.js"; +import type { CrossSeamGuardHook } from "../types.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +// A real PostgresRunStore that counts routed waitpoint calls 
// then delegates to the real implementation. Dependency injection of a real store over a
// real container — not a mock.
class CountingPostgresRunStore extends PostgresRunStore {
  /** Per-method tally of the waitpoint calls that reached this store. */
  public readonly counts = {
    findWaitpoint: 0,
    createWaitpoint: 0,
  };

  // The read client passed into the most recent findWaitpoint call (args[1]).
  public lastFindWaitpointClient: unknown = undefined;

  /**
   * Counts the call, remembers its second argument, then defers to the real store.
   * The parameter list is derived from the base method so the override cannot drift.
   */
  override findWaitpoint(...args: Parameters<PostgresRunStore["findWaitpoint"]>) {
    this.counts.findWaitpoint++;
    this.lastFindWaitpointClient = args[1];
    return super.findWaitpoint(...args);
  }

  /** Counts the call, then defers to the real store. */
  override createWaitpoint(...args: Parameters<PostgresRunStore["createWaitpoint"]>) {
    this.counts.createWaitpoint++;
    return super.createWaitpoint(...args);
  }
}

/**
 * Builds the RunEngine constructor options shared by every test in this file.
 *
 * @param redisOptions Redis connection options, reused for the worker, queue and run lock.
 * @param prisma Database client handed to the engine.
 * @param extra Optional store and/or cross-seam guard; each key is only added to the
 *   options when a value was supplied.
 */
function engineOptions(
  redisOptions: any,
  prisma: any,
  extra?: { store?: PostgresRunStore; crossSeamGuard?: CrossSeamGuardHook }
) {
  return {
    prisma,
    worker: {
      redis: redisOptions,
      workers: 1,
      tasksPerWorker: 10,
      pollIntervalMs: 100,
    },
    queue: {
      redis: redisOptions,
      masterQueueConsumersDisabled: true,
      processWorkerQueueDebounceMs: 50,
    },
    runLock: {
      redis: redisOptions,
    },
    machines: {
      defaultMachine: "small-1x" as const,
      machines: {
        "small-1x": {
          name: "small-1x" as const,
          cpu: 0.5,
          memory: 0.5,
          centsPerMs: 0.0001,
        },
      },
      baseCostInCents: 0.0001,
    },
    tracer: trace.getTracer("test", "0.0.0"),
    ...(extra?.store ? { store: extra.store } : {}),
    ...(extra?.crossSeamGuard ? { crossSeamGuard: extra.crossSeamGuard } : {}),
  };
}

/**
 * Triggers a "test-task" run with the given friendly id, passing `prisma` as the explicit tx.
 * Trace and span ids are derived from the friendly id.
 */
async function triggerRun(engine: RunEngine, environment: any, prisma: any, friendlyId: string) {
  return engine.trigger(
    {
      number: 1,
      friendlyId,
      environment,
      taskIdentifier: "test-task",
      payload: "{}",
      payloadType: "application/json",
      context: {},
      traceContext: {},
      traceId: `t-${friendlyId}`,
      spanId: `s-${friendlyId}`,
      workerQueue: "main",
      queue: "task/test-task",
      isTest: false,
      tags: [],
    },
    prisma
  );
}

// Trigger a run and drive it to EXECUTING (dequeue + start attempt), so it can be blocked.
async function triggerAndStart(
  engine: RunEngine,
  environment: any,
  prisma: any,
  friendlyId: string
) {
  const run = await triggerRun(engine, environment, prisma, friendlyId);
  // Pause before dequeuing (the queue is configured with a 50ms processing debounce).
  await setTimeout(500);
  await engine.dequeueFromWorkerQueue({
    consumerId: `consumer-${friendlyId}`,
    workerQueue: "main",
  });
  const executionData = await engine.getRunExecutionData({ runId: run.id });
  assertNonNullable(executionData);
  await engine.startRunAttempt({ runId: run.id, snapshotId: executionData.snapshot.id });
  return run;
}

/** Inserts a batchTaskRun row with a generated friendly id in the given environment. */
async function createBatch(prisma: any, environment: any) {
  return prisma.batchTaskRun.create({
    data: {
      friendlyId: generateFriendlyId("batch"),
      runtimeEnvironmentId: environment.id,
    },
  });
}

describe("RunEngine public waitpoint router", () => {
  // getWaitpoint routes its read through the store seam, preserving the env-mismatch guard.
containerTest("getWaitpoint reads through the store", async ({ prisma, redisOptions }) => {
  const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
  const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma });
  const runEngine = new RunEngine(engineOptions(redisOptions, prisma, { store: countingStore }));

  try {
    const environmentId = env.id;
    const projectId = env.project.id;

    const { waitpoint } = await runEngine.createManualWaitpoint({ environmentId, projectId });

    const findsBeforeLookup = countingStore.counts.findWaitpoint;
    const found = await runEngine.getWaitpoint({
      waitpointId: waitpoint.id,
      environmentId,
      projectId,
    });

    // routed through the store exactly once
    expect(countingStore.counts.findWaitpoint).toBe(findsBeforeLookup + 1);
    assertNonNullable(found);
    expect(found.id).toBe(waitpoint.id);
    // the include shape is preserved (blockingTaskRuns is present, even if empty)
    const { blockingTaskRuns } = found as any;
    expect(blockingTaskRuns).toBeDefined();
    expect(Array.isArray(blockingTaskRuns)).toBe(true);

    // the read is pinned to the PRIMARY client, not defaulted to the replica
    expect(countingStore.lastFindWaitpointClient).toBe(prisma);

    // env-mismatch guard still returns null
    const wrongEnvLookup = await runEngine.getWaitpoint({
      waitpointId: waitpoint.id,
      environmentId: "env_does_not_exist",
      projectId,
    });
    expect(wrongEnvLookup).toBeNull();

    // a non-existent waitpointId drives the `if (!waitpoint) return null` branch
    const findsBeforeMissing = countingStore.counts.findWaitpoint;
    const missingLookup = await runEngine.getWaitpoint({
      waitpointId: "waitpoint_does_not_exist",
      environmentId,
      projectId,
    });
    expect(missingLookup).toBeNull();
    // not-found was reached THROUGH the store (not short-circuited)
    expect(countingStore.counts.findWaitpoint).toBe(findsBeforeMissing + 1);
  } finally {
    await runEngine.quit();
  }
});

// blockRunWithCreatedBatch routes its BATCH waitpoint create
// through the store (non-tx path), links the run, and preserves the P2002
// duplicate-idempotency-key -> null path.
containerTest(
  "blockRunWithCreatedBatch writes the BATCH waitpoint through the store",
  async ({ prisma, redisOptions }) => {
    const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
    const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma });
    const runEngine = new RunEngine(engineOptions(redisOptions, prisma, { store: countingStore }));

    try {
      await setupBackgroundWorker(runEngine, env, "test-task");
      const run = await triggerAndStart(runEngine, env, prisma, "run_batchone");
      const batch = await createBatch(prisma, env);

      // Fresh argument object per call; both calls target the same run and batch.
      const blockArgs = () => ({
        runId: run.id,
        batchId: batch.id,
        environmentId: env.id,
        projectId: env.project.id,
        organizationId: env.organization.id,
      });

      const createsBefore = countingStore.counts.createWaitpoint;
      const batchWaitpoint = await runEngine.blockRunWithCreatedBatch(blockArgs());

      // routed through the store
      expect(countingStore.counts.createWaitpoint).toBe(createsBefore + 1);
      assertNonNullable(batchWaitpoint);
      expect(batchWaitpoint.type).toBe("BATCH");
      expect(batchWaitpoint.completedByBatchId).toBe(batch.id);

      // the BATCH waitpoint row exists
      const persistedWaitpoint = await prisma.waitpoint.findFirst({
        where: { id: batchWaitpoint.id },
      });
      expect(persistedWaitpoint?.type).toBe("BATCH");
      expect(persistedWaitpoint?.completedByBatchId).toBe(batch.id);

      // the run is now blocked: a TaskRunWaitpoint edge links run -> waitpoint
      const blockingEdge = await prisma.taskRunWaitpoint.findFirst({
        where: { taskRunId: run.id, waitpointId: batchWaitpoint.id },
      });
      assertNonNullable(blockingEdge);

      // second call with the same batchId => duplicate idempotency key (P2002) => null
      const duplicate = await runEngine.blockRunWithCreatedBatch(blockArgs());
      expect(duplicate).toBeNull();
    } finally {
      await runEngine.quit();
    }
  }
);

// With a tx supplied, the create is routed through the store with that tx pinned as the
// client, so the waitpoint is persisted via the caller's transaction.
containerTest(
  "blockRunWithCreatedBatch with a tx pins the create to the tx client",
  async ({ prisma, redisOptions }) => {
    const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
    const countingStore = new CountingPostgresRunStore({ prisma, readOnlyPrisma: prisma });
    const runEngine = new RunEngine(engineOptions(redisOptions, prisma, { store: countingStore }));

    try {
      await setupBackgroundWorker(runEngine, env, "test-task");
      const run = await triggerAndStart(runEngine, env, prisma, "run_batchtx");
      const batch = await createBatch(prisma, env);

      // Same call as the non-tx test, but with the prisma client passed as `tx`.
      const batchWaitpoint = await runEngine.blockRunWithCreatedBatch({
        runId: run.id,
        batchId: batch.id,
        environmentId: env.id,
        projectId: env.project.id,
        organizationId: env.organization.id,
        tx: prisma,
      });

      assertNonNullable(batchWaitpoint);
      expect(batchWaitpoint.type).toBe("BATCH");
      expect(batchWaitpoint.completedByBatchId).toBe(batch.id);

      // the waitpoint was created via the provided client and is readable afterwards
      const persistedWaitpoint = await prisma.waitpoint.findFirst({
        where: { id: batchWaitpoint.id },
      });
      expect(persistedWaitpoint?.type).toBe("BATCH");
    } finally {
      await runEngine.quit();
    }
  }
);

// The delegators still work through the (already system-routed) public API.
containerTest(
  "delegators (create/block/getOrCreate) work through the public API",
  async ({ prisma, redisOptions }) => {
    const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
    const runEngine = new RunEngine(engineOptions(redisOptions, prisma));

    try {
      await setupBackgroundWorker(runEngine, env, "test-task");
      const run = await triggerAndStart(runEngine, env, prisma, "run_delegators1");

      const projectId = env.project.id;
      const environmentId = env.id;

      // createDateTimeWaitpoint
      const oneMinuteFromNow = new Date(Date.now() + 60_000);
      const dateTimeResult = await runEngine.createDateTimeWaitpoint({
        projectId,
        environmentId,
        completedAfter: oneMinuteFromNow,
      });
      expect(dateTimeResult.waitpoint.type).toBe("DATETIME");

      // createManualWaitpoint
      const manualResult = await runEngine.createManualWaitpoint({ environmentId, projectId });
      const manualWaitpoint = manualResult.waitpoint;
      expect(manualWaitpoint.type).toBe("MANUAL");
      expect(manualWaitpoint.status).toBe("PENDING");

      // blockRunWithWaitpoint
      const blockedSnapshot = await runEngine.blockRunWithWaitpoint({
        runId: run.id,
        waitpoints: [manualWaitpoint.id],
        projectId,
        organizationId: env.organization.id,
      });
      expect(blockedSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS");

      // The block must have persisted a run -> waitpoint edge.
      const blockingEdge = await prisma.taskRunWaitpoint.findFirst({
        where: { taskRunId: run.id, waitpointId: manualWaitpoint.id },
      });
      assertNonNullable(blockingEdge);

      // getOrCreateRunWaitpoint
      const runWaitpoint = await runEngine.getOrCreateRunWaitpoint({
        runId: run.id,
        projectId,
        environmentId,
      });
      expect(runWaitpoint.type).toBe("RUN");
    } finally {
      await runEngine.quit();
    }
  }
);

// The public completeWaitpoint consults the cross-seam hook (RESUME_TOKEN) first, then
// unconditionally delegates.
+ containerTest( + "completeWaitpoint consults the cross-seam hook (RESUME_TOKEN) then delegates", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const seen: Array<{ waitpointId: string; routeKind: string }> = []; + const engine = new RunEngine( + engineOptions(redisOptions, prisma, { + crossSeamGuard: async ({ waitpointId, routeKind }) => { + seen.push({ waitpointId, routeKind }); + return { store: "legacy", residency: "LEGACY", routeKind }; + }, + }) + ); + + try { + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: environment.id, + projectId: environment.project.id, + }); + expect(waitpoint.status).toBe("PENDING"); + + await engine.completeWaitpoint({ + id: waitpoint.id, + output: { value: "{}", isError: false }, + }); + + // hook consulted FIRST with the right id + RESUME_TOKEN route kind + expect(seen).toEqual([{ waitpointId: waitpoint.id, routeKind: "RESUME_TOKEN" }]); + + // completion then applied via the unconditional delegation + const after = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(after?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "completeWaitpoint with a throwing guard does not apply (loud, no silent local apply)", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine( + engineOptions(redisOptions, prisma, { + crossSeamGuard: async () => { + throw new Error("UnclassifiableRunId"); + }, + }) + ); + + try { + const { waitpoint } = await engine.createManualWaitpoint({ + environmentId: environment.id, + projectId: environment.project.id, + }); + await expect( + engine.completeWaitpoint({ id: waitpoint.id, output: { value: "{}", isError: false } }) + ).rejects.toThrow(); + + const after = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + 
expect(after?.status).toBe("PENDING"); + } finally { + await engine.quit(); + } + } + ); + + // Single-DB passthrough: a full public-API round-trip over the one client reads back exactly + // what it wrote, with no crossSeamGuard and no second connection (proven by behavior). + containerTest( + "single-DB passthrough: full public round-trip behaves as today", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + // default single PostgresRunStore (no injected store), no crossSeamGuard + const engine = new RunEngine(engineOptions(redisOptions, prisma)); + + try { + await setupBackgroundWorker(engine, environment, "test-task"); + const run = await triggerAndStart(engine, environment, prisma, "run_passthrough1"); + const batch = await createBatch(prisma, environment); + + // blockRunWithCreatedBatch persists the BATCH waitpoint + edge + const waitpoint = await engine.blockRunWithCreatedBatch({ + runId: run.id, + batchId: batch.id, + environmentId: environment.id, + projectId: environment.project.id, + organizationId: environment.organization.id, + }); + assertNonNullable(waitpoint); + + // getWaitpoint reads back the row it wrote (with the include shape) + const fetched = await engine.getWaitpoint({ + waitpointId: waitpoint.id, + environmentId: environment.id, + projectId: environment.project.id, + }); + assertNonNullable(fetched); + expect(fetched.id).toBe(waitpoint.id); + expect(fetched.type).toBe("BATCH"); + + // duplicate batchId returns null + const dup = await engine.blockRunWithCreatedBatch({ + runId: run.id, + batchId: batch.id, + environmentId: environment.id, + projectId: environment.project.id, + organizationId: environment.organization.id, + }); + expect(dup).toBeNull(); + + // completeWaitpoint marks COMPLETED and unblocks the run (no guard => exactly as today) + await engine.completeWaitpoint({ + id: waitpoint.id, + output: { value: "{}", isError: false }, + }); + const after = await 
prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(after?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); + + // FK-drop app-integrity: the routed create + block + complete round-trip introduces no + // dependency on a physical control-plane FK, and the persisted rows are well-formed. + containerTest( + "FK-drop app-integrity: routed waitpoint round-trip is well-formed and FK-independent", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(engineOptions(redisOptions, prisma)); + + try { + await setupBackgroundWorker(engine, environment, "test-task"); + const run = await triggerAndStart(engine, environment, prisma, "run_fkintegrity"); + const batch = await createBatch(prisma, environment); + + const waitpoint = await engine.blockRunWithCreatedBatch({ + runId: run.id, + batchId: batch.id, + environmentId: environment.id, + projectId: environment.project.id, + organizationId: environment.organization.id, + }); + assertNonNullable(waitpoint); + + // rows are well-formed: the waitpoint carries its env/project, the edge links run->wp + const wpRow = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(wpRow?.environmentId).toBe(environment.id); + expect(wpRow?.projectId).toBe(environment.project.id); + + const edge = await prisma.taskRunWaitpoint.findFirst({ + where: { taskRunId: run.id, waitpointId: waitpoint.id }, + }); + assertNonNullable(edge); + expect(edge.projectId).toBe(environment.project.id); + + // completion round-trip still succeeds (no reliance on a cross-server cascade) + await engine.completeWaitpoint({ + id: waitpoint.id, + output: { value: "{}", isError: false }, + }); + const after = await prisma.waitpoint.findFirst({ where: { id: waitpoint.id } }); + expect(after?.status).toBe("COMPLETED"); + } finally { + await engine.quit(); + } + } + ); +}); diff --git 
a/internal-packages/run-engine/src/engine/tests/waitpointSystem.controlPlaneResolver.test.ts b/internal-packages/run-engine/src/engine/tests/waitpointSystem.controlPlaneResolver.test.ts new file mode 100644 index 00000000000..e364ea30901 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/waitpointSystem.controlPlaneResolver.test.ts @@ -0,0 +1,213 @@ +// Cross-DB inversion proof for the waitpoint env include (continueRunIfUnblocked). +// Cloud topology: run-ops = new DB (PG17, cross-seam FKs DROPPED), control-plane = legacy DB (PG14). +// The env (with maxConc/burstFactor/project/organization) lives on PG14; the run-ops scalar row on +// PG17. The PassthroughControlPlaneResolver over PG14 resolves the env half (which satisfies +// MinimalAuthenticatedEnvironment for enqueueRun) while the run scalars come from PG17 — no +// cross-DB join. The DB is never mocked. A single-DB passthrough case proves continueRunIfUnblocked +// re-queues byte-identically through the resolved env. 
+import { assertNonNullable, containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import type { PrismaClient } from "@trigger.dev/database"; +import { setTimeout } from "node:timers/promises"; +import { expect } from "vitest"; +import { MinimalAuthenticatedEnvironment } from "../../shared/index.js"; +import { PassthroughControlPlaneResolver } from "../controlPlaneResolver.js"; +import { PostgresRunStore } from "@internal/run-store"; +import { RunEngine } from "../index.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function createEngineOptions(redisOptions: any, prisma: any) { + return { + prisma, + worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const constraint of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe( + `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"` + ); + } +} + +async function seedControlPlaneEnv(prisma: PrismaClient, suffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: 
organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `prod-${suffix}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${suffix}`, + pkApiKey: `pk_prod_${suffix}`, + shortcode: `short_${suffix}`, + maximumConcurrencyLimit: 13, + }, + }); + return { organization, project, environment }; +} + +describe("WaitpointSystem controlPlaneResolver (hetero cross-DB)", () => { + heteroPostgresTest( + "env resolves from PG14 (control-plane) while the run scalars live on PG17 (no cross-DB join)", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedControlPlaneEnv(prisma14 as unknown as PrismaClient, "cpwp"); + + const runId = "run_cpwp_pg17"; + await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: "run_friendly_cpwp", + runtimeEnvironmentId: cp.environment.id, + organizationId: cp.organization.id, + projectId: cp.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + queue: "task/my-task", + traceId: "trace_cpwp", + spanId: "span_cpwp", + }, + }); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = new PassthroughControlPlaneResolver({ + prisma: prisma14 as unknown as PrismaClient, + }); + + const run = await runStore.findRun({ id: runId }); + assertNonNullable(run); + expect(run.runtimeEnvironmentId).toBe(cp.environment.id); + + const env = await resolver.resolveEnv(run.runtimeEnvironmentId); + assertNonNullable(env); + // The resolved env carries everything enqueueRun's MinimalAuthenticatedEnvironment needs. 
+ const asMinimal: MinimalAuthenticatedEnvironment = env; + expect(asMinimal.id).toBe(cp.environment.id); + expect(asMinimal.type).toBe("PRODUCTION"); + expect(asMinimal.maximumConcurrencyLimit).toBe(13); + expect(asMinimal.concurrencyLimitBurstFactor.toNumber()).toBe(2); + expect(asMinimal.project.id).toBe(cp.project.id); + expect(asMinimal.organization.id).toBe(cp.organization.id); + + // Inversion: the run-ops DB (PG17) holds no env row; a co-located join would resolve null. + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + } + ); +}); + +describe("WaitpointSystem controlPlaneResolver (single-DB passthrough)", () => { + containerTest( + "continueRunIfUnblocked re-queues byte-identically through the resolved env", + async ({ prisma, redisOptions }) => { + const environment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine(createEngineOptions(redisOptions, prisma)); + + try { + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, environment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_cpwppassthru1", + environment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t-cpwp", + spanId: "s-cpwp", + workerQueue: "main", + queue: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); + assertNonNullable(dequeued[0]); + + await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + const waitpoint = await engine.createManualWaitpoint({ + environmentId: environment.id, + projectId: environment.projectId, + }); + + await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint.waitpoint.id, + projectId: environment.projectId, + organizationId: 
environment.organizationId, + }); + + const blocked = await engine.getRunExecutionData({ runId: run.id }); + expect(blocked?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Completing the waitpoint drives continueRunIfUnblocked, which resolves the env via the + // resolver and unblocks the run. + await engine.completeWaitpoint({ id: waitpoint.waitpoint.id }); + await setTimeout(300); + + const unblocked = await engine.getRunExecutionData({ runId: run.id }); + expect(unblocked?.snapshot.executionStatus).toBe("EXECUTING"); + + const stillBlocking = await prisma.taskRunWaitpoint.findFirst({ + where: { taskRunId: run.id }, + }); + expect(stillBlocking).toBeNull(); + } finally { + await engine.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts index a65eca347c6..6cb34716937 100644 --- a/internal-packages/run-engine/src/engine/types.ts +++ b/internal-packages/run-engine/src/engine/types.ts @@ -8,11 +8,13 @@ import { TriggerTraceContext, } from "@trigger.dev/core/v3"; import { PrismaClient, PrismaReplicaClient, TaskRun, Waitpoint } from "@trigger.dev/database"; +import { RunStore } from "@internal/run-store"; import { Worker, type WorkerConcurrencyOptions, type GlobalRateLimiter, } from "@trigger.dev/redis-worker"; +import { type ControlPlaneResolver } from "./controlPlaneResolver.js"; import { FairQueueSelectionStrategyOptions } from "../run-queue/fairQueueSelectionStrategy.js"; import { MinimalAuthenticatedEnvironment } from "../shared/index.js"; import { LockRetryConfig } from "./locking.js"; @@ -21,9 +23,44 @@ import { type BillingPlan } from "./billingCache.js"; import type { DRRConfig } from "../batch-queue/types.js"; import type { PendingVersionRunIdLookup } from "./services/pendingVersionLookup.js"; +/** + * Structural mirror of the webapp's CrossSeamGuardDecision + * (apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts). 
+ * Re-declared here because @internal/run-engine must not depend on the webapp. + * Keep field names identical so the injected value is assignable. + */ +export type CrossSeamGuardDecision = { + store: "new" | "legacy"; + residency: "NEW" | "LEGACY"; + routeKind: string; + pinnedReason?: string; +}; + +/** + * Optional cross-seam residency store-selection guard for waitpoint completion. + * Injected by the webapp as `pickRunOpsStoreForCompletion`. + * A no-op (returns store="legacy", the single store) when the split is OFF — the + * webapp wrapper short-circuits without classifying. + * When omitted entirely (self-host, tests), completeWaitpoint behaves exactly + * as today. + */ +export type CrossSeamGuardHook = (input: { + waitpointId: string; + routeKind: "MANUAL" | "DATETIME" | "RESUME_TOKEN" | "IDEMPOTENCY_REUSE" | "RUN"; +}) => Promise<CrossSeamGuardDecision>; + export type RunEngineOptions = { prisma: PrismaClient; readOnlyPrisma?: PrismaReplicaClient; + /** Optional RunStore implementation to inject. Defaults to a PostgresRunStore + * built from `prisma`/`readOnlyPrisma`, so single-DB / self-host behavior is unchanged. */ + store?: RunStore; + /** Optional ControlPlaneResolver to inject. Defaults to a PassthroughControlPlaneResolver + * built from `prisma`/`readOnlyPrisma` (in-DB joins), so single-DB / self-host behavior is + * unchanged. The webapp injects an adapter over its cross-DB cached resolver. */ + controlPlaneResolver?: ControlPlaneResolver; + /** Optional cross-seam store-selection guard. Omit for single-DB / tests.
*/ + crossSeamGuard?: CrossSeamGuardHook; worker: { disabled?: boolean; redis: RedisOptions; diff --git a/internal-packages/run-engine/src/index.ts b/internal-packages/run-engine/src/index.ts index 43ca7f177c6..b385f5e753e 100644 --- a/internal-packages/run-engine/src/index.ts +++ b/internal-packages/run-engine/src/index.ts @@ -12,6 +12,13 @@ export type { PendingVersionRunIdLookupResult, } from "./engine/services/pendingVersionLookup.js"; export { NoopPendingVersionRunIdLookup } from "./engine/services/pendingVersionLookup.js"; +export { PassthroughControlPlaneResolver } from "./engine/controlPlaneResolver.js"; +export type { + ControlPlaneResolver, + ResolvedEngineEnv, + ResolvedAuthenticatedEnv, + ResolvedWorkerVersion, +} from "./engine/controlPlaneResolver.js"; // Batch Queue exports export { BatchQueue, BatchCompletionTracker } from "./batch-queue/index.js"; diff --git a/internal-packages/run-store/package.json b/internal-packages/run-store/package.json index 096888c4e96..481dcae4b9a 100644 --- a/internal-packages/run-store/package.json +++ b/internal-packages/run-store/package.json @@ -14,6 +14,7 @@ } }, "dependencies": { + "@internal/run-ops-database": "workspace:*", "@trigger.dev/core": "workspace:*", "@trigger.dev/database": "workspace:*" }, diff --git a/internal-packages/run-store/src/NoopRunStore.ts b/internal-packages/run-store/src/NoopRunStore.ts deleted file mode 100644 index 067aa3de096..00000000000 --- a/internal-packages/run-store/src/NoopRunStore.ts +++ /dev/null @@ -1,89 +0,0 @@ -import type { RunStore } from "./types.js"; - -/** Test double: throws on any call. Inject into units that must not write runs. 
*/ -export class NoopRunStore implements RunStore { - private fail(method: string): never { - throw new Error(`NoopRunStore.${method} called`); - } - createRun(): never { - return this.fail("createRun"); - } - createCancelledRun(): never { - return this.fail("createCancelledRun"); - } - createFailedRun(): never { - return this.fail("createFailedRun"); - } - startAttempt(): never { - return this.fail("startAttempt"); - } - completeAttemptSuccess(): never { - return this.fail("completeAttemptSuccess"); - } - recordRetryOutcome(): never { - return this.fail("recordRetryOutcome"); - } - requeueRun(): never { - return this.fail("requeueRun"); - } - recordBulkActionMembership(): never { - return this.fail("recordBulkActionMembership"); - } - cancelRun(): never { - return this.fail("cancelRun"); - } - failRunPermanently(): never { - return this.fail("failRunPermanently"); - } - expireRun(): never { - return this.fail("expireRun"); - } - expireRunsBatch(): never { - return this.fail("expireRunsBatch"); - } - lockRunToWorker(): never { - return this.fail("lockRunToWorker"); - } - parkPendingVersion(): never { - return this.fail("parkPendingVersion"); - } - promotePendingVersionRuns(): never { - return this.fail("promotePendingVersionRuns"); - } - suspendForCheckpoint(): never { - return this.fail("suspendForCheckpoint"); - } - resumeFromCheckpoint(): never { - return this.fail("resumeFromCheckpoint"); - } - rescheduleRun(): never { - return this.fail("rescheduleRun"); - } - enqueueDelayedRun(): never { - return this.fail("enqueueDelayedRun"); - } - rewriteDebouncedRun(): never { - return this.fail("rewriteDebouncedRun"); - } - updateMetadata(): never { - return this.fail("updateMetadata"); - } - clearIdempotencyKey(): never { - return this.fail("clearIdempotencyKey"); - } - pushTags(): never { - return this.fail("pushTags"); - } - pushRealtimeStream(): never { - return this.fail("pushRealtimeStream"); - } - findRun(): never { - return this.fail("findRun"); - } - 
findRunOrThrow(): never { - return this.fail("findRunOrThrow"); - } - findRuns(): never { - return this.fail("findRuns"); - } -} diff --git a/internal-packages/run-store/src/PostgresRunStore.batchProbeReadClient.test.ts b/internal-packages/run-store/src/PostgresRunStore.batchProbeReadClient.test.ts new file mode 100644 index 00000000000..ec6ec5cd3ec --- /dev/null +++ b/internal-packages/run-store/src/PostgresRunStore.batchProbeReadClient.test.ts @@ -0,0 +1,294 @@ +// RED→GREEN repro for the run-ops split BASELINE BLOCKER: +// RoutingRunStore cross-DB PROBE reads forward the caller's control-plane `client` into the #new +// sub-store probe, so #new queries the CONTROL-PLANE DB instead of its own (5434) and never finds a +// ksuid-resident batch/attempt → returns null. Live effect: batchSystem.#tryCompleteBatch calls +// `runStore.findBatchTaskRunById(batchId, undefined, this.$.prisma)` → null → "batch doesn't exist" +// → the batch waitpoint is never completed → every `batchTriggerAndWait` parent hangs forever. +// +// `heteroRunOpsPostgresTest` gives a REAL split topology: prisma17 = real RunOpsPrismaClient over the +// dedicated subset schema (#new / 5434), prisma14 = full legacy schema on a SEPARATE physical PG +// container (#legacy / control-plane). NEVER mocked. The repro seeds a ksuid batch (and a ksuid +// attempt) on #new and probes via the router passing the LEGACY client as the read client — exactly +// as the live caller does. RED before the fix (router forwards the client → #new reads control-plane +// → null); GREEN after (router drops the client → #new reads its own DB → finds the row). 
+ +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { describe, expect } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import { RoutingRunStore } from "./runOpsStore.js"; +import type { RunStoreSchemaVariant } from "./types.js"; + +type AnyClient = PrismaClient | RunOpsPrismaClient; + +// ownerEngine classifies by internal-id LENGTH (runOpsResidency.ts): 25 chars → cuid → LEGACY, +// 27 chars → ksuid → NEW. +const CUID_25 = "c".repeat(25); // → LEGACY (#legacy / prisma14, full schema) +const KSUID_27 = "k".repeat(27); // → NEW (#new / prisma17, dedicated subset schema) + +async function seedEnvironment( + prisma: AnyClient, + schemaVariant: RunStoreSchemaVariant, + suffix: string +) { + if (schemaVariant === "dedicated") { + return { + organization: { id: `org_${suffix}` }, + project: { id: `proj_${suffix}` }, + environment: { id: `env_${suffix}` }, + }; + } + const organization = await (prisma as PrismaClient).organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await (prisma as PrismaClient).project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await (prisma as PrismaClient).runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${suffix}`, + pkApiKey: `pk_dev_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +function makeDedicatedStore(prisma17: RunOpsPrismaClient) { + return new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); +} + +function makeLegacyStore(prisma14: 
PrismaClient) { + return new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); +} + +// Real production split topology: #new = dedicated subset on prisma17, #legacy = full schema on +// prisma14 — two physically distinct DBs. +function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const legacyStore = makeLegacyStore(prisma14); + const newStore = makeDedicatedStore(prisma17); + return { + router: new RoutingRunStore({ new: newStore, legacy: legacyStore }), + legacyStore, + newStore, + }; +} + +describe("run-ops split — cross-DB probe reads must NOT forward the caller's control-plane client", () => { + // findBatchTaskRunById — the live batchTriggerAndWait hang: #tryCompleteBatch probes with the + // control-plane client, which the router forwarded into #new → #new read the wrong DB → null. + heteroRunOpsPostgresTest( + "findBatchTaskRunById FINDS a ksuid batch on #new even when probed with the LEGACY (control-plane) client", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "batchprobe_new"); + const batchId = `batch_${KSUID_27}`; // ksuid → #new + + // Seed the batch directly on #new (5434), exactly where a runEngine-routed ksuid batch lives. + await prisma17.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_probe_new", + runtimeEnvironmentId: env.environment.id, + runCount: 3, + successfulRunCount: 3, + status: "PENDING", + }, + }); + + // Probe EXACTLY as batchSystem.#tryCompleteBatch does: pass the control-plane client. + // RED before fix: null (probed control-plane). GREEN after: resolved from #new's own DB. 
+ const found = await router.findBatchTaskRunById(batchId, undefined, prisma14 as never); + expect(found).not.toBeNull(); + expect(found!.id).toBe(batchId); + expect(found!.successfulRunCount).toBe(3); + } + ); + + // Control: a cuid batch on #legacy is still found through the router when probed with the same + // (legacy) client — proving the fix does not regress the legacy cohort. + heteroRunOpsPostgresTest( + "findBatchTaskRunById control: a cuid batch on #legacy is still found", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma14, "legacy", "batchprobe_leg"); + const batchId = `batch_${CUID_25}`; // cuid → #legacy + + await prisma14.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_probe_leg", + runtimeEnvironmentId: env.environment.id, + runCount: 1, + successfulRunCount: 1, + status: "PENDING", + }, + }); + + const found = await router.findBatchTaskRunById(batchId, undefined, prisma14 as never); + expect(found).not.toBeNull(); + expect(found!.id).toBe(batchId); + } + ); + + // findBatchTaskRunByFriendlyId — same anti-pattern (env-scoped friendlyId probe). 
+ heteroRunOpsPostgresTest( + "findBatchTaskRunByFriendlyId FINDS a ksuid batch on #new despite the LEGACY client", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "batchfid_new"); + const batchId = `batch_${KSUID_27}`; + const friendlyId = "batch_fid_new"; + + await prisma17.batchTaskRun.create({ + data: { + id: batchId, + friendlyId, + runtimeEnvironmentId: env.environment.id, + status: "PENDING", + }, + }); + + const found = await router.findBatchTaskRunByFriendlyId( + friendlyId, + env.environment.id, + undefined, + prisma14 as never + ); + expect(found).not.toBeNull(); + expect(found!.id).toBe(batchId); + } + ); + + // findBatchTaskRunByIdempotencyKey — same anti-pattern (env + idempotency-key probe). + heteroRunOpsPostgresTest( + "findBatchTaskRunByIdempotencyKey FINDS a ksuid batch on #new despite the LEGACY client", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "batchidem_new"); + const batchId = `batch_${KSUID_27}`; + const idempotencyKey = "idem_batch_new"; + + await prisma17.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_idem_new", + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + status: "PENDING", + }, + }); + + const found = await router.findBatchTaskRunByIdempotencyKey( + env.environment.id, + idempotencyKey, + undefined, + prisma14 as never + ); + expect(found).not.toBeNull(); + expect(found!.id).toBe(batchId); + } + ); + + // findTaskRunAttempt — same anti-pattern. A classifiable taskRunId routes to the owning store + // (#new for a ksuid run) but the control-plane client was still forwarded into it. 
+ heteroRunOpsPostgresTest( + "findTaskRunAttempt FINDS a ksuid attempt on #new even when probed with the LEGACY client", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "attempt_new"); + const runId = `run_${KSUID_27}`; // ksuid run → #new + const attemptId = `attempt_${KSUID_27}`; + + // The attempt's owning run lives on #new (the FK is co-resident on the dedicated schema). + await prisma17.taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "EXECUTING", + friendlyId: "run_attempt_new", + runtimeEnvironmentId: env.environment.id, + environmentType: "DEVELOPMENT", + organizationId: env.organization.id, + projectId: env.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: `trace_${runId}`, + spanId: `span_${runId}`, + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + }, + }); + await prisma17.taskRunAttempt.create({ + data: { + id: attemptId, + number: 1, + friendlyId: "attempt_fid_new", + taskRunId: runId, + backgroundWorkerId: `bw_${KSUID_27}`, + backgroundWorkerTaskId: `bwt_${KSUID_27}`, + runtimeEnvironmentId: env.environment.id, + queueId: `queue_${KSUID_27}`, + status: "PENDING", + }, + }); + + // Probe with the LEGACY client, mirroring callers that pass the control-plane handle. + const found = await router.findTaskRunAttempt( + { where: { taskRunId: runId } }, + prisma14 as never + ); + expect(found).not.toBeNull(); + expect(found!.id).toBe(attemptId); + } + ); + + // Split-OFF guard: with a single store configured, the probe finds the batch with or without a + // passed client (the one configured store reads its own DB either way) — no behavior change. 
+ heteroRunOpsPostgresTest( + "split-OFF: a single-store router finds the batch with or without a passed client", + async ({ prisma17 }) => { + const newStore = makeDedicatedStore(prisma17); + // Single-DB config: both slots point at the same dedicated store (split effectively OFF). + const router = new RoutingRunStore({ new: newStore, legacy: newStore }); + const env = await seedEnvironment(prisma17, "dedicated", "splitoff_new"); + const batchId = `batch_${KSUID_27}`; + + await prisma17.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_splitoff", + runtimeEnvironmentId: env.environment.id, + status: "PENDING", + }, + }); + + const withoutClient = await router.findBatchTaskRunById(batchId); + const withClient = await router.findBatchTaskRunById(batchId, undefined, prisma17 as never); + expect(withoutClient?.id).toBe(batchId); + expect(withClient?.id).toBe(batchId); + } + ); +}); diff --git a/internal-packages/run-store/src/PostgresRunStore.crossGenerationError.test.ts b/internal-packages/run-store/src/PostgresRunStore.crossGenerationError.test.ts new file mode 100644 index 00000000000..7dee0d3bd59 --- /dev/null +++ b/internal-packages/run-store/src/PostgresRunStore.crossGenerationError.test.ts @@ -0,0 +1,104 @@ +// Cross-generation Prisma error normalization LOCK. +// +// The store can be backed by the run-ops `@internal/run-ops-database` client, a SEPARATELY +// generated Prisma client with its OWN `PrismaClientKnownRequestError` class object (distinct +// module identity from `@trigger.dev/database`'s, even at the same version). A P2002 raised by +// the run-ops client is therefore NOT `instanceof` the control-plane +// `Prisma.PrismaClientKnownRequestError` — so the webapp's uniform P2002→422 conversion +// (`error instanceof Prisma.PrismaClientKnownRequestError`) is skipped and a raw 500 escapes. 
+// +// PostgresRunStore normalizes at its write boundary: a routed NEW-client P2002 surfaces such +// that a control-plane `instanceof` catch (the 422 path) sees it. This test drives a REAL +// duplicate-key on the REAL run-ops-generation client (prisma17) through the store and asserts +// the surfaced error is recognized by the control-plane class — the exact predicate every +// routed-write caller uses. Fails before the normalization (raw foreign error ⇒ instanceof false). + +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import { Prisma } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { describe, expect } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import type { CreateBatchTaskRunData } from "./types.js"; + +function makeDedicatedStore(prisma17: RunOpsPrismaClient) { + return new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); +} + +function batchData(overrides: Partial<CreateBatchTaskRunData> = {}): CreateBatchTaskRunData { + return { + id: `batch_${"x".repeat(24)}`, + friendlyId: "batch_dup_friendly", + runtimeEnvironmentId: "env_cgerr", + status: "PENDING", + runCount: 1, + expectedCount: 1, + batchVersion: "runengine:v2", + sealed: false, + ...overrides, + }; +} + +describe("PostgresRunStore — cross-generation Prisma error normalization", () => { + heteroRunOpsPostgresTest( + "a routed NEW-client P2002 surfaces as a control-plane instanceof Prisma.PrismaClientKnownRequestError", + async ({ prisma17 }) => { + const store = makeDedicatedStore(prisma17); + + // First create succeeds; second collides on the unique friendlyId → NEW-generation P2002.
+ await store.createBatchTaskRun(batchData({ id: `batch_${"a".repeat(24)}` })); + + let caught: unknown; + try { + await store.createBatchTaskRun(batchData({ id: `batch_${"b".repeat(24)}` })); + } catch (error) { + caught = error; + } + + // The control-plane `instanceof` catch (the P2002→422 path the webapp uses) must see it. + expect(caught instanceof Prisma.PrismaClientKnownRequestError).toBe(true); + const known = caught as Prisma.PrismaClientKnownRequestError; + expect(known.code).toBe("P2002"); + // code/message/meta are preserved through the normalization. + expect(typeof known.message).toBe("string"); + expect(known.message.length).toBeGreaterThan(0); + expect(known.clientVersion).toBeTruthy(); + } + ); + + heteroRunOpsPostgresTest( + "a NEW-client P2002 inside runInTransaction is also normalized to the control-plane class", + async ({ prisma17 }) => { + const store = makeDedicatedStore(prisma17); + + await store.createBatchTaskRun(batchData({ id: `batch_${"c".repeat(24)}` })); + + let caught: unknown; + try { + await store.runInTransaction(undefined, async (txStore) => { + await txStore.createBatchTaskRun(batchData({ id: `batch_${"d".repeat(24)}` })); + }); + } catch (error) { + caught = error; + } + + expect(caught instanceof Prisma.PrismaClientKnownRequestError).toBe(true); + expect((caught as Prisma.PrismaClientKnownRequestError).code).toBe("P2002"); + } + ); + + heteroRunOpsPostgresTest( + "a successful NEW-client write is untouched by the normalization wrapper", + async ({ prisma17 }) => { + const store = makeDedicatedStore(prisma17); + + const created = await store.createBatchTaskRun(batchData({ id: `batch_${"e".repeat(24)}` })); + + expect(created.id).toBe(`batch_${"e".repeat(24)}`); + expect(created.friendlyId).toBe("batch_dup_friendly"); + } + ); +}); diff --git a/internal-packages/run-store/src/PostgresRunStore.dedicatedRepro.test.ts b/internal-packages/run-store/src/PostgresRunStore.dedicatedRepro.test.ts new file mode 100644 index 
00000000000..e0cc3e217b4 --- /dev/null +++ b/internal-packages/run-store/src/PostgresRunStore.dedicatedRepro.test.ts @@ -0,0 +1,1555 @@ +// Store-level regression suite for the run-ops split. +// +// These tests EMPIRICALLY PROVE the critical/high store-level correctness issues against the REAL +// dedicated subset schema (`heteroRunOpsPostgresTest.prisma17` is a real `RunOpsPrismaClient` over +// the @internal/run-ops-database SUBSET schema) and the full legacy schema on a SEPARATE physical +// PG container (`prisma14`). An earlier harness masked every one of these by backing the "#new" +// store with the FULL legacy schema and globally minting ksuid, so the split never ran against the +// dedicated schema. +// +// Each case either asserts the fixed behavior directly or, for a still-open item, wraps the broken +// behavior so the suite documents it. They are runnable (not skipped) so the behavior is +// demonstrated end-to-end against two physical DBs. + +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { describe, expect } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import { RoutingRunStore } from "./runOpsStore.js"; +import type { CreateRunInput, RunStoreSchemaVariant } from "./types.js"; + +type AnyClient = PrismaClient | RunOpsPrismaClient; + +// ownerEngine classifies by internal-id LENGTH (runOpsResidency.ts): 25 chars → cuid → LEGACY, +// 27 chars → ksuid → NEW. A `run_`-prefixed friendly id strips the first underscore before length. 
const CUID_25 = "c".repeat(25); // → LEGACY (#legacy / control-plane DB, full schema)
const KSUID_27 = "k".repeat(27); // → NEW (#new / dedicated run-ops DB, subset schema)

// Supplies the organization / project / environment a test attaches its rows to.
//
// The dedicated subset has no Organization/Project/RuntimeEnvironment models (its run-ops rows
// carry FK-free scalar ids), so that variant writes nothing and returns synthetic ids built from
// `suffix`. Any other variant inserts the three real rows, which the FKs kept on the legacy schema
// require.
async function seedEnvironment(
  prisma: AnyClient,
  schemaVariant: RunStoreSchemaVariant,
  suffix: string
) {
  if (schemaVariant === "dedicated") {
    return {
      organization: { id: `org_${suffix}` },
      project: { id: `proj_${suffix}` },
      environment: { id: `env_${suffix}` },
    };
  }

  // Only the full-schema client exposes the owning models.
  const legacyClient = prisma as PrismaClient;

  const organization = await legacyClient.organization.create({
    data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
  });

  const project = await legacyClient.project.create({
    data: {
      name: `Project ${suffix}`,
      slug: `project-${suffix}`,
      externalRef: `proj_${suffix}`,
      organizationId: organization.id,
    },
  });

  const environment = await legacyClient.runtimeEnvironment.create({
    data: {
      type: "DEVELOPMENT",
      slug: "dev",
      projectId: project.id,
      organizationId: organization.id,
      apiKey: `tr_dev_${suffix}`,
      pkApiKey: `pk_dev_${suffix}`,
      shortcode: `short_${suffix}`,
    },
  });

  return { organization, project, environment };
}

// Builds the CreateRunInput used by every test that needs a run: a PENDING V2 run of `my-task` in
// a DEVELOPMENT environment together with its RUN_CREATED snapshot. Only the ids vary per caller.
function buildCreateRunInput(params: {
  runId: string;
  friendlyId: string;
  organizationId: string;
  projectId: string;
  runtimeEnvironmentId: string;
}): CreateRunInput {
  const { runId, friendlyId, organizationId, projectId, runtimeEnvironmentId } = params;

  const input: CreateRunInput = {
    data: {
      id: runId,
      engine: "V2",
      status: "PENDING",
      friendlyId,
      runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      organizationId,
      projectId,
      taskIdentifier: "my-task",
      payload: '{"hello":"world"}',
      payloadType: "application/json",
      context: { foo: "bar" },
      traceContext: { trace: "ctx" },
      traceId: `trace_${runId}`,
      spanId: `span_${runId}`,
      runTags: [],
      queue: "task/my-task",
      isTest: false,
      taskEventStore: "taskEvent",
      depth: 0,
      createdAt: new Date("2024-01-01T00:00:00.000Z"),
    },
    snapshot: {
      engine: "V2",
      executionStatus: "RUN_CREATED",
      description: "Run was created",
      runStatus: "PENDING",
      environmentId: runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      projectId,
      organizationId,
    },
  };

  return input;
}

// Store over the dedicated run-ops subset client; the same client serves reads and writes.
function makeDedicatedStore(prisma17: RunOpsPrismaClient) {
  const client = prisma17 as never;
  return new PostgresRunStore({
    prisma: client,
    readOnlyPrisma: client,
    schemaVariant: "dedicated",
  });
}

// Store over the full legacy schema client; the same client serves reads and writes.
function makeLegacyStore(prisma14: PrismaClient) {
  const client = prisma14;
  return new PostgresRunStore({
    prisma: client,
    readOnlyPrisma: client,
    schemaVariant: "legacy",
  });
}

// The REAL production split topology: #new = dedicated subset on prisma17, #legacy = full schema on
// prisma14. Two physically-distinct DBs, dedicated schema on #new — exactly what a single-schema
// harness never wires. Returns the router plus both underlying stores for single-store assertions.
function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) {
  const stores = {
    legacyStore: makeLegacyStore(prisma14),
    newStore: makeDedicatedStore(prisma17),
  };
  const router = new RoutingRunStore({ new: stores.newStore, legacy: stores.legacyStore });
  return { router, ...stores };
}

describe("run-ops split — store-level behavior against the REAL dedicated schema", () => {
  // ===========================================================================================
  // continueRunIfUnblocked dedicated-schema relation-select validation throw.
// `continueRunIfUnblocked` reads edges with `select:{ waitpoint:{...} }`
// (waitpointSystem.ts); the dedicated `TaskRunWaitpoint` model has NO `waitpoint`
// relation (only a `waitpointId` scalar), and `PostgresRunStore.findManyTaskRunWaitpoints`
// must strip/hydrate rather than pass the args straight through. Against the real run-ops subset
// client an un-stripped select is a Prisma validation error → every waitpoint-blocked run hangs.
// ===========================================================================================

// Replays the exact select that continueRunIfUnblocked issues in step 1 against the dedicated
// client. No edges are seeded, so the call has to resolve to [] — the strip/hydrate branch in
// PostgresRunStore.findManyTaskRunWaitpoints covers the `waitpoint` relation the subset lacks.
heteroRunOpsPostgresTest(
  "findManyTaskRunWaitpoints with the continueRunIfUnblocked select does NOT throw on the DEDICATED client",
  async ({ prisma17 }) => {
    const blockedRunId = `run_${KSUID_27}`;
    const dedicatedStore = makeDedicatedStore(prisma17);

    const edges = await dedicatedStore.findManyTaskRunWaitpoints({
      where: { taskRunId: blockedRunId },
      select: {
        id: true,
        batchId: true,
        batchIndex: true,
        waitpoint: {
          select: { id: true, status: true, type: true, completedAfter: true },
        },
      },
    });

    expect(edges).toEqual([]);
  }
);

// A co-resident block edge on the dedicated client hydrates its `waitpoint`
// relation from the scalar `waitpointId`, returning the requested fields (no Prisma throw).
heteroRunOpsPostgresTest(
  "the dedicated waitpoint relation-select hydrates a co-resident waitpoint",
  async ({ prisma17 }) => {
    const dedicatedStore = makeDedicatedStore(prisma17);
    const { project, environment } = await seedEnvironment(prisma17, "dedicated", "gap4hyd_new");
    const blockedRunId = `run_${KSUID_27}`;
    const tokenId = `waitpoint_${KSUID_27}`;

    // Seed the token first, then the edge that points at it by scalar id.
    await prisma17.waitpoint.create({
      data: {
        id: tokenId,
        friendlyId: "wp_gap4hyd",
        type: "MANUAL",
        status: "PENDING",
        idempotencyKey: `idem_${tokenId}`,
        userProvidedIdempotencyKey: false,
        projectId: project.id,
        environmentId: environment.id,
      },
    });
    await prisma17.taskRunWaitpoint.create({
      data: { taskRunId: blockedRunId, waitpointId: tokenId, projectId: project.id },
    });

    const edges = await dedicatedStore.findManyTaskRunWaitpoints({
      where: { taskRunId: blockedRunId },
      select: {
        id: true,
        waitpoint: { select: { id: true, status: true } },
      },
    });

    expect(edges).toHaveLength(1);
    const [onlyEdge] = edges;
    expect(onlyEdge.waitpoint).toEqual({ id: tokenId, status: "PENDING" });
  }
);

// Control: the LEGACY full schema HAS the `waitpoint` relation, so the same select must NOT throw.
// This proves the throw is specific to the dedicated subset schema, not the query shape.
heteroRunOpsPostgresTest(
  "control: the SAME select does NOT throw on the LEGACY full schema",
  async ({ prisma14 }) => {
    const blockedRunId = `run_${KSUID_27}`;
    const legacyStore = makeLegacyStore(prisma14);

    const edges = await legacyStore.findManyTaskRunWaitpoints({
      where: { taskRunId: blockedRunId },
      select: {
        id: true,
        waitpoint: { select: { id: true, status: true } },
      },
    });

    expect(edges).toEqual([]);
  }
);

// The full router path (continueRunIfUnblocked fans to BOTH stores via
// RoutingRunStore.findManyTaskRunWaitpoints) no longer throws — the #new (dedicated) leg strips
// the relation and the router re-resolves `waitpoint` cross-DB. Empty result with no edges seeded.
heteroRunOpsPostgresTest(
  "RoutingRunStore.findManyTaskRunWaitpoints does NOT throw even though the #new leg is dedicated",
  async ({ prisma14, prisma17 }) => {
    const blockedRunId = `run_${KSUID_27}`;
    const split = makeSplitRouter(prisma14, prisma17);

    const edges = await split.router.findManyTaskRunWaitpoints({
      where: { taskRunId: blockedRunId },
      select: {
        id: true,
        waitpoint: { select: { id: true, status: true, type: true, completedAfter: true } },
      },
    });

    expect(edges).toEqual([]);
  }
);

// ===========================================================================================
// Cross-DB waitpoint hydration through the router.
// A ksuid run (on #new) blocked by a waitpoint that lives on the OTHER DB (#legacy). The block
// edge co-resides with the run on #new; the token is on #legacy. A single store hydrates the
// edge's `waitpoint` from its own client → null → the run hangs / loses output. The
// router must re-resolve the token across BOTH DBs.
// ===========================================================================================

// Co-resident control (the ksuid happy path): a ksuid run blocked by a ksuid waitpoint,
// both on #new, hydrates through the router with the real status/output.
heteroRunOpsPostgresTest(
  "cross-DB: a ksuid run blocked by a CO-RESIDENT ksuid waitpoint hydrates the real status via the router",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);
    const { project, environment } = await seedEnvironment(prisma17, "dedicated", "cores_new");
    const blockedRunId = `run_${KSUID_27}`;
    const tokenId = `waitpoint_${KSUID_27}`;
    const tokenOutput = '{"resumed":"co-resident"}';

    // Token and edge both live on #new.
    await prisma17.waitpoint.create({
      data: {
        id: tokenId,
        friendlyId: "wp_cores",
        type: "MANUAL",
        status: "COMPLETED",
        output: tokenOutput,
        idempotencyKey: `idem_${tokenId}`,
        userProvidedIdempotencyKey: false,
        projectId: project.id,
        environmentId: environment.id,
      },
    });
    await prisma17.taskRunWaitpoint.create({
      data: { taskRunId: blockedRunId, waitpointId: tokenId, projectId: project.id },
    });

    const edges = await router.findManyTaskRunWaitpoints({
      where: { taskRunId: blockedRunId },
      select: {
        id: true,
        waitpoint: { select: { id: true, status: true, output: true } },
      },
    });

    expect(edges).toHaveLength(1);
    const [onlyEdge] = edges;
    expect(onlyEdge.waitpoint).toEqual({
      id: tokenId,
      status: "COMPLETED",
      output: tokenOutput,
    });
  }
);

// The cross-DB topology. The block edge is on #new (co-resident with the
// ksuid run), the completing token is on #legacy. The router resolves the token across both DBs
// and returns its REAL status and OUTPUT (the wrong-result guard) — a single store would
// hydrate null here and strand the run.
heteroRunOpsPostgresTest(
  "cross-DB: a ksuid run completed by a waitpoint on the OTHER DB hydrates the real status + output via the router",
  async ({ prisma14, prisma17 }) => {
    const { router, newStore } = makeSplitRouter(prisma14, prisma17);
    const newEnv = await seedEnvironment(prisma17, "dedicated", "xdb_new");
    const legEnv = await seedEnvironment(prisma14, "legacy", "xdb_leg");
    const runId = `run_${KSUID_27}`;
    const waitpointId = `waitpoint_${CUID_25}`; // cuid → lives on #legacy

    // The completing token lives on #legacy (cuid MANUAL token blocking a ksuid run).
    await prisma14.waitpoint.create({
      data: {
        id: waitpointId,
        friendlyId: "wp_xdb",
        type: "MANUAL",
        status: "COMPLETED",
        output: '{"resumed":"cross-db"}',
        idempotencyKey: `idem_${waitpointId}`,
        userProvidedIdempotencyKey: false,
        projectId: legEnv.project.id,
        environmentId: legEnv.environment.id,
      },
    });
    // The block edge co-resides with the ksuid RUN on #new.
    await prisma17.taskRunWaitpoint.create({
      data: { taskRunId: runId, waitpointId, projectId: newEnv.project.id },
    });

    // Single-store guard: the #new store alone hydrates the edge's waitpoint to null (the token is
    // on #legacy) — proving the bug the router fixes. The length is asserted before indexing so
    // that a missing edge fails as an assertion instead of a TypeError on `undefined.waitpoint`.
    const singleStoreRows = await newStore.findManyTaskRunWaitpoints({
      where: { taskRunId: runId },
      select: { id: true, waitpoint: { select: { id: true, status: true, output: true } } },
    });
    expect(singleStoreRows).toHaveLength(1);
    expect(singleStoreRows[0].waitpoint).toBeNull();

    // Router path: resolves the cross-DB token and returns the real status + output.
    const rows = await router.findManyTaskRunWaitpoints({
      where: { taskRunId: runId },
      select: {
        id: true,
        waitpoint: { select: { id: true, status: true, output: true } },
      },
    });
    expect(rows).toHaveLength(1);
    expect(rows[0].waitpoint).toEqual({
      id: waitpointId,
      status: "COMPLETED",
      output: '{"resumed":"cross-db"}',
    });
  }
);

// Hard-error contract: a blocking edge whose waitpoint exists on NEITHER DB must throw, never
// resolve to null (which would let continueRunIfUnblocked treat it as not-COMPLETED forever, or
// silently complete). The router raises rather than strand the run on a phantom blocker.
heteroRunOpsPostgresTest(
  "cross-DB: a block edge whose waitpoint is on NEITHER DB throws (no silent null)",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);
    const newEnv = await seedEnvironment(prisma17, "dedicated", "phantom_new");
    const runId = `run_${KSUID_27}`;
    const waitpointId = `waitpoint_${"p".repeat(27)}`; // ksuid-shaped, but never created anywhere

    // Only the edge exists; the token it names was never written to either DB.
    await prisma17.taskRunWaitpoint.create({
      data: { taskRunId: runId, waitpointId, projectId: newEnv.project.id },
    });

    await expect(
      router.findManyTaskRunWaitpoints({
        where: { taskRunId: runId },
        select: { id: true, waitpoint: { select: { id: true, status: true } } },
      })
    ).rejects.toThrow(/not found on either run-ops DB/i);
  }
);

// ===========================================================================================
// checkpoint→snapshot residency FK break.
// If `createTaskRunCheckpoint` were hardcoded to `#new` while the referencing execution snapshot
// routes by run id, a cuid run's snapshot would land on `#legacy` carrying a `checkpointId` that
// only exists on `#new` → TaskRunExecutionSnapshot_checkpointId_fkey violated; the run cannot
// suspend/checkpoint. Live V2 path (checkpointSystem.ts).
// ===========================================================================================

// createTaskRunCheckpoint routes by the OWNING run id, so a cuid run's
// checkpoint co-resides on #legacy with its snapshot. The referencing snapshot insert (routed to
// #legacy by the cuid run id) satisfies the checkpointId FK on the same DB — no throw.
heteroRunOpsPostgresTest(
  "a cuid-run snapshot referencing its checkpoint satisfies the checkpointId FK on #legacy",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);

    // A cuid (LEGACY-resident) run — the in-flight cohort that keeps executing after split-on.
    const { organization, project, environment } = await seedEnvironment(
      prisma14,
      "legacy",
      "gap2_leg"
    );
    const legacyRunId = `run_${CUID_25}`;
    const runInput = buildCreateRunInput({
      runId: legacyRunId,
      friendlyId: "run_gap2_legacy",
      organizationId: organization.id,
      projectId: project.id,
      runtimeEnvironmentId: environment.id,
    });
    await router.createRun(runInput);

    // checkpointSystem path: the checkpoint is created with the OWNING (cuid) run id as the routing
    // key, which sends it to #legacy.
    const { id: checkpointId } = await router.createTaskRunCheckpoint(
      {
        data: {
          friendlyId: `checkpoint_${CUID_25}`,
          type: "DOCKER",
          location: "s3://bucket/cuid-run-checkpoint",
          projectId: project.id,
          runtimeEnvironmentId: environment.id,
        },
      },
      legacyRunId
    );

    // The snapshot is routed by the same cuid run id, so its checkpointId resolves on the DB the
    // checkpoint was written to and the insert goes through.
    const suspended = await router.createExecutionSnapshot({
      run: { id: legacyRunId, status: "EXECUTING", attemptNumber: 1 },
      snapshot: {
        executionStatus: "SUSPENDED",
        description: "Run suspended after checkpoint",
      },
      checkpointId,
      environmentId: environment.id,
      environmentType: "DEVELOPMENT",
      projectId: project.id,
      organizationId: organization.id,
    });

    expect(suspended.checkpointId).toBe(checkpointId);
  }
);

// Residency proof: the checkpoint, routed by its cuid owning run id,
// co-resides on #legacy (prisma14) and is ABSENT from #new (prisma17).
heteroRunOpsPostgresTest(
  "createTaskRunCheckpoint co-locates the checkpoint with its owning cuid run on #legacy",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);
    const { project, environment } = await seedEnvironment(prisma14, "legacy", "gap2res_leg");
    const legacyRunId = `run_${CUID_25}`;

    const { id: checkpointId } = await router.createTaskRunCheckpoint(
      {
        data: {
          friendlyId: `checkpoint_res_${CUID_25}`,
          type: "DOCKER",
          location: "s3://bucket/cp",
          projectId: project.id,
          runtimeEnvironmentId: environment.id,
        },
      },
      legacyRunId
    );
    const byCheckpointId = { where: { id: checkpointId } };

    // Present on #legacy (full schema / prisma14) — where the cuid run's snapshot lives.
    const legacyRow = await prisma14.taskRunCheckpoint.findUnique(byCheckpointId);
    expect(legacyRow).not.toBeNull();

    // Absent from #new (dedicated / prisma17).
    const newRow = await prisma17.taskRunCheckpoint.findUnique(byCheckpointId);
    expect(newRow).toBeNull();
  }
);

// Control: a ksuid run's checkpoint, routed by its owning run id, co-resides on #new.
heteroRunOpsPostgresTest(
  "control: a ksuid run's checkpoint co-resides on #new with its snapshot",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);
    const { organization, project, environment } = await seedEnvironment(
      prisma17,
      "dedicated",
      "gap2k_new"
    );
    const newRunId = `run_${KSUID_27}`;

    const runInput = buildCreateRunInput({
      runId: newRunId,
      friendlyId: "run_gap2k_new",
      organizationId: organization.id,
      projectId: project.id,
      runtimeEnvironmentId: environment.id,
    });
    await router.createRun(runInput);

    const { id: checkpointId } = await router.createTaskRunCheckpoint(
      {
        data: {
          friendlyId: `checkpoint_${KSUID_27}`,
          type: "DOCKER",
          location: "s3://bucket/ksuid-run-checkpoint",
          projectId: project.id,
          runtimeEnvironmentId: environment.id,
        },
      },
      newRunId
    );

    // The checkpoint row must be physically present on #new.
    const newRow = await prisma17.taskRunCheckpoint.findUnique({ where: { id: checkpointId } });
    expect(newRow).not.toBeNull();

    // The snapshot that references it is written through the router with the same run id.
    const suspended = await router.createExecutionSnapshot({
      run: { id: newRunId, status: "EXECUTING", attemptNumber: 1 },
      snapshot: { executionStatus: "SUSPENDED", description: "ksuid suspended" },
      checkpointId,
      environmentId: environment.id,
      environmentType: "DEVELOPMENT",
      projectId: project.id,
      organizationId: organization.id,
    });
    expect(suspended.checkpointId).toBe(checkpointId);
  }
);

// ===========================================================================================
// Snapshot reads must route by run id, not hardcode `#new`.
// If `findExecutionSnapshot` / `findManyExecutionSnapshots` were hardcoded to `#new`, then for a
// cuid run (snapshots on #legacy, because createExecutionSnapshot routes by run id) these reads
// would miss it → null / empty. The getExecutionSnapshotsSince warm-restart path would then throw
// ExecutionSnapshotNotFoundError.
// ===========================================================================================

// findExecutionSnapshot routes by the OWNING run id, so a cuid run's
// #legacy snapshot is found through the router (the warm-restart `getExecutionSnapshotsSince` step 1
// shape `{ id, runId }`).
heteroRunOpsPostgresTest(
  "findExecutionSnapshot FINDS a cuid run's #legacy snapshot via the router",
  async ({ prisma14, prisma17 }) => {
    const { router, legacyStore } = makeSplitRouter(prisma14, prisma17);

    const { organization, project, environment } = await seedEnvironment(
      prisma14,
      "legacy",
      "gap5_leg"
    );
    const legacyRunId = `run_${CUID_25}`;
    const runInput = buildCreateRunInput({
      runId: legacyRunId,
      friendlyId: "run_gap5_legacy",
      organizationId: organization.id,
      projectId: project.id,
      runtimeEnvironmentId: environment.id,
    });
    await router.createRun(runInput);

    // Written through the router with the cuid run id, so it is physically created on #legacy.
    const { id: snapshotId } = await router.createExecutionSnapshot({
      run: { id: legacyRunId, status: "EXECUTING", attemptNumber: 1 },
      snapshot: { executionStatus: "EXECUTING", description: "cuid run executing" },
      environmentId: environment.id,
      environmentType: "DEVELOPMENT",
      projectId: project.id,
      organizationId: organization.id,
    });

    // Sanity: a direct read on the legacy store finds the snapshot, so it really is on #legacy.
    const directLegacyRead = await legacyStore.findExecutionSnapshot({
      where: { id: snapshotId, runId: legacyRunId },
    });
    expect(directLegacyRead).not.toBeNull();

    // The router picks the store from `where.runId` → #legacy → the same snapshot comes back.
    const routedRead = await router.findExecutionSnapshot({
      where: { id: snapshotId, runId: legacyRunId },
    });
    expect(routedRead).not.toBeNull();
    expect(routedRead!.id).toBe(snapshotId);
  }
);

// findManyExecutionSnapshots routes by `where.runId`, so
// the warm-restart step-2 shape sees a cuid run's #legacy snapshots instead of an empty #new read.
heteroRunOpsPostgresTest(
  "findManyExecutionSnapshots SEES a cuid run's #legacy snapshots via the router",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);

    const { organization, project, environment } = await seedEnvironment(
      prisma14,
      "legacy",
      "gap5b_leg"
    );
    const legacyRunId = `run_${CUID_25}`;
    const runInput = buildCreateRunInput({
      runId: legacyRunId,
      friendlyId: "run_gap5b_legacy",
      organizationId: organization.id,
      projectId: project.id,
      runtimeEnvironmentId: environment.id,
    });
    await router.createRun(runInput);

    await router.createExecutionSnapshot({
      run: { id: legacyRunId, status: "EXECUTING", attemptNumber: 1 },
      snapshot: { executionStatus: "EXECUTING", description: "cuid run executing 2" },
      environmentId: environment.id,
      environmentType: "DEVELOPMENT",
      projectId: project.id,
      organizationId: organization.id,
    });

    // getExecutionSnapshotsSince step 2 shape: a findMany keyed by runId, which the router sends
    // to #legacy for a cuid run.
    const validSnapshots = await router.findManyExecutionSnapshots({
      where: { runId: legacyRunId, isValid: true },
    });
    expect(validSnapshots.length).toBeGreaterThanOrEqual(1);
  }
);

// A by-snapshot-id-only read (no runId — snapshot ids are cuids, not classifiable) fans out
// NEW→LEGACY, so a cuid run's #legacy snapshot is still found.
heteroRunOpsPostgresTest(
  "findExecutionSnapshot with no runId fans out NEW→LEGACY",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);

    const { organization, project, environment } = await seedEnvironment(
      prisma14,
      "legacy",
      "gap5d_leg"
    );
    const legacyRunId = `run_${CUID_25}`;
    const runInput = buildCreateRunInput({
      runId: legacyRunId,
      friendlyId: "run_gap5d_legacy",
      organizationId: organization.id,
      projectId: project.id,
      runtimeEnvironmentId: environment.id,
    });
    await router.createRun(runInput);

    const { id: snapshotId } = await router.createExecutionSnapshot({
      run: { id: legacyRunId, status: "EXECUTING", attemptNumber: 1 },
      snapshot: { executionStatus: "EXECUTING", description: "cuid run executing 3" },
      environmentId: environment.id,
      environmentType: "DEVELOPMENT",
      projectId: project.id,
      organizationId: organization.id,
    });

    // The where clause carries no runId, so the router has nothing to classify and must probe
    // both DBs to reach the #legacy snapshot.
    const routedRead = await router.findExecutionSnapshot({ where: { id: snapshotId } });
    expect(routedRead).not.toBeNull();
    expect(routedRead!.id).toBe(snapshotId);
  }
);

// Control: a KSUID run (on #new / dedicated) IS visible through the router — proving the read gap
// is residency-specific (only the cuid/#legacy cohort would be dropped), not a blanket failure.
heteroRunOpsPostgresTest(
  "control: a ksuid run's #new snapshot IS found through the router",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);

    const { organization, project, environment } = await seedEnvironment(
      prisma17,
      "dedicated",
      "gap5c_new"
    );
    const newRunId = `run_${KSUID_27}`;
    const runInput = buildCreateRunInput({
      runId: newRunId,
      friendlyId: "run_gap5c_new",
      organizationId: organization.id,
      projectId: project.id,
      runtimeEnvironmentId: environment.id,
    });
    await router.createRun(runInput);

    const { id: snapshotId } = await router.createExecutionSnapshot({
      run: { id: newRunId, status: "EXECUTING", attemptNumber: 1 },
      snapshot: { executionStatus: "EXECUTING", description: "ksuid run executing" },
      environmentId: environment.id,
      environmentType: "DEVELOPMENT",
      projectId: project.id,
      organizationId: organization.id,
    });

    const routedRead = await router.findExecutionSnapshot({
      where: { id: snapshotId, runId: newRunId },
    });
    expect(routedRead).not.toBeNull();
    expect(routedRead!.id).toBe(snapshotId);
  }
);

// ===========================================================================================
// blockWaitpoint raw-CTE wrong-DB no-block.
// `RunEngine.trigger` passes `tx: prisma` (control-plane) into
// `blockRunWithWaitpoint(Lockless)`, forcing the `if (tx)` raw
// `$queryRaw` CTE branch. The CTE inserts into
// `TaskRunWaitpoint`/`_WaitpointRunConnections` and joins `FROM "Waitpoint" w WHERE w.id IN (...)`
// on the `tx`'s DB. When the waitpoint lives on the OTHER physical DB, the join returns 0 rows →
// no edge written → isRunBlocked=false → the parent is silently never suspended.
//
// SCOPING: the behavior lives in WaitpointSystem.blockRunWithWaitpoint, which requires a full
// SystemResources context (RunQueue, EventBus, RunLocker/Redis, controlPlaneResolver, worker,
// pendingVersionRunIdLookup) plus `runLock.lock` and getLatestExecutionSnapshot.
// That is not constructible as a run-store unit test; a faithful end-to-end repro needs the full
// RunEngine.trigger wiring with two physical DBs. What IS tractable here is
// the CORE MECHANISM: the exact raw CTE, run against a `tx` whose DB does NOT hold the waitpoint,
// inserts ZERO block edges. We reproduce that precisely below; the engine-level "parent ends NOT
// suspended" assertion is left to a RunEngine integration test.
// ===========================================================================================

// Runs the block-edge CTE on `tx` and returns how many edges it inserted (the CTE's own
// `COUNT(*)`). The INSERT draws its rows from `"Waitpoint"` on the same connection, so a waitpoint
// that is not stored on `tx`'s DB contributes no row and therefore no edge. Both tests below call
// this one helper, so the mechanism test and its control are guaranteed to run the same SQL.
async function insertBlockEdgesViaCte(
  tx: PrismaClient,
  edge: { taskRunId: string; projectId: string; waitpointId: string }
): Promise<number> {
  const result = await tx.$queryRaw<Array<{ count: bigint }>>`
    WITH inserted AS (
      INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt")
      SELECT gen_random_uuid(), ${edge.taskRunId}, w.id, ${edge.projectId}, NOW(), NOW()
      FROM "Waitpoint" w
      WHERE w.id IN (${edge.waitpointId})
      ON CONFLICT DO NOTHING
      RETURNING "waitpointId"
    )
    SELECT COUNT(*) FROM inserted`;
  // COUNT(*) comes back as a bigint; the aggregate always yields exactly one row.
  return Number(result[0]?.count ?? 0);
}

// Proof of the mechanism: the verbatim block-edge CTE
// run on `tx = prisma14` (the control-plane / #legacy DB) inserts NOTHING when the waitpoint was
// created on prisma17 (#new), because `FROM "Waitpoint" w WHERE w.id IN (...)` finds 0 rows on
// prisma14. Asserts the wrong-DB behavior (0 edges) directly.
heteroRunOpsPostgresTest(
  "mechanism: the block-edge CTE writes ZERO edges when the waitpoint is on the other DB",
  async ({ prisma14, prisma17 }) => {
    // A ksuid parent run + its associated waitpoint live on #new (prisma17 / dedicated).
    const newEnv = await seedEnvironment(prisma17, "dedicated", "gap3_new");
    const parentRunId = `run_${KSUID_27}`;
    const waitpointId = `waitpoint_${KSUID_27}`;
    await prisma17.taskRun.create({
      data: {
        id: parentRunId,
        engine: "V2",
        status: "EXECUTING",
        friendlyId: "run_gap3_parent",
        runtimeEnvironmentId: newEnv.environment.id,
        environmentType: "DEVELOPMENT",
        organizationId: newEnv.organization.id,
        projectId: newEnv.project.id,
        taskIdentifier: "my-task",
        payload: "{}",
        payloadType: "application/json",
        traceContext: {},
        traceId: `trace_${parentRunId}`,
        spanId: `span_${parentRunId}`,
        queue: "task/my-task",
        isTest: false,
        taskEventStore: "taskEvent",
        depth: 0,
      },
    });
    await prisma17.waitpoint.create({
      data: {
        id: waitpointId,
        friendlyId: "wp_gap3",
        type: "RUN",
        status: "PENDING",
        idempotencyKey: `idem_${waitpointId}`,
        userProvidedIdempotencyKey: false,
        projectId: newEnv.project.id,
        environmentId: newEnv.environment.id,
      },
    });

    // The forced-tx branch: RunEngine.trigger passes the control-plane client (= the #legacy DB
    // here) as `tx`. Run the block-edge CTE on prisma14 (#legacy).
    const insertedOnLegacy = await insertBlockEdgesViaCte(prisma14, {
      taskRunId: parentRunId,
      projectId: newEnv.project.id,
      waitpointId,
    });
    expect(insertedOnLegacy).toBe(0); // the CTE itself reports that nothing was inserted

    // The waitpoint is on #new, so the join on #legacy matched nothing → NO edge on EITHER DB.
    const edgesOnLegacy = await prisma14.taskRunWaitpoint.count({
      where: { taskRunId: parentRunId },
    });
    const edgesOnNew = await prisma17.taskRunWaitpoint.count({
      where: { taskRunId: parentRunId },
    });
    expect(edgesOnLegacy).toBe(0); // the CTE inserted nothing (Waitpoint join empty)
    expect(edgesOnNew).toBe(0); // and it never touched the #new DB where the waitpoint lives

    // Therefore countPendingWaitpoints sees no PENDING blocker for the run → the engine would
    // treat isRunBlocked=false and NOT suspend the parent. (countPendingWaitpoints on #new finds
    // the PENDING waitpoint by id, but with NO edge bound to the run the engine never asks.)
  }
);

// Control: the SAME CTE on the DB that DOES hold the waitpoint writes the edge correctly —
// proving the failure is purely the wrong-DB join, not a malformed CTE.
heteroRunOpsPostgresTest(
  "control: the block-edge CTE writes the edge when the waitpoint is co-resident",
  async ({ prisma14 }) => {
    const env = await seedEnvironment(prisma14, "legacy", "gap3ctl_leg");
    const parentRunId = `run_${CUID_25}`;
    const waitpointId = `waitpoint_${CUID_25}`;
    await prisma14.taskRun.create({
      data: {
        id: parentRunId,
        engine: "V2",
        status: "EXECUTING",
        friendlyId: "run_gap3ctl_parent",
        runtimeEnvironmentId: env.environment.id,
        environmentType: "DEVELOPMENT",
        organizationId: env.organization.id,
        projectId: env.project.id,
        taskIdentifier: "my-task",
        payload: "{}",
        payloadType: "application/json",
        traceContext: {},
        traceId: `trace_${parentRunId}`,
        spanId: `span_${parentRunId}`,
        queue: "task/my-task",
        isTest: false,
        taskEventStore: "taskEvent",
        depth: 0,
      },
    });
    await prisma14.waitpoint.create({
      data: {
        id: waitpointId,
        friendlyId: "wp_gap3ctl",
        type: "RUN",
        status: "PENDING",
        idempotencyKey: `idem_${waitpointId}`,
        userProvidedIdempotencyKey: false,
        projectId: env.project.id,
        environmentId: env.environment.id,
      },
    });

    const inserted = await insertBlockEdgesViaCte(prisma14, {
      taskRunId: parentRunId,
      projectId: env.project.id,
      waitpointId,
    });
    expect(inserted).toBe(1); // the CTE reports the single edge it wrote

    const edges = await prisma14.taskRunWaitpoint.count({ where: { taskRunId: parentRunId } });
    expect(edges).toBe(1); // co-resident → the edge is written, the parent would suspend
  }
);

// ===========================================================================================
// Lazy RUN-waitpoint residency split.
// `getOrCreateRunWaitpoint` creates the lazy RUN waitpoint via `createWaitpoint`
// carrying `completedByTaskRunId: runId`. Production never mints ksuid waitpoint ids, so routing by
// the waitpoint's OWN id-shape would land it on #legacy while a ksuid run is on #new → run-completion
// hydrate (associatedWaitpoint by completedByTaskRunId on the run's DB) misses it → parent hangs.
// Fix: route the create by the OWNING run id (completedByTaskRunId) so it co-resides with the run.
// ===========================================================================================

// A ksuid run's lazy RUN-waitpoint with a CUID-shaped waitpoint id (production-like: ksuid
// waitpoint minting is off) co-resides on #new with the run, NOT on #legacy by its own id-shape.
heteroRunOpsPostgresTest(
  "a ksuid run's lazy RUN-waitpoint co-resides on #new (routed by completedByTaskRunId)",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);
    const env = await seedEnvironment(prisma17, "dedicated", "gap6_new");
    const runId = `run_${KSUID_27}`; // ksuid run → #new
    const waitpointId = `waitpoint_${CUID_25}`; // cuid waitpoint id → would route to #legacy by id-shape

    await router.createRun(
      buildCreateRunInput({
        runId,
        friendlyId: "run_gap6_new",
        organizationId: env.organization.id,
        projectId: env.project.id,
        runtimeEnvironmentId: env.environment.id,
      })
    );

    // The lazy `getOrCreateRunWaitpoint` create shape: a RUN waitpoint pointing back at its run.
    await router.createWaitpoint({
      data: {
        id: waitpointId,
        friendlyId: "wp_gap6",
        type: "RUN",
        status: "PENDING",
        idempotencyKey: `idem_${waitpointId}`,
        userProvidedIdempotencyKey: false,
        projectId: env.project.id,
        environmentId: env.environment.id,
        completedByTaskRunId: runId,
      },
    });

    // Co-resides with the ksuid run on #new (NOT stranded on #legacy by the cuid id-shape).
    const onNew = await prisma17.waitpoint.findUnique({ where: { id: waitpointId } });
    expect(onNew).not.toBeNull();
    const onLegacy = await prisma14.waitpoint.findUnique({ where: { id: waitpointId } });
    expect(onLegacy).toBeNull();

    // And the run-completion hydrate (associatedWaitpoint by completedByTaskRunId on the run's DB)
    // now resolves it — proving the parent would resume rather than hang. `toMatchObject` checks
    // the nested id without an `as any` cast, and reports a missing run or relation as an
    // assertion failure rather than a TypeError.
    const run = await router.findRun({ id: runId }, { include: { associatedWaitpoint: true } });
    expect(run).toMatchObject({ associatedWaitpoint: { id: waitpointId } });
  }
);

// Control: a cuid run's lazy RUN-waitpoint co-resides on #legacy with the run.
heteroRunOpsPostgresTest(
  "control: a cuid run's lazy RUN-waitpoint co-resides on #legacy",
  async ({ prisma14, prisma17 }) => {
    const { router } = makeSplitRouter(prisma14, prisma17);
    const { organization, project, environment } = await seedEnvironment(
      prisma14,
      "legacy",
      "gap6c_leg"
    );
    const legacyRunId = `run_${CUID_25}`; // cuid run → #legacy
    const tokenId = `waitpoint_${KSUID_27}`; // ksuid waitpoint id → would route to #new by id-shape

    const runInput = buildCreateRunInput({
      runId: legacyRunId,
      friendlyId: "run_gap6c_legacy",
      organizationId: organization.id,
      projectId: project.id,
      runtimeEnvironmentId: environment.id,
    });
    await router.createRun(runInput);

    // A RUN waitpoint that names the cuid run as its completer.
    await router.createWaitpoint({
      data: {
        id: tokenId,
        friendlyId: "wp_gap6c",
        type: "RUN",
        status: "PENDING",
        idempotencyKey: `idem_${tokenId}`,
        userProvidedIdempotencyKey: false,
        projectId: project.id,
        environmentId: environment.id,
        completedByTaskRunId: legacyRunId,
      },
    });

    // Present next to its run on #legacy, absent from #new despite the ksuid-shaped id.
    const legacyRow = await prisma14.waitpoint.findUnique({ where: { id: tokenId } });
    expect(legacyRow).not.toBeNull();
    const newRow = await prisma17.waitpoint.findUnique({ where: { id: tokenId } });
    expect(newRow).toBeNull();
  }
);

// ===========================================================================================
// Snapshot resume payload must not lose a cross-DB waitpoint's OUTPUT.
// `findLatestExecutionSnapshot` hydrates `completedWaitpoints` from the
// snapshot's own (run's) client. A ksuid run resumed by a waitpoint that completed on the OTHER DB
// (cuid token) would get the token hydrated to a stale/absent row → its OUTPUT silently vanishes from
// the resume payload (a wrong-result, not just a wrong dashboard). Fix: the router re-resolves the
// snapshot's completed waitpoints across BOTH DBs.
+  // ===========================================================================================
+
+  // A ksuid run's latest snapshot lists a completed waitpoint that lives on #legacy
+  // (cross-DB). The single #new store hydrates it null; the router recovers its real OUTPUT.
+  heteroRunOpsPostgresTest(
+    "findLatestExecutionSnapshot recovers a cross-DB completed waitpoint's OUTPUT via the router",
+    async ({ prisma14, prisma17 }) => {
+      const { router, newStore } = makeSplitRouter(prisma14, prisma17);
+      const newEnv = await seedEnvironment(prisma17, "dedicated", "cg1_new");
+      const legEnv = await seedEnvironment(prisma14, "legacy", "cg1_leg");
+      const runId = `run_${KSUID_27}`; // ksuid run → #new
+      const waitpointId = `waitpoint_${CUID_25}`; // cuid token → completed on #legacy
+
+      await router.createRun(
+        buildCreateRunInput({
+          runId,
+          friendlyId: "run_cg1_new",
+          organizationId: newEnv.organization.id,
+          projectId: newEnv.project.id,
+          runtimeEnvironmentId: newEnv.environment.id,
+        })
+      );
+
+      // The completing token lives on #legacy with its OUTPUT.
+      await prisma14.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_cg1",
+          type: "MANUAL",
+          status: "COMPLETED",
+          output: '{"resumed":"cross-db-output"}',
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: legEnv.project.id,
+          environmentId: legEnv.environment.id,
+        },
+      });
+
+      // The latest snapshot (on #new, co-resident with the ksuid run) lists the cross-DB token as a
+      // completed waitpoint via the CompletedWaitpoint join.
+      await router.createExecutionSnapshot({
+        run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+        snapshot: { executionStatus: "EXECUTING", description: "resumed by cross-db token" },
+        completedWaitpoints: [{ id: waitpointId, index: 0 }],
+        environmentId: newEnv.environment.id,
+        environmentType: "DEVELOPMENT",
+        projectId: newEnv.project.id,
+        organizationId: newEnv.organization.id,
+      });
+
+      // Single-store guard: the #new store alone hydrates the completed waitpoint to nothing (the
+      // token is on #legacy) — proving the bug the router fixes.
+      const singleStore = await newStore.findLatestExecutionSnapshot(runId);
+      const singleWp = singleStore?.completedWaitpoints?.find((w) => w.id === waitpointId);
+      expect(singleWp).toBeUndefined();
+
+      // Router path: re-resolves the cross-DB token and surfaces its real OUTPUT on the resume payload.
+      const viaRouter = await router.findLatestExecutionSnapshot(runId);
+      const recovered = viaRouter?.completedWaitpoints?.find((w) => w.id === waitpointId);
+      // The `!` assertions below are only reached after this toBeDefined() check.
+      expect(recovered).toBeDefined();
+      expect(recovered!.output).toBe('{"resumed":"cross-db-output"}');
+      expect(recovered!.status).toBe("COMPLETED");
+    }
+  );
+
+  // ===========================================================================================
+  // Block-edge WRITER must not require a LOCAL waitpoint row.
+  // The design routes the block edge to the RUN's DB and mints standalone tokens on LEGACY, so a
+  // ksuid run on #new can legitimately block on a cuid token resident on #legacy (the one tolerated
+  // cross-DB direction — the #new `TaskRunWaitpoint` is FK-free precisely for this). If
+  // `blockRunWithWaitpointEdges`'s dedicated branch sourced the edge rows from
+  // `FROM "Waitpoint" w WHERE w.id = ANY(...)`, then when the token is NOT on the run's own DB the
+  // SELECT yields 0 rows → 0 edges written on #new → the run blocks at EXECUTING_WITH_WAITPOINTS with
+  // no edge → the token's completion (even its own timeout) can never find/resume it → permanent
+  // hang. The fix sources the edge rows from the waitpointId array directly (`unnest(...)`), since
+  // the #new DB is FK-free on these columns.
+  // ===========================================================================================
+
+  // A ksuid run on #new blocking on a cuid token resident on
+  // #legacy writes the block edge (TaskRunWaitpoint + WaitpointRunConnection) on #new, NOT requiring
+  // the waitpoint row to be local. The #legacy DB holds NO edge for the ksuid run.
+  heteroRunOpsPostgresTest(
+    "a NEW run blocking on a LEGACY-resident token writes the edge on NEW (no local waitpoint required)",
+    async ({ prisma14, prisma17 }) => {
+      const { router, newStore } = makeSplitRouter(prisma14, prisma17);
+      const newEnv = await seedEnvironment(prisma17, "dedicated", "gap3b_new");
+      const legEnv = await seedEnvironment(prisma14, "legacy", "gap3b_leg");
+      const runId = `run_${KSUID_27}`; // ksuid run → #new
+      const waitpointId = `waitpoint_${CUID_25}`; // cuid standalone token → resides on #legacy
+
+      // The ksuid run lives on #new.
+      await prisma17.taskRun.create({
+        data: {
+          id: runId,
+          engine: "V2",
+          status: "EXECUTING",
+          friendlyId: "run_gap3b_new",
+          runtimeEnvironmentId: newEnv.environment.id,
+          environmentType: "DEVELOPMENT",
+          organizationId: newEnv.organization.id,
+          projectId: newEnv.project.id,
+          taskIdentifier: "my-task",
+          payload: "{}",
+          payloadType: "application/json",
+          traceContext: {},
+          traceId: `trace_${runId}`,
+          spanId: `span_${runId}`,
+          queue: "task/my-task",
+          isTest: false,
+          taskEventStore: "taskEvent",
+          depth: 0,
+        },
+      });
+      // The standalone token (minted on LEGACY) lives on #legacy ONLY — it is NOT on #new.
+      await prisma14.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_gap3b",
+          type: "MANUAL",
+          status: "PENDING",
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: legEnv.project.id,
+          environmentId: legEnv.environment.id,
+        },
+      });
+
+      // Route the block edge by the blocked RUN's id → #new. The token is NOT local to #new,
+      // but the #new TaskRunWaitpoint is FK-free, so the edge MUST still be written.
+      await router.blockRunWithWaitpointEdges({
+        runId,
+        waitpointIds: [waitpointId],
+        projectId: newEnv.project.id,
+      });
+
+      // The block edge is written on #new (co-resident with the run) — a local-waitpoint join writes 0.
+      const edgesOnNew = await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } });
+      expect(edgesOnNew).toBe(1);
+      // The historical WaitpointRunConnection is also written on #new.
+      const connectionsOnNew = await prisma17.waitpointRunConnection.count({
+        where: { taskRunId: runId },
+      });
+      expect(connectionsOnNew).toBe(1);
+      // The #legacy DB holds NO edge for the ksuid run (the safety invariant: no cross-ref on LEGACY).
+      const edgesOnLegacy = await prisma14.taskRunWaitpoint.count({ where: { taskRunId: runId } });
+      expect(edgesOnLegacy).toBe(0);
+
+      // And the edge is discoverable by the token's completion fan-out: a read keyed on the
+      // token's waitpointId via the router finds the #new-resident edge, so completing the LEGACY
+      // token would resume the NEW run rather than hang.
+      const byWaitpoint = await router.findManyTaskRunWaitpoints({
+        where: { waitpointId },
+        select: { id: true, taskRunId: true },
+      });
+      expect(byWaitpoint.map((e) => e.taskRunId)).toContain(runId);
+
+      // Single-store cross-check: the #new store ALSO writes the edge directly (proving the fix is in
+      // the store writer, not only the router routing).
+      const runId2 = `run_${"m".repeat(27)}`; // a second ksuid run on #new
+      await prisma17.taskRun.create({
+        data: {
+          id: runId2,
+          engine: "V2",
+          status: "EXECUTING",
+          friendlyId: "run_gap3b_new2",
+          runtimeEnvironmentId: newEnv.environment.id,
+          environmentType: "DEVELOPMENT",
+          organizationId: newEnv.organization.id,
+          projectId: newEnv.project.id,
+          taskIdentifier: "my-task",
+          payload: "{}",
+          payloadType: "application/json",
+          traceContext: {},
+          traceId: `trace_${runId2}`,
+          spanId: `span_${runId2}`,
+          queue: "task/my-task",
+          isTest: false,
+          taskEventStore: "taskEvent",
+          depth: 0,
+        },
+      });
+      // Called on newStore (not the router) so the routing layer is out of the picture.
+      await newStore.blockRunWithWaitpointEdges({
+        runId: runId2,
+        waitpointIds: [waitpointId],
+        projectId: newEnv.project.id,
+      });
+      const edgesOnNew2 = await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId2 } });
+      expect(edgesOnNew2).toBe(1);
+    }
+  );
+
+  // Co-resident control: a ksuid run blocking on a CO-RESIDENT ksuid token still writes the
+  // edge on #new (proving the fix didn't break the co-resident case the old join handled).
+  heteroRunOpsPostgresTest(
+    "control: a NEW run blocking on a CO-RESIDENT NEW token writes the edge on NEW",
+    async ({ prisma14, prisma17 }) => {
+      const { router } = makeSplitRouter(prisma14, prisma17);
+      const env = await seedEnvironment(prisma17, "dedicated", "gap3bco_new");
+      const runId = `run_${KSUID_27}`;
+      const waitpointId = `waitpoint_${KSUID_27}`; // ksuid token → co-resident on #new
+
+      // Run and token are both seeded directly on prisma17, so they share the #new DB.
+      await prisma17.taskRun.create({
+        data: {
+          id: runId,
+          engine: "V2",
+          status: "EXECUTING",
+          friendlyId: "run_gap3bco_new",
+          runtimeEnvironmentId: env.environment.id,
+          environmentType: "DEVELOPMENT",
+          organizationId: env.organization.id,
+          projectId: env.project.id,
+          taskIdentifier: "my-task",
+          payload: "{}",
+          payloadType: "application/json",
+          traceContext: {},
+          traceId: `trace_${runId}`,
+          spanId: `span_${runId}`,
+          queue: "task/my-task",
+          isTest: false,
+          taskEventStore: "taskEvent",
+          depth: 0,
+        },
+      });
+      await prisma17.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_gap3bco",
+          type: "MANUAL",
+          status: "PENDING",
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: env.project.id,
+          environmentId: env.environment.id,
+        },
+      });
+
+      await router.blockRunWithWaitpointEdges({
+        runId,
+        waitpointIds: [waitpointId],
+        projectId: env.project.id,
+      });
+
+      // Exactly one edge and one connection row for the run on #new.
+      const edgesOnNew = await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } });
+      expect(edgesOnNew).toBe(1);
+      const connectionsOnNew = await prisma17.waitpointRunConnection.count({
+        where: { taskRunId: runId },
+      });
+      expect(connectionsOnNew).toBe(1);
+    }
+  );
+
+  // Idempotency control: a duplicate block (ON CONFLICT DO NOTHING) must not create a second
+  // edge — the crash-recovery / retry contract (the engine re-writes the same edge on retry).
+  heteroRunOpsPostgresTest(
+    "a duplicate cross-DB block edge is idempotent (ON CONFLICT DO NOTHING)",
+    async ({ prisma14, prisma17 }) => {
+      const { router } = makeSplitRouter(prisma14, prisma17);
+      const newEnv = await seedEnvironment(prisma17, "dedicated", "gap3bidem_new");
+      const legEnv = await seedEnvironment(prisma14, "legacy", "gap3bidem_leg");
+      const runId = `run_${KSUID_27}`;
+      const waitpointId = `waitpoint_${CUID_25}`; // cuid token → #legacy
+
+      await prisma17.taskRun.create({
+        data: {
+          id: runId,
+          engine: "V2",
+          status: "EXECUTING",
+          friendlyId: "run_gap3bidem_new",
+          runtimeEnvironmentId: newEnv.environment.id,
+          environmentType: "DEVELOPMENT",
+          organizationId: newEnv.organization.id,
+          projectId: newEnv.project.id,
+          taskIdentifier: "my-task",
+          payload: "{}",
+          payloadType: "application/json",
+          traceContext: {},
+          traceId: `trace_${runId}`,
+          spanId: `span_${runId}`,
+          queue: "task/my-task",
+          isTest: false,
+          taskEventStore: "taskEvent",
+          depth: 0,
+        },
+      });
+      await prisma14.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_gap3bidem",
+          type: "MANUAL",
+          status: "PENDING",
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: legEnv.project.id,
+          environmentId: legEnv.environment.id,
+        },
+      });
+
+      await router.blockRunWithWaitpointEdges({
+        runId,
+        waitpointIds: [waitpointId],
+        projectId: newEnv.project.id,
+      });
+      // Replay the same write (a retry after a crash between the edge write and the snapshot flip).
+      await router.blockRunWithWaitpointEdges({
+        runId,
+        waitpointIds: [waitpointId],
+        projectId: newEnv.project.id,
+      });
+
+      // Both counts must still be 1 after the replay: the second call may not add rows.
+      const edgesOnNew = await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } });
+      expect(edgesOnNew).toBe(1);
+      const connectionsOnNew = await prisma17.waitpointRunConnection.count({
+        where: { taskRunId: runId },
+      });
+      expect(connectionsOnNew).toBe(1);
+    }
+  );
+
+  // Control: a co-resident completed waitpoint (token + run on #new) is unaffected — the router
+  // re-resolution is idempotent.
+  heteroRunOpsPostgresTest(
+    "control: a co-resident completed waitpoint's OUTPUT is preserved through the router",
+    async ({ prisma14, prisma17 }) => {
+      const { router } = makeSplitRouter(prisma14, prisma17);
+      const env = await seedEnvironment(prisma17, "dedicated", "cg1c_new");
+      const runId = `run_${KSUID_27}`;
+      const waitpointId = `waitpoint_${KSUID_27}`; // co-resident on #new
+
+      await router.createRun(
+        buildCreateRunInput({
+          runId,
+          friendlyId: "run_cg1c_new",
+          organizationId: env.organization.id,
+          projectId: env.project.id,
+          runtimeEnvironmentId: env.environment.id,
+        })
+      );
+      await prisma17.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_cg1c",
+          type: "MANUAL",
+          status: "COMPLETED",
+          output: '{"resumed":"co-resident-output"}',
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: env.project.id,
+          environmentId: env.environment.id,
+        },
+      });
+      await router.createExecutionSnapshot({
+        run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+        snapshot: { executionStatus: "EXECUTING", description: "resumed by co-resident token" },
+        completedWaitpoints: [{ id: waitpointId, index: 0 }],
+        environmentId: env.environment.id,
+        environmentType: "DEVELOPMENT",
+        projectId: env.project.id,
+        organizationId: env.organization.id,
+      });
+
+      const viaRouter = await router.findLatestExecutionSnapshot(runId);
+      const recovered = viaRouter?.completedWaitpoints?.find((w) => w.id === waitpointId);
+      expect(recovered).toBeDefined();
+      expect(recovered!.output).toBe('{"resumed":"co-resident-output"}');
+    }
+  );
+
+  // ===========================================================================================
+  // `getWaitpoint`'s WAITPOINT_DEDICATED relations
+  // ({ blockingTaskRuns, connectedRuns, completedExecutionSnapshots }) are hydrated by the dedicated
+  // store on its OWN client only (PostgresRunStore.findWaitpoint → WAITPOINT_DEDICATED
+  // hydrators, all keyed by `waitpointId` on the store's single client). But a
+  // waitpoint's blocking edge, run connection and completing snapshot all CO-LOCATE WITH THE RUN
+  // (blockRunWithWaitpointEdges routes by runId; the CompletedWaitpoint + WaitpointRunConnection
+  // join rows are written on the run's DB). A cuid token blocking
+  // a ksuid run therefore has every group-A TARGET on the OTHER DB → the single-client hydrator finds
+  // nothing → engine.getWaitpoint (include blockingTaskRuns→taskRun) silently returns an
+  // empty `blockingTaskRuns`. Fix: the router (RoutingRunStore.findWaitpoint/findManyWaitpoints) strips
+  // these relation keys from the per-leg query and re-resolves the targets across BOTH DBs, mirroring
+  // findManyTaskRunWaitpoints' edge fan-out.
+  // ===========================================================================================
+
+  // A cuid token on #legacy blocking a ksuid run on #new. The block edge + run connection live
+  // on #new (the run's DB). getWaitpoint's include{ blockingTaskRuns: { select: { taskRun } } } must
+  // surface the cross-DB blocked run. Single-store guard proves the #legacy hydrator alone misses it.
+  heteroRunOpsPostgresTest(
+    "findWaitpoint include blockingTaskRuns surfaces a cross-DB blocked run via the router",
+    async ({ prisma14, prisma17 }) => {
+      const { router, legacyStore } = makeSplitRouter(prisma14, prisma17);
+      const newEnv = await seedEnvironment(prisma17, "dedicated", "gap13bt_new");
+      const legEnv = await seedEnvironment(prisma14, "legacy", "gap13bt_leg");
+      const runId = `run_${KSUID_27}`; // ksuid run → #new
+      const waitpointId = `waitpoint_${CUID_25}`; // cuid token → #legacy
+
+      await router.createRun(
+        buildCreateRunInput({
+          runId,
+          friendlyId: "run_gap13bt_new",
+          organizationId: newEnv.organization.id,
+          projectId: newEnv.project.id,
+          runtimeEnvironmentId: newEnv.environment.id,
+        })
+      );
+      await prisma14.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_gap13bt",
+          type: "MANUAL",
+          status: "PENDING",
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: legEnv.project.id,
+          environmentId: legEnv.environment.id,
+        },
+      });
+
+      // Real production write path: the edge + WaitpointRunConnection land on the RUN's DB (#new).
+      await router.blockRunWithWaitpointEdges({
+        runId,
+        waitpointIds: [waitpointId],
+        projectId: newEnv.project.id,
+      });
+
+      // Residency sanity: the edge and connection are on #new only; the token is on #legacy only.
+      expect(await prisma17.taskRunWaitpoint.count({ where: { waitpointId } })).toBe(1);
+      expect(await prisma14.taskRunWaitpoint.count({ where: { waitpointId } })).toBe(0);
+      expect(await prisma17.waitpoint.count({ where: { id: waitpointId } })).toBe(0);
+      expect(await prisma14.waitpoint.count({ where: { id: waitpointId } })).toBe(1);
+
+      // Single-store guard: the #legacy store (where the token lives) hydrates blockingTaskRuns from its
+      // own client → the edge (on #new) is invisible → empty. This is the bug the router fixes.
+      // The result is widened to an untyped record so the relation keys can be read dynamically.
+      const single = (await legacyStore.findWaitpoint({
+        where: { id: waitpointId },
+        include: {
+          blockingTaskRuns: { select: { taskRun: { select: { id: true, friendlyId: true } } } },
+        },
+      })) as Record<string, any> | null;
+      expect(single?.blockingTaskRuns ?? []).toHaveLength(0);
+
+      // Router path: re-resolves blockingTaskRuns across BOTH DBs → the cross-DB blocked run surfaces.
+      const viaRouter = (await router.findWaitpoint({
+        where: { id: waitpointId },
+        include: {
+          blockingTaskRuns: { select: { taskRun: { select: { id: true, friendlyId: true } } } },
+        },
+      })) as Record<string, any> | null;
+      const blocking = viaRouter?.blockingTaskRuns ?? [];
+      expect(blocking).toHaveLength(1);
+      expect(blocking[0].taskRun?.id).toBe(runId);
+      expect(blocking[0].taskRun?.friendlyId).toBe("run_gap13bt_new");
+    }
+  );
+
+  // Sibling: connectedRuns. The WaitpointRunConnection join is co-resident with the run (#new),
+  // so a cuid token's connectedRuns must be re-resolved across BOTH DBs to surface the ksuid run.
+  heteroRunOpsPostgresTest(
+    "findWaitpoint include connectedRuns surfaces a cross-DB connected run via the router",
+    async ({ prisma14, prisma17 }) => {
+      const { router, legacyStore } = makeSplitRouter(prisma14, prisma17);
+      const newEnv = await seedEnvironment(prisma17, "dedicated", "gap13cr_new");
+      const legEnv = await seedEnvironment(prisma14, "legacy", "gap13cr_leg");
+      const runId = `run_${KSUID_27}`;
+      const waitpointId = `waitpoint_${CUID_25}`;
+
+      await router.createRun(
+        buildCreateRunInput({
+          runId,
+          friendlyId: "run_gap13cr_new",
+          organizationId: newEnv.organization.id,
+          projectId: newEnv.project.id,
+          runtimeEnvironmentId: newEnv.environment.id,
+        })
+      );
+      await prisma14.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_gap13cr",
+          type: "MANUAL",
+          status: "PENDING",
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: legEnv.project.id,
+          environmentId: legEnv.environment.id,
+        },
+      });
+      await router.blockRunWithWaitpointEdges({
+        runId,
+        waitpointIds: [waitpointId],
+        projectId: newEnv.project.id,
+      });
+
+      expect(await prisma17.waitpointRunConnection.count({ where: { waitpointId } })).toBe(1);
+
+      // Single-store guard: the token's own store sees no connection (it's on #new).
+      const single = (await legacyStore.findWaitpoint({
+        where: { id: waitpointId },
+        include: { connectedRuns: { select: { id: true, friendlyId: true } } },
+      })) as Record<string, any> | null;
+      expect(single?.connectedRuns ?? []).toHaveLength(0);
+
+      const viaRouter = (await router.findWaitpoint({
+        where: { id: waitpointId },
+        include: { connectedRuns: { select: { id: true, friendlyId: true } } },
+      })) as Record<string, any> | null;
+      const connected = viaRouter?.connectedRuns ?? [];
+      expect(connected).toHaveLength(1);
+      expect(connected[0].id).toBe(runId);
+      expect(connected[0].friendlyId).toBe("run_gap13cr_new");
+    }
+  );
+
+  // Sibling: completedExecutionSnapshots. The CompletedWaitpoint join is co-resident with the
+  // snapshot/run (#new), so a cuid token's completedExecutionSnapshots straddle to #new and must be
+  // re-resolved across BOTH DBs.
+  heteroRunOpsPostgresTest(
+    "findWaitpoint include completedExecutionSnapshots surfaces a cross-DB snapshot via the router",
+    async ({ prisma14, prisma17 }) => {
+      const { router, legacyStore } = makeSplitRouter(prisma14, prisma17);
+      const newEnv = await seedEnvironment(prisma17, "dedicated", "gap13cs_new");
+      const legEnv = await seedEnvironment(prisma14, "legacy", "gap13cs_leg");
+      const runId = `run_${KSUID_27}`;
+      const waitpointId = `waitpoint_${CUID_25}`;
+
+      await router.createRun(
+        buildCreateRunInput({
+          runId,
+          friendlyId: "run_gap13cs_new",
+          organizationId: newEnv.organization.id,
+          projectId: newEnv.project.id,
+          runtimeEnvironmentId: newEnv.environment.id,
+        })
+      );
+      await prisma14.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_gap13cs",
+          type: "MANUAL",
+          status: "COMPLETED",
+          output: '{"done":true}',
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: legEnv.project.id,
+          environmentId: legEnv.environment.id,
+        },
+      });
+      // The snapshot (on #new, co-resident with the ksuid run) records the cross-DB token as completed
+      // via the CompletedWaitpoint join.
+      const snapshot = await router.createExecutionSnapshot({
+        run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+        snapshot: { executionStatus: "EXECUTING", description: "completed by cross-db token" },
+        completedWaitpoints: [{ id: waitpointId, index: 0 }],
+        environmentId: newEnv.environment.id,
+        environmentType: "DEVELOPMENT",
+        projectId: newEnv.project.id,
+        organizationId: newEnv.organization.id,
+      });
+
+      expect(await prisma17.completedWaitpoint.count({ where: { waitpointId } })).toBe(1);
+
+      // Single-store guard: the token's own (#legacy) store sees no completing snapshot (join on #new).
+      // The result is widened to an untyped record so the relation keys can be read dynamically.
+      const single = (await legacyStore.findWaitpoint({
+        where: { id: waitpointId },
+        include: { completedExecutionSnapshots: { select: { id: true, description: true } } },
+      })) as Record<string, any> | null;
+      expect(single?.completedExecutionSnapshots ?? []).toHaveLength(0);
+
+      const viaRouter = (await router.findWaitpoint({
+        where: { id: waitpointId },
+        include: { completedExecutionSnapshots: { select: { id: true, description: true } } },
+      })) as Record<string, any> | null;
+      const snaps = viaRouter?.completedExecutionSnapshots ?? [];
+      expect(snaps).toHaveLength(1);
+      expect(snaps[0].id).toBe(snapshot.id);
+      expect(snaps[0].description).toBe("completed by cross-db token");
+    }
+  );
+
+  // Control: a fully co-resident waitpoint (token + run + edge all on #new) is unaffected — the
+  // router re-resolution is idempotent and does not double-count or drop the local group-A targets.
+  heteroRunOpsPostgresTest(
+    "control: a co-resident waitpoint's blockingTaskRuns/connectedRuns are preserved through the router",
+    async ({ prisma14, prisma17 }) => {
+      const { router } = makeSplitRouter(prisma14, prisma17);
+      const env = await seedEnvironment(prisma17, "dedicated", "gap13ctl_new");
+      const runId = `run_${KSUID_27}`;
+      const waitpointId = `waitpoint_${KSUID_27}`; // co-resident on #new
+
+      await router.createRun(
+        buildCreateRunInput({
+          runId,
+          friendlyId: "run_gap13ctl_new",
+          organizationId: env.organization.id,
+          projectId: env.project.id,
+          runtimeEnvironmentId: env.environment.id,
+        })
+      );
+      await prisma17.waitpoint.create({
+        data: {
+          id: waitpointId,
+          friendlyId: "wp_gap13ctl",
+          type: "MANUAL",
+          status: "PENDING",
+          idempotencyKey: `idem_${waitpointId}`,
+          userProvidedIdempotencyKey: false,
+          projectId: env.project.id,
+          environmentId: env.environment.id,
+        },
+      });
+      await router.blockRunWithWaitpointEdges({
+        runId,
+        waitpointIds: [waitpointId],
+        projectId: env.project.id,
+      });
+
+      const viaRouter = (await router.findWaitpoint({
+        where: { id: waitpointId },
+        include: {
+          blockingTaskRuns: { select: { taskRun: { select: { id: true } } } },
+          connectedRuns: { select: { id: true } },
+        },
+      })) as Record<string, any> | null;
+      expect(viaRouter?.blockingTaskRuns ?? []).toHaveLength(1);
+      expect(viaRouter!.blockingTaskRuns[0].taskRun?.id).toBe(runId);
+      expect((viaRouter?.connectedRuns ?? []).map((r: any) => r.id)).toEqual([runId]);
+    }
+  );
+});
diff --git a/internal-packages/run-store/src/PostgresRunStore.dedicatedSelect.test.ts b/internal-packages/run-store/src/PostgresRunStore.dedicatedSelect.test.ts
new file mode 100644
index 00000000000..0c5fcbf21dd
--- /dev/null
+++ b/internal-packages/run-store/src/PostgresRunStore.dedicatedSelect.test.ts
@@ -0,0 +1,408 @@
+import { heteroRunOpsPostgresTest } from "@internal/testcontainers";
+import type { PrismaClient } from "@trigger.dev/database";
+import type { RunOpsPrismaClient } from "@internal/run-ops-database";
+import { describe, expect } from "vitest";
+import { PostgresRunStore } from "./PostgresRunStore.js";
+import type {
+  CreateRunInput,
+  RunAssociatedWaitpointInput,
+  RunStoreSchemaVariant,
+} from "./types.js";
+
+// The store's structural client accepts either backing Prisma client; the two generated
+// clients are nominally distinct so we widen at the boundary, exactly as buildRunStore does.
+type AnyClient = PrismaClient | RunOpsPrismaClient;
+
+// On the dedicated subset schema there are no Organization/Project/RuntimeEnvironment models and
+// the run-ops rows carry FK-free scalar ids, so we mint synthetic ids; on legacy we seed the real
+// owning rows the FKs require.
+async function seedEnvironment(
+  prisma: AnyClient,
+  schemaVariant: RunStoreSchemaVariant,
+  suffix: string
+) {
+  // "dedicated": no rows are written; ids are derived purely from `suffix`.
+  if (schemaVariant === "dedicated") {
+    return {
+      organization: { id: `org_${suffix}` },
+      project: { id: `proj_${suffix}` },
+      environment: { id: `env_${suffix}` },
+    };
+  }
+
+  // Any other variant: create organization → project → environment in that order, each
+  // referencing the ids of the rows created before it.
+  const organization = await (prisma as PrismaClient).organization.create({
+    data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
+  });
+  const project = await (prisma as PrismaClient).project.create({
+    data: {
+      name: `Project ${suffix}`,
+      slug: `project-${suffix}`,
+      externalRef: `proj_${suffix}`,
+      organizationId: organization.id,
+    },
+  });
+  const environment = await (prisma as PrismaClient).runtimeEnvironment.create({
+    data: {
+      type: "DEVELOPMENT",
+      slug: "dev",
+      projectId: project.id,
+      organizationId: organization.id,
+      apiKey: `tr_dev_${suffix}`,
+      pkApiKey: `pk_dev_${suffix}`,
+      shortcode: `short_${suffix}`,
+    },
+  });
+  return { organization, project, environment };
+}
+
+// Builds the `associatedWaitpoint` input for buildCreateRunInput: a PENDING waitpoint of type RUN
+// whose idempotency key is derived from its id (`idem_<id>`).
+function buildAssociatedWaitpoint(params: {
+  id: string;
+  friendlyId: string;
+  projectId: string;
+  environmentId: string;
+}): RunAssociatedWaitpointInput {
+  return {
+    id: params.id,
+    friendlyId: params.friendlyId,
+    type: "RUN",
+    status: "PENDING",
+    idempotencyKey: `idem_${params.id}`,
+    userProvidedIdempotencyKey: false,
+    projectId: params.projectId,
+    environmentId: params.environmentId,
+  };
+}
+
+// Builds a CreateRunInput for a PENDING V2 run of "my-task" with a RUN_CREATED snapshot.
+// `associatedWaitpoint` is passed through unchanged and is undefined when omitted.
+function buildCreateRunInput(params: {
+  runId: string;
+  friendlyId: string;
+  organizationId: string;
+  projectId: string;
+  runtimeEnvironmentId: string;
+  associatedWaitpoint?: RunAssociatedWaitpointInput;
+}): CreateRunInput {
+  return {
+    data: {
+      id: params.runId,
+      engine: "V2",
+      status: "PENDING",
+      friendlyId: params.friendlyId,
+      runtimeEnvironmentId: params.runtimeEnvironmentId,
+      environmentType: "DEVELOPMENT",
+      organizationId: params.organizationId,
+      projectId: params.projectId,
+      taskIdentifier: "my-task",
+      payload: '{"hello":"world"}',
+      payloadType: "application/json",
+      context: { foo: "bar" },
+      traceContext: { trace: "ctx" },
+      traceId: `trace_${params.runId}`,
+      spanId: `span_${params.runId}`,
+      runTags: [],
+      queue: "task/my-task",
+      isTest: false,
+      taskEventStore: "taskEvent",
+      depth: 0,
+      // Fixed timestamp so the created run does not depend on the wall clock.
+      createdAt: new Date("2024-01-01T00:00:00.000Z"),
+    },
+    snapshot: {
+      engine: "V2",
+      executionStatus: "RUN_CREATED",
+      description: "Run was created",
+      runStatus: "PENDING",
+      environmentId: params.runtimeEnvironmentId,
+      environmentType: "DEVELOPMENT",
+      projectId: params.projectId,
+      organizationId: params.organizationId,
+    },
+    associatedWaitpoint: params.associatedWaitpoint,
+  };
+}
+
+// Inserts a PENDING waitpoint of type MANUAL directly through the given client (not the store).
+// NOTE(review): the cast assumes both client flavours expose the same `waitpoint.create` shape — confirm.
+async function seedPendingWaitpoint(
+  prisma: AnyClient,
+  params: { id: string; friendlyId: string; projectId: string; environmentId: string }
+) {
+  return (prisma as PrismaClient).waitpoint.create({
+    data: {
+      id: params.id,
+      friendlyId: params.friendlyId,
+      type: "MANUAL",
+      status: "PENDING",
+      idempotencyKey: `idem_${params.id}`,
+      userProvidedIdempotencyKey: false,
+      projectId: params.projectId,
+      environmentId: params.environmentId,
+    },
+  });
+}
+
+// Constructs a PostgresRunStore that uses the same client for both `prisma` and `readOnlyPrisma`.
+function makeStore(prisma: AnyClient, schemaVariant: RunStoreSchemaVariant) {
+  return new PostgresRunStore({
+    prisma: prisma as never,
+    readOnlyPrisma: prisma as never,
+    schemaVariant,
+  });
+}
+
+// --- group-A on TaskRun: associatedWaitpoint -------------------------------------------------
+
+// Runs the run-engine-shaped completeAttemptSuccess call: a caller select that includes the
+// group-A `associatedWaitpoint` relation key, exactly as runAttemptSystem does.
+async function runAssociatedWaitpointScenario(
+  prisma: AnyClient,
+  schemaVariant: RunStoreSchemaVariant,
+  suffix: string
+) {
+  const store = makeStore(prisma, schemaVariant);
+  const env = await seedEnvironment(prisma, schemaVariant, suffix);
+  const runId = `run_${suffix}`;
+  const waitpointId = `wp_assoc_${suffix}`;
+
+  // Create the run together with its associated RUN waitpoint in one createRun call.
+  await store.createRun(
+    buildCreateRunInput({
+      runId,
+      friendlyId: `run_friendly_${suffix}`,
+      organizationId: env.organization.id,
+      projectId: env.project.id,
+      runtimeEnvironmentId: env.environment.id,
+      associatedWaitpoint: buildAssociatedWaitpoint({
+        id: waitpointId,
+        friendlyId: `waitpoint_assoc_${suffix}`,
+        projectId: env.project.id,
+        environmentId: env.environment.id,
+      }),
+    })
+  );
+
+  // The actual run-engine call shape (runAttemptSystem.completeRunAttemptSuccess).
+  const completed = await store.completeAttemptSuccess(
+    runId,
+    {
+      completedAt: new Date(),
+      output: '{"done":true}',
+      outputType: "application/json",
+      usageDurationMs: 100,
+      costInCents: 1,
+      snapshot: {
+        executionStatus: "FINISHED",
+        description: "Attempt succeeded",
+        runStatus: "COMPLETED_SUCCESSFULLY",
+        attemptNumber: 1,
+        environmentId: env.environment.id,
+        environmentType: "DEVELOPMENT",
+        projectId: env.project.id,
+        organizationId: env.organization.id,
+      },
+    },
+    {
+      select: {
+        id: true,
+        status: true,
+        associatedWaitpoint: {
+          select: { id: true },
+        },
+      },
+    }
+  );
+
+  // findRun with the same group-A select (the read path).
+  const found = await store.findRun(
+    { id: runId },
+    { select: { id: true, associatedWaitpoint: { select: { id: true } } } }
+  );
+
+  // Returns the ids used plus both store results so each caller can assert on them.
+  return { runId, waitpointId, completed, found };
+}
+
+// --- group-A on TaskRunExecutionSnapshot: completedWaitpoints --------------------------------
+
+async function runCompletedWaitpointsScenario(
+  prisma: AnyClient,
+  schemaVariant: RunStoreSchemaVariant,
+  suffix: string
+) {
+  const store = makeStore(prisma, schemaVariant);
+  const env = await seedEnvironment(prisma, schemaVariant, suffix);
+  const runId = `run_${suffix}`;
+
+  await store.createRun(
+    buildCreateRunInput({
+      runId,
+      friendlyId: `run_friendly_${suffix}`,
+      organizationId: env.organization.id,
+      projectId: env.project.id,
+      runtimeEnvironmentId: env.environment.id,
+    })
+  );
+
+  const w1 = `wp_${suffix}_1`;
+  const w2 = `wp_${suffix}_2`;
+  await seedPendingWaitpoint(prisma, {
+    id: w1,
+    friendlyId: `waitpoint_${suffix}_1`,
+    projectId: env.project.id,
+    environmentId: env.environment.id,
+  });
+  await seedPendingWaitpoint(prisma, {
+    id: w2,
+    friendlyId: `waitpoint_${suffix}_2`,
+    projectId: env.project.id,
+    environmentId: env.environment.id,
+  });
+
+  // The completed waitpoints are passed with w2 (index 1) listed before w1 (index 0).
+  const snapshot = await store.createExecutionSnapshot({
+    run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+    snapshot: { executionStatus: "EXECUTING_WITH_WAITPOINTS", description: "with waitpoints" },
+    completedWaitpoints: [
+      { id: w2, index: 1 },
+      { id: w1, index: 0 },
+    ],
+    environmentId: env.environment.id,
+    environmentType: "DEVELOPMENT",
+    projectId: env.project.id,
+    organizationId: env.organization.id,
+  });
+
+  // The run-engine call shape for fetching a snapshot's completed waitpoints.
+  const fetched = await store.findExecutionSnapshot({
+    where: { id: snapshot.id },
+    include: { completedWaitpoints: true },
+  });
+
+  return { snapshotId: snapshot.id, w1, w2, fetched };
+}
+
+// --- connection back-ref on Waitpoint: blockingTaskRuns --------------------------------------
+
+async function runBlockingTaskRunsScenario(
+  prisma: AnyClient,
+  schemaVariant: RunStoreSchemaVariant,
+  suffix: string
+) {
+  const store = makeStore(prisma, schemaVariant);
+  const env = await seedEnvironment(prisma, schemaVariant, suffix);
+  const runId = `run_${suffix}`;
+  const waitpointId = `wp_block_${suffix}`;
+
+  await store.createRun(
+    buildCreateRunInput({
+      runId,
+      friendlyId: `run_friendly_${suffix}`,
+      organizationId: env.organization.id,
+      projectId: env.project.id,
+      runtimeEnvironmentId: env.environment.id,
+    })
+  );
+  await seedPendingWaitpoint(prisma, {
+    id: waitpointId,
+    friendlyId: `waitpoint_block_${suffix}`,
+    projectId: env.project.id,
+    environmentId: env.environment.id,
+  });
+
+  // Block the run on the waitpoint (writes the TaskRunWaitpoint block edge + connection).
+  await store.blockRunWithWaitpointEdges({
+    runId,
+    waitpointIds: [waitpointId],
+    projectId: env.project.id,
+  });
+
+  // The run-engine call shape (engine.getWaitpoint).
+  const waitpoint = await store.findWaitpoint({
+    where: { id: waitpointId },
+    include: {
+      blockingTaskRuns: {
+        select: {
+          taskRun: {
+            select: { id: true, friendlyId: true },
+          },
+        },
+      },
+    },
+  });
+
+  // `friendlyId` repeats the value given to createRun above so callers can compare against it.
+  return { runId, waitpointId, waitpoint, friendlyId: `run_friendly_${suffix}` };
+}
+
+describe("PostgresRunStore dedicated caller-select adapter (P2-store-bodies-2)", () => {
+  // associatedWaitpoint (TaskRun group-A) — RED on dedicated before this task, GREEN after.
+ heteroRunOpsPostgresTest( + "completeAttemptSuccess + findRun honor associatedWaitpoint on the DEDICATED client", + async ({ prisma17 }) => { + const r = await runAssociatedWaitpointScenario(prisma17, "dedicated", "ded_aw"); + + expect(r.completed.id).toBe(r.runId); + expect(r.completed.status).toBe("COMPLETED_SUCCESSFULLY"); + // honor the caller sub-select { id: true } only + expect(r.completed.associatedWaitpoint).not.toBeNull(); + expect(r.completed.associatedWaitpoint!.id).toBe(r.waitpointId); + expect(Object.keys(r.completed.associatedWaitpoint!)).toEqual(["id"]); + + expect(r.found).not.toBeNull(); + expect(r.found!.associatedWaitpoint).not.toBeNull(); + expect(r.found!.associatedWaitpoint!.id).toBe(r.waitpointId); + } + ); + + heteroRunOpsPostgresTest( + "completeAttemptSuccess + findRun honor associatedWaitpoint on the LEGACY client", + async ({ prisma14 }) => { + const r = await runAssociatedWaitpointScenario(prisma14, "legacy", "leg_aw"); + + expect(r.completed.id).toBe(r.runId); + expect(r.completed.associatedWaitpoint).not.toBeNull(); + expect(r.completed.associatedWaitpoint!.id).toBe(r.waitpointId); + expect(r.found!.associatedWaitpoint!.id).toBe(r.waitpointId); + } + ); + + // completedWaitpoints (snapshot group-A) — RED on dedicated before, GREEN after. 
+ heteroRunOpsPostgresTest( + "findExecutionSnapshot honors completedWaitpoints on the DEDICATED client", + async ({ prisma17 }) => { + const r = await runCompletedWaitpointsScenario(prisma17, "dedicated", "ded_cw"); + + expect(r.fetched).not.toBeNull(); + expect(r.fetched!.id).toBe(r.snapshotId); + expect(r.fetched!.completedWaitpoints.map((w) => w.id).sort()).toEqual([r.w1, r.w2].sort()); + } + ); + + heteroRunOpsPostgresTest( + "findExecutionSnapshot honors completedWaitpoints on the LEGACY client", + async ({ prisma14 }) => { + const r = await runCompletedWaitpointsScenario(prisma14, "legacy", "leg_cw"); + + expect(r.fetched).not.toBeNull(); + expect(r.fetched!.completedWaitpoints.map((w) => w.id).sort()).toEqual([r.w1, r.w2].sort()); + } + ); + + // blockingTaskRuns connection back-ref (Waitpoint group-A) — RED on dedicated before, GREEN after. + heteroRunOpsPostgresTest( + "findWaitpoint honors blockingTaskRuns back-ref on the DEDICATED client", + async ({ prisma17 }) => { + const r = await runBlockingTaskRunsScenario(prisma17, "dedicated", "ded_bk"); + + expect(r.waitpoint).not.toBeNull(); + expect(r.waitpoint!.id).toBe(r.waitpointId); + const blocking = r.waitpoint!.blockingTaskRuns; + expect(blocking.length).toBe(1); + expect(blocking[0].taskRun.id).toBe(r.runId); + expect(blocking[0].taskRun.friendlyId).toBe(r.friendlyId); + } + ); + + heteroRunOpsPostgresTest( + "findWaitpoint honors blockingTaskRuns back-ref on the LEGACY client", + async ({ prisma14 }) => { + const r = await runBlockingTaskRunsScenario(prisma14, "legacy", "leg_bk"); + + expect(r.waitpoint).not.toBeNull(); + const blocking = r.waitpoint!.blockingTaskRuns; + expect(blocking.length).toBe(1); + expect(blocking[0].taskRun.id).toBe(r.runId); + expect(blocking[0].taskRun.friendlyId).toBe(r.friendlyId); + } + ); +}); diff --git a/internal-packages/run-store/src/PostgresRunStore.dualSchema.test.ts b/internal-packages/run-store/src/PostgresRunStore.dualSchema.test.ts new file mode 100644 index 
00000000000..1770481300f --- /dev/null +++ b/internal-packages/run-store/src/PostgresRunStore.dualSchema.test.ts @@ -0,0 +1,378 @@ +import { heteroPostgresTest, heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { describe, expect } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import type { CreateRunInput, RunAssociatedWaitpointInput } from "./types.js"; + +// The store's structural client accepts either backing Prisma client; the two generated +// clients are nominally distinct so we widen at the boundary, exactly as buildRunStore does. +type AnyClient = PrismaClient | RunOpsPrismaClient; + +// The dedicated subset schema has no Organization/Project/RuntimeEnvironment models and the +// run-ops TaskRun/Waitpoint carry FK-free scalar ids, so on that variant we mint synthetic +// ids; on legacy we seed the real owning rows the FKs require. 
/**
 * Provide the organization / project / environment ids a scenario needs.
 *
 * On the `"dedicated"` variant no rows are written: synthetic ids derived from `slugSuffix`
 * are returned. On `"legacy"` the three owning rows are created through the supplied client
 * and the created rows are returned.
 *
 * @param prisma client to seed through (only touched on the legacy variant)
 * @param schemaVariant which backing schema `prisma` carries
 * @param slugSuffix suffix used to derive ids, slugs and api keys for this scenario
 * @returns objects exposing at least `organization.id`, `project.id`, `environment.id`
 */
async function seedEnvironment(
  prisma: AnyClient,
  schemaVariant: "legacy" | "dedicated",
  slugSuffix: string
) {
  if (schemaVariant === "dedicated") {
    return {
      organization: { id: `org_${slugSuffix}` },
      project: { id: `proj_${slugSuffix}` },
      environment: { id: `env_${slugSuffix}` },
    };
  }

  const organization = await (prisma as PrismaClient).organization.create({
    data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` },
  });
  const project = await (prisma as PrismaClient).project.create({
    data: {
      name: `Project ${slugSuffix}`,
      slug: `project-${slugSuffix}`,
      externalRef: `proj_${slugSuffix}`,
      organizationId: organization.id,
    },
  });
  const environment = await (prisma as PrismaClient).runtimeEnvironment.create({
    data: {
      type: "DEVELOPMENT",
      slug: "dev",
      projectId: project.id,
      organizationId: organization.id,
      apiKey: `tr_dev_${slugSuffix}`,
      pkApiKey: `pk_dev_${slugSuffix}`,
      shortcode: `short_${slugSuffix}`,
    },
  });
  return { organization, project, environment };
}

/**
 * Build the associated-waitpoint input for a run: a PENDING waitpoint of type RUN whose
 * idempotency key is derived from its id.
 */
function buildAssociatedWaitpoint(params: {
  id: string;
  friendlyId: string;
  projectId: string;
  environmentId: string;
}): RunAssociatedWaitpointInput {
  return {
    id: params.id,
    friendlyId: params.friendlyId,
    type: "RUN",
    status: "PENDING",
    idempotencyKey: `idem_${params.id}`,
    userProvidedIdempotencyKey: false,
    projectId: params.projectId,
    environmentId: params.environmentId,
  };
}

/**
 * Build a `CreateRunInput` for a PENDING V2 run with fixed payload/trace fixtures, an initial
 * RUN_CREATED snapshot, and the optional associated waitpoint passed straight through.
 */
function buildCreateRunInput(params: {
  runId: string;
  friendlyId: string;
  taskIdentifier: string;
  organizationId: string;
  projectId: string;
  runtimeEnvironmentId: string;
  associatedWaitpoint?: RunAssociatedWaitpointInput;
}): CreateRunInput {
  return {
    data: {
      id: params.runId,
      engine: "V2",
      status: "PENDING",
      friendlyId: params.friendlyId,
      runtimeEnvironmentId: params.runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      organizationId: params.organizationId,
      projectId: params.projectId,
      taskIdentifier: params.taskIdentifier,
      payload: '{"hello":"world"}',
      payloadType: "application/json",
      context: { foo: "bar" },
      traceContext: { trace: "ctx" },
      traceId: "trace_1",
      spanId: "span_1",
      runTags: ["alpha", "beta"],
      queue: "task/my-task",
      isTest: false,
      taskEventStore: "taskEvent",
      depth: 0,
      createdAt: new Date("2024-01-01T00:00:00.000Z"),
    },
    snapshot: {
      engine: "V2",
      executionStatus: "RUN_CREATED",
      description: "Run was created",
      runStatus: "PENDING",
      environmentId: params.runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      projectId: params.projectId,
      organizationId: params.organizationId,
    },
    associatedWaitpoint: params.associatedWaitpoint,
  };
}

/**
 * Insert a PENDING waitpoint of type MANUAL directly through the client (bypassing the store)
 * and return the created row.
 */
async function seedPendingWaitpoint(
  prisma: AnyClient,
  params: { id: string; friendlyId: string; projectId: string; environmentId: string }
) {
  return (prisma as PrismaClient).waitpoint.create({
    data: {
      id: params.id,
      friendlyId: params.friendlyId,
      type: "MANUAL",
      status: "PENDING",
      idempotencyKey: `idem_${params.id}`,
      userProvidedIdempotencyKey: false,
      projectId: params.projectId,
      environmentId: params.environmentId,
    },
  });
}

// Strip the prisma-managed / connection-volatile columns so two waitpoint rows born on
// different physical DBs (and via different code paths) compare field-for-field.
// Works on a shallow copy, so the caller's row is left untouched; `null` passes through.
function normalizeWaitpoint(row: Record<string, unknown> | null) {
  if (!row) {
    return row;
  }
  const comparable: Record<string, unknown> = { ...row };
  delete comparable.createdAt;
  delete comparable.updatedAt;
  return comparable;
}

// Runs the same createRun + snapshot scenario against any (client, schemaVariant) pair and
// returns the observable shapes the interface contract promises, so the legacy and dedicated
// runs can be asserted equivalent.
+async function runScenario( + prisma: AnyClient, + schemaVariant: "legacy" | "dedicated", + suffix: string +) { + const store = new PostgresRunStore({ + prisma: prisma as never, + readOnlyPrisma: prisma as never, + schemaVariant, + }); + const env = await seedEnvironment(prisma, schemaVariant, suffix); + + const runId = `run_dual_${suffix}`; + const created = await store.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_friendly_${suffix}`, + taskIdentifier: "my-task", + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + associatedWaitpoint: buildAssociatedWaitpoint({ + id: `wp_assoc_${suffix}`, + friendlyId: `waitpoint_assoc_${suffix}`, + projectId: env.project.id, + environmentId: env.environment.id, + }), + }) + ); + + // The run read that pulls the associatedWaitpoint back (rewriteDebouncedRun shape). + const rewritten = await store.rewriteDebouncedRun(runId, { + payload: '{"hello":"again"}', + payloadType: "application/json", + }); + + // Two pending waitpoints to complete via a snapshot. 
+ const w1 = `wp_${suffix}_1`; + const w2 = `wp_${suffix}_2`; + await seedPendingWaitpoint(prisma, { + id: w1, + friendlyId: `waitpoint_${suffix}_1`, + projectId: env.project.id, + environmentId: env.environment.id, + }); + await seedPendingWaitpoint(prisma, { + id: w2, + friendlyId: `waitpoint_${suffix}_2`, + projectId: env.project.id, + environmentId: env.environment.id, + }); + + const ids = { + environmentId: env.environment.id, + environmentType: "DEVELOPMENT" as const, + projectId: env.project.id, + organizationId: env.organization.id, + }; + + const snapshot = await store.createExecutionSnapshot({ + run: { id: runId, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING_WITH_WAITPOINTS", description: "with waitpoints" }, + completedWaitpoints: [ + { id: w2, index: 1 }, + { id: w1, index: 0 }, + ], + ...ids, + }); + + const latest = await store.findLatestExecutionSnapshot(runId); + const joinIds = await store.findSnapshotCompletedWaitpointIds(snapshot.id); + + return { + runId, + created, + rewritten, + snapshot, + latest, + joinIds, + waitpointId: `wp_assoc_${suffix}`, + w1, + w2, + }; +} + +function assertScenario(r: Awaited>) { + // createRun returns the run with its associatedWaitpoint hydrated. + expect(r.created.id).toBe(r.runId); + expect(r.created.associatedWaitpoint).not.toBeNull(); + expect(r.created.associatedWaitpoint!.id).toBe(r.waitpointId); + expect(r.created.associatedWaitpoint!.type).toBe("RUN"); + expect(r.created.associatedWaitpoint!.completedByTaskRunId).toBe(r.runId); + + // The run read hydrates the same associatedWaitpoint. + expect(r.rewritten.associatedWaitpoint).not.toBeNull(); + expect(r.rewritten.associatedWaitpoint!.id).toBe(r.waitpointId); + + // The snapshot create derives completedWaitpointOrder by index (w1 index 0, w2 index 1). + expect(r.snapshot.completedWaitpointOrder).toEqual([r.w1, r.w2]); + + // The join read returns both completed waitpoints (set-equal). 
+ expect([...r.joinIds].sort()).toEqual([r.w1, r.w2].sort()); + + // findLatest hydrates completedWaitpoints (set-equal) and the (null) checkpoint. + expect(r.latest).not.toBeNull(); + expect(r.latest!.id).toBe(r.snapshot.id); + expect(r.latest!.checkpoint).toBeNull(); + expect(r.latest!.completedWaitpoints.map((w) => w.id).sort()).toEqual([r.w1, r.w2].sort()); +} + +describe("PostgresRunStore dual-schema (P2-store-bodies)", () => { + // Legacy variant over the full @trigger.dev/database schema — existing behavior must hold. + heteroPostgresTest( + "createRun + snapshot relation ops work on the LEGACY client (schemaVariant=legacy)", + async ({ prisma14 }) => { + const r = await runScenario(prisma14, "legacy", "leg"); + assertScenario(r); + } + ); + + // Dedicated variant over the @internal/run-ops-database SUBSET schema — RED before this + // task (Prisma validation error on associatedWaitpoint/completedWaitpoints), GREEN after. + heteroRunOpsPostgresTest( + "createRun + snapshot relation ops work on the DEDICATED RunOpsPrismaClient (schemaVariant=dedicated)", + async ({ prisma17 }) => { + const r = await runScenario(prisma17, "dedicated", "ded"); + assertScenario(r); + } + ); + + // Cross-variant equivalence: the observable return contract is the same regardless of which + // backing schema produced it. + heteroRunOpsPostgresTest( + "legacy and dedicated produce equivalent return shapes", + async ({ prisma14, prisma17 }) => { + const legacy = await runScenario(prisma14, "legacy", "xleg"); + const dedicated = await runScenario(prisma17, "dedicated", "xded"); + + // Associated waitpoint: normalize per-DB volatile columns, the rest must match. 
+ const legW = normalizeWaitpoint( + legacy.created.associatedWaitpoint as unknown as Record + ); + const dedW = normalizeWaitpoint( + dedicated.created.associatedWaitpoint as unknown as Record + ); + // Both carry the same friendlyId/type/status/completedByTaskRunId-shaped contract; + // ids differ by suffix so compare the structural keys that must agree. + expect(legW!.type).toEqual(dedW!.type); + expect(legW!.status).toEqual(dedW!.status); + expect((legW as Record).outputType).toEqual( + (dedW as Record).outputType + ); + + // completedWaitpointOrder derivation is variant-independent. + expect(legacy.snapshot.completedWaitpointOrder.length).toEqual( + dedicated.snapshot.completedWaitpointOrder.length + ); + expect(legacy.latest!.completedWaitpoints.length).toEqual( + dedicated.latest!.completedWaitpoints.length + ); + } + ); + + // expireRunsBatch dedicated-fixture: RED before fix (Prisma.join mis-binds on dedicated client + // → 42601-class error), GREEN after (= ANY(ids::text[]) path). + heteroRunOpsPostgresTest( + "expireRunsBatch sets EXPIRED on the DEDICATED RunOpsPrismaClient (schemaVariant=dedicated)", + async ({ prisma17 }) => { + const store = new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); + + // Dedicated subset has no Organization/Project/RuntimeEnvironment tables — use synthetic ids. 
+ const orgId = "org_expbatch_ded"; + const projId = "proj_expbatch_ded"; + const envId = "env_expbatch_ded"; + + const runId1 = "run_expbatch_ded_1"; + const runId2 = "run_expbatch_ded_2"; + + for (const id of [runId1, runId2]) { + await prisma17.taskRun.create({ + data: { + id, + engine: "V2", + status: "PENDING", + friendlyId: `friendly_${id}`, + runtimeEnvironmentId: envId, + environmentType: "DEVELOPMENT", + organizationId: orgId, + projectId: projId, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: `trace_${id}`, + spanId: `span_${id}`, + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + }, + }); + } + + const now = new Date("2026-06-01T12:00:00.000Z"); + const error = { + type: "STRING_ERROR" as const, + raw: "Run expired because the TTL was reached", + }; + + const count = await store.expireRunsBatch([runId1, runId2], { error, now }); + + expect(count).toBe(2); + + for (const id of [runId1, runId2]) { + const row = await prisma17.taskRun.findUniqueOrThrow({ + where: { id }, + select: { status: true, completedAt: true, expiredAt: true, updatedAt: true }, + }); + expect(row.status).toBe("EXPIRED"); + expect(row.completedAt).toEqual(now); + expect(row.expiredAt).toEqual(now); + expect(row.updatedAt).toEqual(now); + } + } + ); +}); diff --git a/internal-packages/run-store/src/PostgresRunStore.test.ts b/internal-packages/run-store/src/PostgresRunStore.test.ts index 49fcbfe4503..82ae67805c4 100644 --- a/internal-packages/run-store/src/PostgresRunStore.test.ts +++ b/internal-packages/run-store/src/PostgresRunStore.test.ts @@ -778,7 +778,7 @@ describe("PostgresRunStore", () => { ); postgresTest( - "lockRunToWorker sets status to DEQUEUED with lock columns, includes runtimeEnvironment, and creates one PENDING_EXECUTING snapshot", + "lockRunToWorker sets status to DEQUEUED with lock columns and creates one PENDING_EXECUTING snapshot (no runtimeEnvironment relation)", 
async ({ prisma }) => { const { organization, project, environment } = await seedEnvironment(prisma); @@ -877,8 +877,9 @@ describe("PostgresRunStore", () => { expect(locked.lockedById).toBe(workerTask.id); expect(locked.lockedToVersionId).toBe(backgroundWorker.id); expect(locked.lockedQueueId).toBe(queue.id); - expect(locked.runtimeEnvironment).toBeDefined(); - expect(locked.runtimeEnvironment.id).toBe(environment.id); + // The result is base-TaskRun scalars only — no control-plane relation. + expect(locked.runtimeEnvironmentId).toBe(environment.id); + expect((locked as Record).runtimeEnvironment).toBeUndefined(); const snap = await prisma.taskRunExecutionSnapshot.findUnique({ where: { id: snapshotId } }); expect(snap).not.toBeNull(); @@ -1740,8 +1741,8 @@ describe("PostgresRunStore — read", () => { async ({ prisma }) => { const { organization, project, environment } = await seedEnvironment(prisma); - // Use a NoopRunStore-style read replica that must NOT be hit: pass the writer - // (prisma) explicitly so reads go through it for read-after-write consistency. + // The read replica must NOT be hit here: pass the writer (prisma) explicitly + // so reads go through it for read-after-write consistency. 
const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); const runId = "run_find_read_after_write_1"; diff --git a/internal-packages/run-store/src/PostgresRunStore.ts b/internal-packages/run-store/src/PostgresRunStore.ts index 79c85099d50..843f4c3b32a 100644 --- a/internal-packages/run-store/src/PostgresRunStore.ts +++ b/internal-packages/run-store/src/PostgresRunStore.ts @@ -1,18 +1,22 @@ import { Prisma } from "@trigger.dev/database"; import type { + BatchTaskRun, + BatchTaskRunItemStatus, PrismaClient, PrismaClientOrTransaction, - PrismaReplicaClient, TaskRun, TaskRunStatus, } from "@trigger.dev/database"; import type { ClearIdempotencyKeyInput, CompletionSnapshotInput, + CreateBatchTaskRunData, CreateCancelledRunInput, + CreateExecutionSnapshotInput, CreateFailedRunInput, CreateRunInput, ExpireSnapshotInput, + ForWaitpointCompletionContext, LockRunData, ReadClient, RescheduleSnapshotInput, @@ -22,26 +26,498 @@ import type { } from "./types.js"; import type { TaskRunError } from "@trigger.dev/core/v3/schemas"; +// Loose delegate method shape: each generated client types delegate methods as +// `(args: PackageLocalArgs) => PrismaPromise<…>` against its own nominal +// `Prisma` namespace, so the full generics are mutually non-assignable. `any` args +// are the structural common denominator both clients' delegates satisfy. +// Do NOT tighten to concrete `Prisma.*Args`: it re-breaks dual-client support. The +// trade is no compile-time check of the store's call args — the testcontainers +// integration suite is the compensating control. +type RunOpsDelegate = { + [M in Methods]: (args: any) => Promise; +}; + +/** + * Structural client interface covering exactly the delegates + raw methods the + * store uses. Both `@trigger.dev/database`'s `PrismaClient` and + * `@internal/run-ops-database`'s `RunOpsPrismaClient` are assignable to it, so + * either can back the store; the cast from a concrete client happens once at the + * wiring boundary. 
+ */ +export interface RunOpsCapableClient { + taskRun: RunOpsDelegate< + "create" | "findFirst" | "findFirstOrThrow" | "findMany" | "update" | "updateMany" + >; + taskRunAttempt: RunOpsDelegate<"create" | "findFirst" | "findMany" | "update">; + taskRunExecutionSnapshot: RunOpsDelegate<"create" | "findFirst" | "findMany">; + taskRunWaitpoint: RunOpsDelegate<"deleteMany" | "findMany">; + taskRunCheckpoint: RunOpsDelegate<"create">; + checkpoint: RunOpsDelegate<"create" | "findFirst">; + checkpointRestoreEvent: RunOpsDelegate<"create" | "findFirst">; + taskRunDependency: RunOpsDelegate<"create" | "findFirst" | "findMany">; + waitpoint: RunOpsDelegate< + "create" | "findFirst" | "findMany" | "update" | "updateMany" | "upsert" + >; + // Dedicated-only join model (replaces the legacy implicit `_completedWaitpoints` M2M); optional + // so the legacy client (which lacks it) stays assignable. Touched only on the dedicated branch. + completedWaitpoint?: RunOpsDelegate<"create" | "createMany" | "findMany">; + // Dedicated-only explicit join (replaces the legacy implicit `_WaitpointRunConnections` M2M); + // optional so the legacy client stays assignable. Touched only on the dedicated branch. + waitpointRunConnection?: RunOpsDelegate<"createMany" | "findMany">; + batchTaskRun: RunOpsDelegate<"create" | "findFirst" | "update" | "updateMany">; + batchTaskRunItem: RunOpsDelegate<"create" | "count" | "updateMany">; + $queryRaw: PrismaClient["$queryRaw"]; + $executeRaw: PrismaClient["$executeRaw"]; +} + +/** + * A writer client (never a read replica) that can open an interactive transaction on its OWN + * connection. Both `PrismaClient` and `RunOpsPrismaClient` satisfy it; `PrismaReplicaClient` (which + * omits `$transaction`) does NOT — only the store's `prisma` (writer) handle opens one, never + * `readOnlyPrisma`. The tx callback's client is threaded into the store's inner writes as the + * per-call `tx` so they share one transaction (see `runInTransaction`). 
+ */ +export interface RunOpsTransactionalClient extends RunOpsCapableClient { + $transaction: (fn: (tx: RunOpsCapableClient) => Promise) => Promise; +} + +/** + * Which backing schema the supplied clients carry. `"legacy"` = the full + * `@trigger.dev/database` schema (implicit M2M join tables + `@relation`s); + * `"dedicated"` = the `@internal/run-ops-database` SUBSET (FK-free scalars + explicit join + * models). The relation-shaped ops branch on this; everything else is schema-identical. + */ +export type RunStoreSchemaVariant = "legacy" | "dedicated"; + export type PostgresRunStoreOptions = { - prisma: PrismaClient; - readOnlyPrisma: PrismaReplicaClient; + prisma: RunOpsCapableClient; + readOnlyPrisma: RunOpsCapableClient; + /** Defaults to `"legacy"` so existing callers/tests are unaffected. */ + schemaVariant?: RunStoreSchemaVariant; }; +// A caller sub-select for a relation: `{ select?, include? }` or `true` for a bare `key: true`. +type SubProjection = { select?: any; include?: any } | true | undefined; + +// Hydrates one dedicated-schema relation for a single parent row, honoring the caller's sub-projection. +type DedicatedRelationHydrator = ( + client: RunOpsCapableClient, + parent: Record, + projection: { select?: any; include?: any } | undefined, + store: PostgresRunStore +) => Promise; + +// The dedicated-schema relation keys (with hydrators) for a single Prisma model. +type DedicatedRelationSpec = Record; + +// Normalize a caller sub-projection to `{ select | include }` (or undefined for `true`). +function projectionOf(sub: SubProjection): { select?: any; include?: any } | undefined { + if (sub === true || sub === undefined) { + return undefined; + } + return sub; +} + +// Apply a caller sub-projection to a hydrated row (or array) so only requested fields remain. 
/**
 * Narrow one hydrated row to the fields enabled in the caller's `select`.
 *
 * Only `select` narrows; with `include`, no projection, or a null row the input is returned
 * as-is. Keys whose select value is falsy are skipped. Operates on a single row — callers
 * holding an array map over it.
 *
 * @param row the hydrated row, or null when nothing was found
 * @param projection the caller's `{ select | include }` for this relation, if any
 * @returns the same row, or a new object holding only the selected keys
 */
function applyProjection<T extends Record<string, unknown> | null>(
  row: T,
  projection: { select?: any; include?: any } | undefined
): T {
  if (!row || !projection?.select) {
    return row;
  }
  const select = projection.select as Record<string, unknown>;
  const out: Record<string, unknown> = {};
  for (const key of Object.keys(select)) {
    if (select[key]) {
      out[key] = (row as Record<string, unknown>)[key];
    }
  }
  return out as T;
}

/**
 * Split a caller `{ select | include }` into the args to send Prisma and the relations the
 * store must hydrate itself.
 *
 * Every key named in `spec` is removed from the outgoing args. Keys the caller enabled are
 * recorded in `requested` with their sub-projection; a key explicitly disabled
 * (`false`/`undefined`) is removed but NOT recorded, so it is never hydrated.
 *
 * - `select`: `id` is forced on, because hydrators key off the parent `id`.
 * - `include`: if nothing is left after stripping, the include is dropped entirely (an empty
 *   include is invalid for Prisma) so the full row comes back.
 * - neither: `args` is returned untouched.
 *
 * The caller's `args` object is never mutated.
 */
function stripDedicatedRelations(
  args: { select?: any; include?: any },
  spec: DedicatedRelationSpec
): { stripped: { select?: any; include?: any }; requested: Record<string, SubProjection> } {
  const requested: Record<string, SubProjection> = {};

  // Copies `projection`, removes every spec key from the copy, and records the enabled ones.
  const withoutRelations = (projection: Record<string, unknown>): Record<string, unknown> => {
    const remaining: Record<string, unknown> = { ...projection };
    for (const key of Object.keys(spec)) {
      if (!(key in remaining)) {
        continue;
      }
      if (remaining[key]) {
        requested[key] = remaining[key] as SubProjection;
      }
      delete remaining[key];
    }
    return remaining;
  };

  if (args.select) {
    const select = withoutRelations(args.select);
    // Hydrators key off the parent `id`; ensure it survives a narrowed select.
    select.id = true;
    return { stripped: { select }, requested };
  }

  if (args.include) {
    const include = withoutRelations(args.include);
    // An empty include is invalid for Prisma; drop it so the full row comes back.
    const stripped = Object.keys(include).length > 0 ? { include } : {};
    return { stripped, requested };
  }

  return { stripped: args, requested };
}

// --- per-model dedicated-schema relation hydrators ---

// Waitpoint where completedByTaskRunId = run.id (the @unique scalar back-pointer); at most one.
const hydrateAssociatedWaitpoint: DedicatedRelationHydrator = async (
  client,
  parent,
  projection
) => {
  // Looked up through the scalar back-pointer rather than a Prisma relation.
  const wp = (await client.waitpoint.findFirst({
    where: { completedByTaskRunId: parent.id as string },
  })) as Record<string, unknown> | null;
  return applyProjection(wp, projection);
};

// Display connections for a run: WaitpointRunConnection → Waitpoint rows.
// Two reads: the join rows for this run, then the waitpoints they point at. Returns `[]` when
// the client has no `waitpointRunConnection` delegate or the run has no links.
const hydrateConnectedWaitpoints: DedicatedRelationHydrator = async (
  client,
  parent,
  projection
) => {
  const join = client.waitpointRunConnection;
  if (!join) {
    return [];
  }
  const links = (await join.findMany({
    where: { taskRunId: parent.id as string },
    select: { waitpointId: true },
  })) as { waitpointId: string }[];
  if (links.length === 0) {
    return [];
  }
  const rows = (await client.waitpoint.findMany({
    where: { id: { in: links.map((l) => l.waitpointId) } },
  })) as Record<string, unknown>[];
  return rows.map((r) => applyProjection(r, projection));
};

// Completed waitpoints for a snapshot: CompletedWaitpoint join → Waitpoint rows.
// Same two-read shape as above, keyed by the snapshot id. Returns `[]` when the client has no
// `completedWaitpoint` delegate or the snapshot has no links.
const hydrateCompletedWaitpoints: DedicatedRelationHydrator = async (
  client,
  parent,
  projection
) => {
  const join = client.completedWaitpoint;
  if (!join) {
    return [];
  }
  const links = (await join.findMany({
    where: { snapshotId: parent.id as string },
    select: { waitpointId: true },
  })) as { waitpointId: string }[];
  if (links.length === 0) {
    return [];
  }
  const rows = (await client.waitpoint.findMany({
    where: { id: { in: links.map((l) => l.waitpointId) } },
  })) as Record<string, unknown>[];
  return rows.map((r) => applyProjection(r, projection));
};

// Runs a waitpoint is blocking: TaskRunWaitpoint rows keyed by waitpointId. A nested `taskRun`
// select (the run-engine's getWaitpoint shape) is resolved from the scalar TaskRunWaitpoint.taskRunId.
const hydrateBlockingTaskRuns: DedicatedRelationHydrator = async (client, parent, projection) => {
  const edges = (await client.taskRunWaitpoint.findMany({
    where: { waitpointId: parent.id as string },
  })) as Record<string, unknown>[];
  // Prisma lets a caller nest a relation under either `select` or `include`; honor both.
  const nestedTaskRun = projection?.select?.taskRun ?? projection?.include?.taskRun;
  if (!nestedTaskRun || edges.length === 0) {
    return edges;
  }
  const runProjection = projectionOf(nestedTaskRun as SubProjection);
  // One batched read for every edge's run instead of one findFirst per edge.
  const runIds = [...new Set(edges.map((edge) => edge.taskRunId as string))];
  const runs = (await client.taskRun.findMany({
    where: { id: { in: runIds } },
  })) as Record<string, unknown>[];
  const runsById = new Map<string, Record<string, unknown>>();
  for (const run of runs) {
    runsById.set(run.id as string, run);
  }
  // An edge whose run is not found on this client gets `taskRun: null`.
  return edges.map((edge) => ({
    ...edge,
    taskRun: applyProjection(runsById.get(edge.taskRunId as string) ?? null, runProjection),
  }));
};

// Display connections for a waitpoint: WaitpointRunConnection → TaskRun rows.
// Two reads: the join rows for this waitpoint, then the runs they point at. Returns `[]` when
// the client has no `waitpointRunConnection` delegate or the waitpoint has no links.
const hydrateConnectedRuns: DedicatedRelationHydrator = async (client, parent, projection) => {
  const join = client.waitpointRunConnection;
  if (!join) {
    return [];
  }
  const links = (await join.findMany({
    where: { waitpointId: parent.id as string },
    select: { taskRunId: true },
  })) as { taskRunId: string }[];
  if (links.length === 0) {
    return [];
  }
  const rows = (await client.taskRun.findMany({
    where: { id: { in: links.map((l) => l.taskRunId) } },
  })) as Record<string, unknown>[];
  return rows.map((r) => applyProjection(r, projection));
};

// Snapshots that completed a waitpoint: CompletedWaitpoint join → TaskRunExecutionSnapshot rows.
const hydrateCompletedExecutionSnapshots: DedicatedRelationHydrator = async (
  client,
  parent,
  projection
) => {
  const join = client.completedWaitpoint;
  if (!join) {
    // No join delegate on this client: nothing to resolve.
    return [];
  }
  const links = (await join.findMany({
    where: { waitpointId: parent.id as string },
    select: { snapshotId: true },
  })) as { snapshotId: string }[];
  if (links.length === 0) {
    return [];
  }
  const rows = (await client.taskRunExecutionSnapshot.findMany({
    where: { id: { in: links.map((l) => l.snapshotId) } },
  })) as Record<string, unknown>[];
  return rows.map((r) => applyProjection(r, projection));
};

// The waitpoint a block edge points at, resolved from the edge's scalar `waitpointId`. The edge's
// own client only finds a co-resident token; the router re-resolves cross-DB.
// Returns null when the edge carries no `waitpointId` or no matching row exists on this client.
const hydrateEdgeWaitpoint: DedicatedRelationHydrator = async (client, parent, projection) => {
  const waitpointId = parent.waitpointId as string | undefined;
  if (!waitpointId) {
    return null;
  }
  const wp = (await client.waitpoint.findFirst({
    where: { id: waitpointId },
  })) as Record<string, unknown> | null;
  return applyProjection(wp, projection);
};

// The run a block edge belongs to, resolved from the edge's scalar `taskRunId`.
// Returns null when the edge carries no `taskRunId` or no matching row exists on this client.
const hydrateEdgeTaskRun: DedicatedRelationHydrator = async (client, parent, projection) => {
  const taskRunId = parent.taskRunId as string | undefined;
  if (!taskRunId) {
    return null;
  }
  const run = (await client.taskRun.findFirst({
    where: { id: taskRunId },
  })) as Record<string, unknown> | null;
  return applyProjection(run, projection);
};

// Relation keys on TaskRun that the store hydrates itself, mapped to their hydrators.
const TASK_RUN_DEDICATED: DedicatedRelationSpec = {
  associatedWaitpoint: hydrateAssociatedWaitpoint,
  connectedWaitpoints: hydrateConnectedWaitpoints,
};

// Dedicated-schema relations on the TaskRunWaitpoint (block edge) model. The dedicated subset has only the
// scalar `waitpointId`/`taskRunId`, so a caller `select`/`include` naming these relations must be
// stripped and hydrated.
const TASK_RUN_WAITPOINT_DEDICATED: DedicatedRelationSpec = {
  waitpoint: hydrateEdgeWaitpoint,
  taskRun: hydrateEdgeTaskRun,
};

// Relation name → hydrator for relations requested on a TaskRunExecutionSnapshot projection.
const SNAPSHOT_DEDICATED: DedicatedRelationSpec = {
  completedWaitpoints: hydrateCompletedWaitpoints,
};

// Relation name → hydrator for relations requested on a Waitpoint projection.
const WAITPOINT_DEDICATED: DedicatedRelationSpec = {
  blockingTaskRuns: hydrateBlockingTaskRuns,
  connectedRuns: hydrateConnectedRuns,
  completedExecutionSnapshots: hydrateCompletedExecutionSnapshots,
};

// Cross-generation Prisma error normalization.
//
// The store can be backed by the control-plane `@trigger.dev/database` client OR the
// run-ops `@internal/run-ops-database` client. Each is a SEPARATELY generated client with
// its own copy of the Prisma runtime, so each has its OWN `PrismaClientKnownRequestError`
// class object (identical code, distinct module identity). A P2002 from the run-ops client
// is therefore NOT `instanceof` the control-plane class — so the webapp's uniform
// `error instanceof Prisma.PrismaClientKnownRequestError` P2002→422 conversion is skipped and
// a raw 500 escapes. The store normalizes at its write boundary: any foreign
// known-request-error is re-thrown as the control-plane class so every routed-write caller's
// `instanceof` works regardless of which client raised it.

// `instanceof` can't detect a foreign generation's class, so key on the runtime `name` the
// Prisma runtime stamps on every generation plus a string `code` (the P-code).
// Returns true only for an object that looks like a known-request-error by `name`/`code`
// and is NOT already an instance of the control-plane class.
function isForeignPrismaKnownRequestError(error: unknown): error is {
  name: string;
  message: string;
  code: string;
  meta?: unknown;
  clientVersion?: string;
} {
  if (typeof error !== "object" || error === null) {
    return false;
  }
  const candidate = error as { name?: unknown; code?: unknown };
  if (candidate.name !== "PrismaClientKnownRequestError") {
    return false;
  }
  if (typeof candidate.code !== "string") {
    return false;
  }
  // Native (control-plane) errors already satisfy callers' `instanceof`; leave them alone.
  return !(error instanceof Prisma.PrismaClientKnownRequestError);
}

// Native + non-known-request errors are returned unchanged (caller re-throws the result).
function normalizeRunOpsError(error: unknown): unknown {
  if (!isForeignPrismaKnownRequestError(error)) {
    return error;
  }
  // Rebuild as the control-plane class, carrying over the message, P-code and meta.
  return new Prisma.PrismaClientKnownRequestError(error.message, {
    code: error.code,
    clientVersion: error.clientVersion ?? "unknown",
    meta: error.meta as Record<string, unknown> | undefined,
  });
}

// Only these Prisma-model delegates carry the create/update/upsert writes that raise P2002;
// `$queryRaw`/`$executeRaw`/`$transaction` are left untouched (raw queries here never raise a
// duplicate-key, and wrapping their tagged-template/callback contract would break it).
const RUN_OPS_DELEGATE_KEYS: ReadonlySet<string> = new Set([
  "taskRun",
  "taskRunAttempt",
  "taskRunExecutionSnapshot",
  "taskRunWaitpoint",
  "taskRunCheckpoint",
  "checkpoint",
  "checkpointRestoreEvent",
  "taskRunDependency",
  "waitpoint",
  "completedWaitpoint",
  "waitpointRunConnection",
  "batchTaskRun",
  "batchTaskRunItem",
]);

// Rejection handler shared by every wrapped result: re-throw the normalized reason.
function rethrowNormalizedRunOpsError(error: unknown): never {
  throw normalizeRunOpsError(error);
}

// True for any non-null object/function exposing a callable `then`.
function isThenableObject(value: unknown): value is object & PromiseLike<unknown> {
  return (
    value != null &&
    (typeof value === "object" || typeof value === "function") &&
    typeof (value as { then?: unknown }).then === "function"
  );
}

// Wraps a non-native thenable (a Prisma query object) WITHOUT subscribing to it. Prisma queries
// are lazy: they start on the first `then`/`catch`/`finally`, or when handed to the batch form of
// `$transaction([...])`, which also requires the original query object. Calling `.then` up front
// would start the query immediately and replace it with a plain Promise, so only the three
// subscription methods are intercepted; every other property is read from the original object.
function deferRejectionNormalization<T extends object & PromiseLike<unknown>>(query: T): T {
  const settle = (): Promise<unknown> =>
    Promise.resolve(query.then(undefined, rethrowNormalizedRunOpsError));

  return new Proxy(query, {
    get(target, prop) {
      if (prop === "then") {
        return (
          onFulfilled?: ((value: unknown) => unknown) | null,
          onRejected?: ((reason: unknown) => unknown) | null
        ) => settle().then(onFulfilled, onRejected);
      }
      if (prop === "catch") {
        return (onRejected?: ((reason: unknown) => unknown) | null) => settle().catch(onRejected);
      }
      if (prop === "finally") {
        return (onFinally?: (() => void) | null) => settle().finally(onFinally);
      }
      return Reflect.get(target, prop, target);
    },
  });
}

// Every method call on a delegate rewrites ONLY its rejection reason; success is untouched.
// Synchronous throws are normalized too. Non-function properties are returned as-is.
function wrapDelegateForErrorNormalization<D extends object>(delegate: D): D {
  return new Proxy(delegate, {
    get(target, prop, receiver) {
      const value = Reflect.get(target, prop, receiver);
      if (typeof value !== "function") {
        return value;
      }
      return (...args: unknown[]) => {
        let result: unknown;
        try {
          result = (value as (...a: unknown[]) => unknown).apply(target, args);
        } catch (error) {
          throw normalizeRunOpsError(error);
        }
        // A native Promise is already running, so chaining on it changes nothing but the reason.
        if (result instanceof Promise) {
          return result.then(undefined, rethrowNormalizedRunOpsError);
        }
        // Any other thenable (a Prisma query) keeps its identity and laziness.
        if (isThenableObject(result)) {
          return deferRejectionNormalization(result);
        }
        return result;
      };
    },
  });
}

// Model delegates are wrapped; `$transaction` wraps its tx client so inner writes normalize
// too; every other property (incl. `$queryRaw`/`$executeRaw`) passes through unchanged.
export function wrapRunOpsClientForErrorNormalization<C>(client: C): C {
  // Some tests inject a non-object fake (or nothing) as the client; only a real client can be
  // proxied, and only a real client raises the foreign known-request-errors we normalize.
  if (client == null || (typeof client !== "object" && typeof client !== "function")) {
    return client;
  }

  // One wrapped delegate per model key, so repeated property reads return the same proxy.
  const wrappedDelegates = new Map<string, object>();

  const delegateFor = (target: object, key: string, receiver: unknown): unknown => {
    const hit = wrappedDelegates.get(key);
    if (hit) {
      return hit;
    }
    const raw: unknown = Reflect.get(target, key, receiver);
    if (raw == null || typeof raw !== "object") {
      return raw;
    }
    const wrapped = wrapDelegateForErrorNormalization(raw);
    wrappedDelegates.set(key, wrapped);
    return wrapped;
  };

  const transactionFor = (target: object, receiver: unknown): unknown => {
    const original: unknown = Reflect.get(target, "$transaction", receiver);
    if (typeof original !== "function") {
      return original;
    }
    const invoke = original as (...a: unknown[]) => unknown;
    return (fnOrArray: unknown, ...rest: unknown[]) => {
      // Array form: handed to the underlying client untouched.
      if (typeof fnOrArray !== "function") {
        return invoke.call(target, fnOrArray, ...rest);
      }
      // Interactive (callback) form: wrap the tx client so inner writes normalize too.
      const callback = fnOrArray as (t: RunOpsCapableClient) => unknown;
      const wrappedFn = (tx: RunOpsCapableClient) =>
        callback(wrapRunOpsClientForErrorNormalization(tx));
      return invoke.call(target, wrappedFn, ...rest);
    };
  };

  return new Proxy(client, {
    get(target, prop, receiver) {
      if (typeof prop === "string" && RUN_OPS_DELEGATE_KEYS.has(prop)) {
        return delegateFor(target, prop, receiver);
      }
      if (prop === "$transaction") {
        return transactionFor(target, receiver);
      }
      return Reflect.get(target, prop, receiver);
    },
  }) as C;
}

 /** * Typed write layer for the task-run row, backed by the `taskRun` Prisma model. * * Each method is a verbatim relocation of the Prisma statement that lives at a * specific call site today. Methods write through `(tx ?? this.prisma).taskRun` - * so callers can opt into an existing transaction. 
Errors (including unique - * constraint violations) propagate to the caller unchanged. + * so callers can opt into an existing transaction. Errors surface with unique + * constraint violations (P2002 etc.) normalized to the control-plane + * `Prisma.PrismaClientKnownRequestError` class (see `wrapRunOpsClientForErrorNormalization`), + * so `instanceof Prisma.PrismaClientKnownRequestError` works regardless of which + * generated client backs the store. */ export class PostgresRunStore implements RunStore { - private readonly prisma: PrismaClient; - private readonly readOnlyPrisma: PrismaReplicaClient; + private readonly prisma: RunOpsCapableClient; + private readonly readOnlyPrisma: RunOpsCapableClient; + private readonly schemaVariant: RunStoreSchemaVariant; constructor(options: PostgresRunStoreOptions) { - this.prisma = options.prisma; - this.readOnlyPrisma = options.readOnlyPrisma; + // Normalize foreign (run-ops-generation) Prisma known-request-errors to the control-plane + // class at the write boundary so callers' `instanceof Prisma.PrismaClientKnownRequestError` + // (P2002→422) works regardless of which generated client backs the store. + this.prisma = wrapRunOpsClientForErrorNormalization(options.prisma); + this.readOnlyPrisma = wrapRunOpsClientForErrorNormalization(options.readOnlyPrisma); + this.schemaVariant = options.schemaVariant ?? "legacy"; + } + + // Open ONE interactive transaction on this store's OWN writer client and run `fn` against THIS store + // (so subclass overrides survive) with the tx as the client to thread into the inner writes. `runId` + // is ignored here — a single store has one connection — but is in the contract so the router can + // resolve the owner. Only the writer opens transactions; the replica has no `$transaction`. 
+ async runInTransaction( + _runId: string | undefined, + fn: (store: RunStore, tx: PrismaClientOrTransaction) => Promise + ): Promise { + return (this.prisma as RunOpsTransactionalClient).$transaction((tx) => + fn(this, tx as unknown as PrismaClientOrTransaction) + ); } async createRun( @@ -50,6 +526,31 @@ export class PostgresRunStore implements RunStore { ): Promise { const client = tx ?? this.prisma; + const snapshotCreate = { + engine: params.snapshot.engine, + executionStatus: params.snapshot.executionStatus, + description: params.snapshot.description, + runStatus: params.snapshot.runStatus, + environmentId: params.snapshot.environmentId, + environmentType: params.snapshot.environmentType, + projectId: params.snapshot.projectId, + organizationId: params.snapshot.organizationId, + workerId: params.snapshot.workerId, + runnerId: params.snapshot.runnerId, + }; + + if (this.schemaVariant === "dedicated") { + const run = (await client.taskRun.create({ + data: { + ...params.data, + executionSnapshots: { create: snapshotCreate }, + }, + })) as TaskRun; + + const associatedWaitpoint = await this.#createAssociatedWaitpoint(client, run.id, params); + return { ...run, associatedWaitpoint }; + } + return client.taskRun.create({ include: { associatedWaitpoint: true, @@ -57,18 +558,7 @@ export class PostgresRunStore implements RunStore { data: { ...params.data, executionSnapshots: { - create: { - engine: params.snapshot.engine, - executionStatus: params.snapshot.executionStatus, - description: params.snapshot.description, - runStatus: params.snapshot.runStatus, - environmentId: params.snapshot.environmentId, - environmentType: params.snapshot.environmentType, - projectId: params.snapshot.projectId, - organizationId: params.snapshot.organizationId, - workerId: params.snapshot.workerId, - runnerId: params.snapshot.runnerId, - }, + create: snapshotCreate, }, associatedWaitpoint: params.associatedWaitpoint ? 
{ @@ -79,6 +569,29 @@ export class PostgresRunStore implements RunStore { }); } + /** + * Dedicated-schema replacement for the legacy `associatedWaitpoint: { create }` nested write. + * On the subset schema the association is the scalar `Waitpoint.completedByTaskRunId`, so the + * RUN-type waitpoint is created as its own row pointing back at the run, then returned so the + * caller can hydrate the same `{ run, associatedWaitpoint }` contract the legacy include gives. + */ + async #createAssociatedWaitpoint( + client: RunOpsCapableClient, + runId: string, + params: CreateRunInput | CreateFailedRunInput + ): Promise { + if (!params.associatedWaitpoint) { + return null; + } + + return (await client.waitpoint.create({ + data: { + ...params.associatedWaitpoint, + completedByTaskRunId: runId, + }, + })) as TaskRunWithWaitpoint["associatedWaitpoint"]; + } + async createCancelledRun( params: CreateCancelledRunInput, tx?: PrismaClientOrTransaction @@ -112,6 +625,15 @@ export class PostgresRunStore implements RunStore { ): Promise { const client = tx ?? this.prisma; + if (this.schemaVariant === "dedicated") { + const run = (await client.taskRun.create({ + data: { ...params.data }, + })) as TaskRun; + + const associatedWaitpoint = await this.#createAssociatedWaitpoint(client, run.id, params); + return { ...run, associatedWaitpoint }; + } + return client.taskRun.create({ include: { associatedWaitpoint: true, @@ -135,16 +657,17 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? 
this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "EXECUTING", attemptNumber: data.attemptNumber, executedAt: data.executedAt, isWarmStart: data.isWarmStart, }, - select: args.select, - }) as Promise>; + { select: args.select } + ) as Promise>; } async completeAttemptSuccess( @@ -162,9 +685,10 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "COMPLETED_SUCCESSFULLY", completedAt: data.completedAt, output: data.output, @@ -186,27 +710,28 @@ export class PostgresRunStore implements RunStore { }, }, }, - select: args.select, - }) as Promise>; + { select: args.select } + ) as Promise>; } - async recordRetryOutcome( + async recordRetryOutcome( runId: string, data: { machinePreset?: string; usageDurationMs: number; costInCents: number }, - args: { include: I }, + args: { select: S }, tx?: PrismaClientOrTransaction - ): Promise> { + ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { machinePreset: data.machinePreset, usageDurationMs: data.usageDurationMs, costInCents: data.costInCents, }, - include: args.include, - }) as Promise>; + { select: args.select } + ) as Promise>; } async requeueRun( @@ -216,11 +741,12 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? 
this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { status: "PENDING" }, - select: args.select, - }) as Promise>; + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "PENDING" }, + { select: args.select } + ) as Promise>; } async recordBulkActionMembership( @@ -254,9 +780,10 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "CANCELED", ...(data.completedAt !== undefined && { completedAt: data.completedAt }), error: data.error as Prisma.InputJsonValue, @@ -266,8 +793,8 @@ export class PostgresRunStore implements RunStore { ...(data.usageDurationMs !== undefined && { usageDurationMs: data.usageDurationMs }), ...(data.costInCents !== undefined && { costInCents: data.costInCents }), }, - select: args.select, - }) as Promise>; + { select: args.select } + ) as Promise>; } async failRunPermanently( @@ -284,17 +811,18 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: data.status, completedAt: data.completedAt, error: data.error as Prisma.InputJsonValue, usageDurationMs: data.usageDurationMs, costInCents: data.costInCents, }, - select: args.select, - }) as Promise>; + { select: args.select } + ) as Promise>; } async expireRun( @@ -310,9 +838,10 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? 
this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "EXPIRED", completedAt: data.completedAt, expiredAt: data.expiredAt, @@ -330,8 +859,8 @@ export class PostgresRunStore implements RunStore { }, }, }, - select: args.select, - }) as Promise>; + { select: args.select } + ) as Promise>; } async expireRunsBatch( @@ -347,6 +876,21 @@ export class PostgresRunStore implements RunStore { return 0; } + // Dedicated: the run-ops generated client binds a bare value array ambiguously (jsonb), so we + // pass the id list as a single `text[]` param and match with `= ANY`, mirroring blockRunWithWaitpointEdges. + if (this.schemaVariant === "dedicated") { + const ids = runIds; + return prisma.$executeRaw` + UPDATE "TaskRun" + SET "status" = 'EXPIRED'::"TaskRunStatus", + "completedAt" = ${data.now}, + "expiredAt" = ${data.now}, + "updatedAt" = ${data.now}, + "error" = ${JSON.stringify(data.error)}::jsonb + WHERE "id" = ANY(${ids}::text[]) + `; + } + return prisma.$executeRaw` UPDATE "TaskRun" SET "status" = 'EXPIRED'::"TaskRunStatus", @@ -358,14 +902,36 @@ export class PostgresRunStore implements RunStore { `; } + /** + * Dedicated-schema replacement for the legacy `completedWaitpoints: { connect }` nested write. + * On the subset schema the snapshot↔waitpoint links live in the explicit FK-free + * `CompletedWaitpoint` join model, so we insert `{ snapshotId, waitpointId }` rows directly. 
+ */ + async #connectCompletedWaitpoints( + client: RunOpsCapableClient, + snapshotId: string, + waitpointIds: string[] + ): Promise { + if (waitpointIds.length === 0 || !client.completedWaitpoint) { + return; + } + + await client.completedWaitpoint.createMany({ + data: waitpointIds.map((waitpointId) => ({ snapshotId, waitpointId })), + skipDuplicates: true, + }); + } + async lockRunToWorker( runId: string, data: LockRunData, tx?: PrismaClientOrTransaction - ): Promise> { + ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + const dedicated = this.schemaVariant === "dedicated"; + + const result = await prisma.taskRun.update({ where: { id: runId }, data: { status: "DEQUEUED", @@ -397,19 +963,32 @@ export class PostgresRunStore implements RunStore { organizationId: data.snapshot.organizationId, checkpointId: data.snapshot.checkpointId ?? undefined, batchId: data.snapshot.batchId ?? undefined, - completedWaitpoints: { - connect: data.snapshot.completedWaitpointIds.map((id) => ({ id })), - }, + // Legacy: connect the implicit M2M. Dedicated: links inserted below into the + // CompletedWaitpoint join model (no such relation field exists on that schema). + ...(dedicated + ? {} + : { + completedWaitpoints: { + connect: data.snapshot.completedWaitpointIds.map((id) => ({ id })), + }, + }), completedWaitpointOrder: data.snapshot.completedWaitpointOrder, workerId: data.snapshot.workerId ?? undefined, runnerId: data.snapshot.runnerId ?? undefined, }, }, }, - include: { - runtimeEnvironment: true, - }, }); + + if (dedicated) { + await this.#connectCompletedWaitpoints( + prisma, + data.snapshot.id, + data.snapshot.completedWaitpointIds + ); + } + + return result; } async parkPendingVersion( @@ -420,14 +999,15 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? 
this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "PENDING_VERSION", statusReason: data.statusReason, }, - select: args.select, - }) as Promise>; + { select: args.select } + ) as Promise>; } async promotePendingVersionRuns( @@ -451,11 +1031,12 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { status: "WAITING_TO_RESUME" }, - include: args.include, - }) as Promise>; + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "WAITING_TO_RESUME" }, + { include: args.include } + ) as Promise>; } async resumeFromCheckpoint( @@ -465,11 +1046,12 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ - where: { id: runId }, - data: { status: "EXECUTING" }, - select: args.select, - }) as Promise>; + return this.#updateTaskRunWithSelect( + prisma, + { id: runId }, + { status: "EXECUTING" }, + { select: args.select } + ) as Promise>; } async rescheduleRun( @@ -525,6 +1107,12 @@ export class PostgresRunStore implements RunStore { ): Promise { const prisma = tx ?? this.prisma; + if (this.schemaVariant === "dedicated") { + const run = (await prisma.taskRun.update({ where: { id: runId }, data })) as TaskRun; + const associatedWaitpoint = await this.#findAssociatedWaitpoint(prisma, runId); + return { ...run, associatedWaitpoint }; + } + return prisma.taskRun.update({ where: { id: runId }, data, @@ -534,6 +1122,20 @@ export class PostgresRunStore implements RunStore { }); } + /** + * Dedicated-schema replacement for the legacy `include: { associatedWaitpoint: true }` run read. + * The relation doesn't exist on the subset schema; the RUN-type waitpoint is found by its scalar + * `completedByTaskRunId` back-pointer (`@unique`), so at most one matches. 
+ */ + async #findAssociatedWaitpoint( + client: RunOpsCapableClient, + runId: string + ): Promise { + return (await client.waitpoint.findFirst({ + where: { completedByTaskRunId: runId }, + })) as TaskRunWithWaitpoint["associatedWaitpoint"]; + } + async updateMetadata( runId: string, data: { @@ -596,6 +1198,15 @@ export class PostgresRunStore implements RunStore { return { count: result.count }; } + async createBatchTaskRunItem( + data: { batchTaskRunId: string; taskRunId: string; status: BatchTaskRunItemStatus }, + tx?: PrismaClientOrTransaction + ): Promise { + const prisma = tx ?? this.prisma; + + await prisma.batchTaskRunItem.create({ data }); + } + async pushTags( runId: string, tags: string[], @@ -642,10 +1253,7 @@ export class PostgresRunStore implements RunStore { ): Promise { const { args, prisma } = this.#resolveReadArgs(argsOrClient, client); - return prisma.taskRun.findFirst({ - where, - ...args, - }); + return this.#findTaskRunWithSelect(prisma, "findFirst", where, args); } findRunOrThrow( @@ -666,10 +1274,44 @@ export class PostgresRunStore implements RunStore { ): Promise { const { args, prisma } = this.#resolveReadArgs(argsOrClient, client); - return prisma.taskRun.findFirstOrThrow({ - where, - ...args, - }); + return this.#findTaskRunWithSelect(prisma, "findFirstOrThrow", where, args); + } + + // Read-after-write on THIS store's PRIMARY (writer), never the replica. Mirrors + // `findWaitpointOnPrimary`: a caller that just wrote a run in this request re-reads it here so + // replica lag can't null out a fresh row and turn a successful create into a false "not found". + // The routing store dispatches here (per owning store) when the caller passed the control-plane + // writer, so each store reads its OWN writer and never leaks a control-plane client into another DB. 
+ findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { select: S } + ): Promise | null>; + findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { include: I } + ): Promise | null>; + findRunOnPrimary(where: Prisma.TaskRunWhereInput): Promise; + async findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args?: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude } + ): Promise { + return this.#findTaskRunWithSelect(this.prisma, "findFirst", where, args ?? {}); + } + + findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { select: S } + ): Promise>; + findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { include: I } + ): Promise>; + findRunOrThrowOnPrimary(where: Prisma.TaskRunWhereInput): Promise; + async findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args?: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude } + ): Promise { + return this.#findTaskRunWithSelect(this.prisma, "findFirstOrThrow", where, args ?? {}); } findRuns( @@ -718,7 +1360,776 @@ export class PostgresRunStore implements RunStore { ): Promise { const prisma = client ?? this.readOnlyPrisma; - return prisma.taskRun.findMany(args); + if (this.schemaVariant !== "dedicated" || (!args.select && !args.include)) { + return prisma.taskRun.findMany(args); + } + + const { where, orderBy, take, skip, cursor, ...projection } = args; + const { stripped, requested } = stripDedicatedRelations(projection, TASK_RUN_DEDICATED); + const rows = (await (prisma as RunOpsCapableClient).taskRun.findMany({ + where, + orderBy, + take, + skip, + cursor, + ...stripped, + })) as Record[]; + for (const row of rows) { + await this.#hydrateDedicatedRelations( + prisma as RunOpsCapableClient, + row, + requested, + TASK_RUN_DEDICATED + ); + } + return rows; + } + + // --- run-ops persistence --- + + async findLatestExecutionSnapshot( + runId: string, + client?: ReadClient + ): Promise | null> { + const prisma = client ?? 
this.readOnlyPrisma; + + if (this.schemaVariant === "dedicated") { + const snapshot = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId, isValid: true }, + include: { checkpoint: true }, + orderBy: { createdAt: "desc" }, + }); + if (!snapshot) { + return null; + } + const completedWaitpoints = await this.#hydrateCompletedWaitpoints(prisma, snapshot.id); + return { ...snapshot, completedWaitpoints }; + } + + return prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId, isValid: true }, + include: { + completedWaitpoints: true, + checkpoint: true, + }, + orderBy: { createdAt: "desc" }, + }); + } + + /** + * Dedicated-schema replacement for the legacy `include: { completedWaitpoints: true }` snapshot + * read. The relation doesn't exist on the subset schema, so we resolve the linked waitpoint ids + * from the explicit `CompletedWaitpoint` join model and load the rows to fill the same array. + */ + async #hydrateCompletedWaitpoints( + client: RunOpsCapableClient, + snapshotId: string + ): Promise { + if (!client.completedWaitpoint) { + return []; + } + const links = (await client.completedWaitpoint.findMany({ + where: { snapshotId }, + select: { waitpointId: true }, + })) as { waitpointId: string }[]; + if (links.length === 0) { + return []; + } + return client.waitpoint.findMany({ + where: { id: { in: links.map((l) => l.waitpointId) } }, + }); + } + + async findExecutionSnapshot( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise | null> { + const prisma = client ?? 
this.readOnlyPrisma; + + if (this.schemaVariant !== "dedicated") { + return prisma.taskRunExecutionSnapshot.findFirst( + args + ) as Promise | null>; + } + + const { where, orderBy, take, skip, cursor, ...projection } = args as Record; + return this.#runDedicatedSelect( + prisma as RunOpsCapableClient, + (stripped) => + (prisma as RunOpsCapableClient).taskRunExecutionSnapshot.findFirst({ + where, + orderBy, + take, + skip, + cursor, + ...stripped, + }), + projection, + SNAPSHOT_DEDICATED + ) as Promise | null>; + } + + async findManyExecutionSnapshots( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]> { + const prisma = client ?? this.readOnlyPrisma; + + if (this.schemaVariant !== "dedicated") { + return prisma.taskRunExecutionSnapshot.findMany(args) as Promise< + Prisma.TaskRunExecutionSnapshotGetPayload[] + >; + } + + const { where, orderBy, take, skip, cursor, ...projection } = args as Record; + const { stripped, requested } = stripDedicatedRelations(projection, SNAPSHOT_DEDICATED); + const rows = (await (prisma as RunOpsCapableClient).taskRunExecutionSnapshot.findMany({ + where, + orderBy, + take, + skip, + cursor, + ...stripped, + })) as Record[]; + for (const row of rows) { + await this.#hydrateDedicatedRelations( + prisma as RunOpsCapableClient, + row, + requested, + SNAPSHOT_DEDICATED + ); + } + return rows as Prisma.TaskRunExecutionSnapshotGetPayload[]; + } + + async createExecutionSnapshot( + input: CreateExecutionSnapshotInput, + tx?: PrismaClientOrTransaction + ): Promise> { + const prisma = tx ?? 
this.prisma; + + const { + run, + snapshot, + previousSnapshotId, + batchId, + environmentId, + environmentType, + projectId, + organizationId, + checkpointId, + workerId, + runnerId, + completedWaitpoints, + error, + } = input; + + const dedicated = this.schemaVariant === "dedicated"; + + const newSnapshot = await prisma.taskRunExecutionSnapshot.create({ + data: { + engine: "V2", + executionStatus: snapshot.executionStatus, + description: snapshot.description, + previousSnapshotId, + runId: run.id, + // We can't set the runStatus to DEQUEUED because it will break older runners + runStatus: run.status === "DEQUEUED" ? "PENDING" : run.status, + attemptNumber: run.attemptNumber ?? undefined, + batchId, + environmentId, + environmentType, + projectId, + organizationId, + checkpointId, + workerId, + runnerId, + metadata: snapshot.metadata ?? undefined, + // Legacy: connect the implicit M2M. Dedicated: links inserted below into the + // CompletedWaitpoint join model (no such relation field exists on that schema). + ...(dedicated + ? {} + : { + completedWaitpoints: { + connect: completedWaitpoints?.map((w) => ({ id: w.id })), + }, + }), + completedWaitpointOrder: completedWaitpoints + ?.filter((c) => c.index !== undefined) + .sort((a, b) => a.index! - b.index!) + .map((w) => w.id), + isValid: error ? false : true, + error, + }, + include: { checkpoint: true }, + }); + + if (dedicated) { + await this.#connectCompletedWaitpoints( + prisma, + newSnapshot.id, + completedWaitpoints?.map((w) => w.id) ?? [] + ); + } + + return newSnapshot; + } + + async findSnapshotCompletedWaitpointIds( + snapshotId: string, + client?: ReadClient + ): Promise { + const prisma = client ?? this.readOnlyPrisma; + + // Dedicated: the links live in the explicit CompletedWaitpoint join model; the legacy implicit + // `_completedWaitpoints` M2M table does not exist on the subset schema. 
(`ReadClient` does not + // surface the join delegate; on the dedicated path the read client is always a RunOpsClient.) + const joinDelegate = (prisma as RunOpsCapableClient).completedWaitpoint; + if (this.schemaVariant === "dedicated" && joinDelegate) { + const links = (await joinDelegate.findMany({ + where: { snapshotId }, + select: { waitpointId: true }, + })) as { waitpointId: string }[]; + return links.map((l) => l.waitpointId); + } + + const result = await prisma.$queryRaw<{ B: string }[]>` + SELECT "B" FROM "_completedWaitpoints" WHERE "A" = ${snapshotId} + `; + return result.map((r) => r.B); + } + + // Reverse of `connectedRuns`: the run ids linked to a waitpoint. Co-resident with the RUN (the join + // is written on the run's DB in blockRunWithWaitpointEdges), so the waitpoint's own store can MISS a + // cross-DB run — the router fans this across BOTH DBs. + async findWaitpointConnectedRunIds(waitpointId: string, client?: ReadClient): Promise { + const prisma = client ?? this.readOnlyPrisma; + + const joinDelegate = (prisma as RunOpsCapableClient).waitpointRunConnection; + if (this.schemaVariant === "dedicated" && joinDelegate) { + const links = (await joinDelegate.findMany({ + where: { waitpointId }, + select: { taskRunId: true }, + })) as { taskRunId: string }[]; + return links.map((l) => l.taskRunId); + } + + // Legacy implicit M2M `_WaitpointRunConnections`: A = TaskRun.id, B = Waitpoint.id (alphabetical). + const result = await prisma.$queryRaw<{ A: string }[]>` + SELECT "A" FROM "_WaitpointRunConnections" WHERE "B" = ${waitpointId} + `; + return result.map((r) => r.A); + } + + // Reverse of `completedExecutionSnapshots`: the snapshot ids that completed a waitpoint. The join is + // co-resident with the SNAPSHOT/run, so the waitpoint's own store can MISS a cross-DB snapshot — the + // router fans this across BOTH DBs (the reverse direction of the resume-payload output recovery). 
+ async findWaitpointCompletedSnapshotIds( + waitpointId: string, + client?: ReadClient + ): Promise { + const prisma = client ?? this.readOnlyPrisma; + + const joinDelegate = (prisma as RunOpsCapableClient).completedWaitpoint; + if (this.schemaVariant === "dedicated" && joinDelegate) { + const links = (await joinDelegate.findMany({ + where: { waitpointId }, + select: { snapshotId: true }, + })) as { snapshotId: string }[]; + return links.map((l) => l.snapshotId); + } + + // Legacy implicit M2M `_completedWaitpoints`: A = TaskRunExecutionSnapshot.id, B = Waitpoint.id. + const result = await prisma.$queryRaw<{ A: string }[]>` + SELECT "A" FROM "_completedWaitpoints" WHERE "B" = ${waitpointId} + `; + return result.map((r) => r.A); + } + + async blockRunWithWaitpointEdges(params: { + runId: string; + waitpointIds: string[]; + projectId: string; + spanIdToComplete?: string; + batchId?: string; + batchIndex?: number; + tx?: PrismaClientOrTransaction; + }): Promise { + const { runId, waitpointIds, projectId, spanIdToComplete, batchId, batchIndex, tx } = params; + const prisma = tx ?? this.prisma; + + // Nothing to block for an empty set, and Prisma.join would build an invalid `IN ()` + // clause, so short-circuit before touching the database. + if (waitpointIds.length === 0) { + return; + } + + // Dedicated: the run↔waitpoint connection lives in the explicit FK-free `WaitpointRunConnection` + // table; the legacy implicit `_WaitpointRunConnections` M2M does not exist on the subset schema. + if (this.schemaVariant === "dedicated") { + // Source the edge rows from the waitpointId array DIRECTLY via `unnest`, NOT a join to the local + // `"Waitpoint"` table: this branch is FK-free, and for the tolerated NEW-run→LEGACY-token + // direction the token lives on the OTHER DB, so `FROM "Waitpoint" w` would match 0 rows and the + // run would hang forever. The token's status is resolved at completion by the both-DB fan-out. 
+ // The run-ops client binds a bare array ambiguously (jsonb), so pass it as one `text[]` param. + const ids = waitpointIds; + await prisma.$queryRaw` + WITH inserted AS ( + INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex") + SELECT + gen_random_uuid(), + ${runId}, + w.id, + ${projectId}, + NOW(), + NOW(), + ${spanIdToComplete ?? null}::text, + ${batchId ?? null}::text, + ${batchIndex ?? null}::int + FROM unnest(${ids}::text[]) AS w(id) + ON CONFLICT DO NOTHING + RETURNING "waitpointId" + ), + connected_runs AS ( + INSERT INTO "WaitpointRunConnection" ("id", "taskRunId", "waitpointId") + SELECT gen_random_uuid(), ${runId}, w.id + FROM unnest(${ids}::text[]) AS w(id) + ON CONFLICT DO NOTHING + ) + SELECT COUNT(*) FROM inserted`; + return; + } + + // Insert the blocking connections and the historical run connections. + // We use a CTE to do both inserts atomically. Data-modifying CTEs are + // always executed regardless of whether they're referenced in the outer query. + await prisma.$queryRaw` + WITH inserted AS ( + INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex") + SELECT + gen_random_uuid(), + ${runId}, + w.id, + ${projectId}, + NOW(), + NOW(), + ${spanIdToComplete ?? null}, + ${batchId ?? null}, + ${batchIndex ?? null} + FROM "Waitpoint" w + WHERE w.id IN (${Prisma.join(waitpointIds)}) + ON CONFLICT DO NOTHING + RETURNING "waitpointId" + ), + connected_runs AS ( + INSERT INTO "_WaitpointRunConnections" ("A", "B") + SELECT ${runId}, w.id + FROM "Waitpoint" w + WHERE w.id IN (${Prisma.join(waitpointIds)}) + ON CONFLICT DO NOTHING + ) + SELECT COUNT(*) FROM inserted`; + } + + async countPendingWaitpoints(waitpointIds: string[], client?: ReadClient): Promise { + const prisma = client ?? 
this.readOnlyPrisma; + + if (waitpointIds.length === 0) { + return 0; + } + + // Separate statement from the blocking CTE on purpose: under READ COMMITTED each + // statement gets its own snapshot, so this fresh query reflects concurrent commits the + // CTE's snapshot could not see. + if (this.schemaVariant === "dedicated") { + // The run-ops generated client binds a bare value array ambiguously (jsonb), so pass the id + // list as a single text[] param and match with `= ANY`, mirroring blockRunWithWaitpointEdges. + const pendingCheck = await prisma.$queryRaw<{ pending_count: bigint }[]>` + SELECT COUNT(*) as pending_count + FROM "Waitpoint" + WHERE id = ANY(${waitpointIds}::text[]) + AND status = 'PENDING' + `; + return Number(pendingCheck.at(0)?.pending_count ?? 0); + } + + const pendingCheck = await prisma.$queryRaw<{ pending_count: bigint }[]>` + SELECT COUNT(*) as pending_count + FROM "Waitpoint" + WHERE id IN (${Prisma.join(waitpointIds)}) + AND status = 'PENDING' + `; + return Number(pendingCheck.at(0)?.pending_count ?? 0); + } + + async createWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction + ): Promise> { + const prisma = tx ?? this.prisma; + + return prisma.waitpoint.create(args) as Promise>; + } + + async upsertWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction + ): Promise> { + const prisma = tx ?? this.prisma; + + return prisma.waitpoint.upsert(args) as Promise>; + } + + async findWaitpoint( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise | null> { + return this.#findWaitpointOn(client ?? this.readOnlyPrisma, args); + } + + // Read-after-write on the OWNING store's PRIMARY: the unblock path re-reads a waitpoint it just + // wrote on the primary, and the replica (findWaitpoint's default) can miss it under replication + // lag and wrongly throw "not found", stranding the parent run. 
+ async findWaitpointOnPrimary( + args: Prisma.SelectSubset + ): Promise | null> { + return this.#findWaitpointOn(this.prisma, args); + } + + #findWaitpointOn( + prisma: ReadClient | RunOpsCapableClient, + args: Prisma.SelectSubset + ): Promise | null> { + if (this.schemaVariant !== "dedicated") { + return prisma.waitpoint.findFirst(args) as Promise | null>; + } + + const { where, orderBy, take, skip, cursor, ...projection } = args as Record; + return this.#runDedicatedSelect( + prisma as RunOpsCapableClient, + (stripped) => + (prisma as RunOpsCapableClient).waitpoint.findFirst({ + where, + orderBy, + take, + skip, + cursor, + ...stripped, + }), + projection, + WAITPOINT_DEDICATED + ) as Promise | null>; + } + + async findManyWaitpoints( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]> { + const prisma = client ?? this.readOnlyPrisma; + + if (this.schemaVariant !== "dedicated") { + return prisma.waitpoint.findMany(args) as Promise[]>; + } + + const { where, orderBy, take, skip, cursor, ...projection } = args as Record; + const { stripped, requested } = stripDedicatedRelations(projection, WAITPOINT_DEDICATED); + const rows = (await (prisma as RunOpsCapableClient).waitpoint.findMany({ + where, + orderBy, + take, + skip, + cursor, + ...stripped, + })) as Record[]; + for (const row of rows) { + await this.#hydrateDedicatedRelations( + prisma as RunOpsCapableClient, + row, + requested, + WAITPOINT_DEDICATED + ); + } + return rows as Prisma.WaitpointGetPayload[]; + } + + async updateWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction + ): Promise> { + const prisma = tx ?? this.prisma; + + return prisma.waitpoint.update(args) as Promise>; + } + + async updateManyWaitpoints( + args: Prisma.WaitpointUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const prisma = tx ?? 
this.prisma; + + return prisma.waitpoint.updateMany(args); + } + + async forWaitpointCompletion( + _waitpointId: string, + _context: ForWaitpointCompletionContext + ): Promise { + // Single store: the one store always owns the completion. No classification. + return this; + } + + async findManyTaskRunWaitpoints( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]> { + const prisma = client ?? this.readOnlyPrisma; + + if (this.schemaVariant !== "dedicated") { + return prisma.taskRunWaitpoint.findMany(args) as Promise< + Prisma.TaskRunWaitpointGetPayload[] + >; + } + + // Dedicated subset: strip the `waitpoint`/`taskRun` relation keys (no relation on the subset → + // straight-through would throw a Prisma validation error), run the scalar findMany, then hydrate + // from the edge's own client. A cross-DB token is missed here and re-resolved by the router. + const { where, orderBy, take, skip, cursor, ...projection } = args as Record; + const { stripped, requested } = stripDedicatedRelations( + projection, + TASK_RUN_WAITPOINT_DEDICATED + ); + // Keep the scalar ids the hydrators key off through a narrowed select. + if (stripped.select) { + stripped.select.waitpointId = true; + stripped.select.taskRunId = true; + } + const rows = (await (prisma as RunOpsCapableClient).taskRunWaitpoint.findMany({ + where, + orderBy, + take, + skip, + cursor, + ...stripped, + })) as Record[]; + for (const row of rows) { + await this.#hydrateDedicatedRelations( + prisma as RunOpsCapableClient, + row, + requested, + TASK_RUN_WAITPOINT_DEDICATED + ); + } + return rows as Prisma.TaskRunWaitpointGetPayload[]; + } + + async deleteManyTaskRunWaitpoints( + args: Prisma.TaskRunWaitpointDeleteManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const prisma = tx ?? this.prisma; + + return prisma.taskRunWaitpoint.deleteMany(args); + } + + async findTaskRunAttempt( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise | null> { + const prisma = client ?? 
this.readOnlyPrisma; + + return prisma.taskRunAttempt.findFirst( + args + ) as Promise | null>; + } + + async createTaskRunCheckpoint( + args: Prisma.SelectSubset, + // `ownerRunId` selects the residency at the router; a single store has one client and ignores it. + _ownerRunId?: string, + tx?: PrismaClientOrTransaction + ): Promise> { + const prisma = tx ?? this.prisma; + + return prisma.taskRunCheckpoint.create(args) as Promise>; + } + + // --- BatchTaskRun (run-ops) --- + + async createBatchTaskRun( + data: CreateBatchTaskRunData, + tx?: PrismaClientOrTransaction + ): Promise { + const prisma = tx ?? this.prisma; + + return prisma.batchTaskRun.create({ data }); + } + + async updateBatchTaskRun( + args: { + where: Prisma.BatchTaskRunWhereUniqueInput; + data: Prisma.BatchTaskRunUpdateInput; + select: S; + }, + tx?: PrismaClientOrTransaction + ): Promise> { + const prisma = tx ?? this.prisma; + + return prisma.batchTaskRun.update(args) as Promise< + Prisma.BatchTaskRunGetPayload<{ select: S }> + >; + } + + // Defaults to the primary: the worker reads the just-written batch row and replica + // lag would break it. + async findBatchTaskRunById( + id: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null> { + const prisma = client ?? this.prisma; + + return prisma.batchTaskRun.findFirst({ + where: { id }, + ...(args?.include ? { include: args.include } : {}), + }) as Promise | null>; + } + + async findBatchTaskRunByFriendlyId( + friendlyId: string, + environmentId: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null> { + const prisma = client ?? this.readOnlyPrisma; + + return prisma.batchTaskRun.findFirst({ + where: { friendlyId, runtimeEnvironmentId: environmentId }, + ...(args?.include ? { include: args.include } : {}), + }) as Promise | null>; + } + + // Defaults to the primary: the idempotency probe reads a batch that may have just been + // written within the same request. 
+ async findBatchTaskRunByIdempotencyKey( + environmentId: string, + idempotencyKey: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null> { + const prisma = client ?? this.prisma; + + return prisma.batchTaskRun.findFirst({ + where: { runtimeEnvironmentId: environmentId, idempotencyKey }, + ...(args?.include ? { include: args.include } : {}), + }) as Promise | null>; + } + + async updateManyBatchTaskRun( + args: Prisma.BatchTaskRunUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const prisma = tx ?? this.prisma; + + return prisma.batchTaskRun.updateMany(args); + } + + async countBatchTaskRunItems( + where: { batchTaskRunId: string; status?: BatchTaskRunItemStatus }, + client?: ReadClient + ): Promise { + const prisma = client ?? this.prisma; + + return prisma.batchTaskRunItem.count({ where }); + } + + async updateManyBatchTaskRunItems( + args: Prisma.BatchTaskRunItemUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const prisma = tx ?? this.prisma; + + return prisma.batchTaskRunItem.updateMany(args); + } + + /** + * Run `taskRun.update` honoring a caller `{ select | include }` that may name dedicated-schema + * relation keys. Legacy passes through unchanged; dedicated strips + hydrates via the shared adapter. + */ + #updateTaskRunWithSelect( + prisma: RunOpsCapableClient, + where: Prisma.TaskRunWhereUniqueInput, + data: any, + args: { select?: any; include?: any } + ): Promise { + if (this.schemaVariant !== "dedicated") { + return prisma.taskRun.update({ where, data, ...args }); + } + return this.#runDedicatedSelect( + prisma, + (stripped) => prisma.taskRun.update({ where, data, ...stripped }), + args, + TASK_RUN_DEDICATED + ); + } + + /** Run `taskRun.findFirst`/`findFirstOrThrow` honoring a caller select/include (dedicated-schema-relation aware). 
*/ + #findTaskRunWithSelect( + prisma: ReadClient | RunOpsCapableClient, + method: "findFirst" | "findFirstOrThrow", + where: Prisma.TaskRunWhereInput, + args: { select?: any; include?: any } + ): Promise { + const delegate = (prisma as RunOpsCapableClient).taskRun; + if (this.schemaVariant !== "dedicated") { + return delegate[method]({ where, ...args }); + } + return this.#runDedicatedSelect( + prisma as RunOpsCapableClient, + (stripped) => delegate[method]({ where, ...stripped }), + args, + TASK_RUN_DEDICATED + ); + } + + // --- dedicated-schema caller-select adapter (P2-store-bodies-2) --- + // On the dedicated subset the relation keys the run-engine selects don't exist (they're stripped on + // the dedicated schema and hydrated from scalars/joins). We strip + // them from the caller's select/include, run the query, then hydrate from the scalar/join model + // and merge back so the returned shape is unchanged. Legacy passes the keys through unchanged. + + // Strip the dedicated-schema relation keys, run the single-result delegate query, then hydrate the stripped keys back. + async #runDedicatedSelect( + client: RunOpsCapableClient, + runQuery: (strippedArgs: { select?: any; include?: any }) => Promise, + args: { select?: any; include?: any }, + spec: DedicatedRelationSpec + ): Promise { + const { stripped, requested } = stripDedicatedRelations(args, spec); + const row = await runQuery(stripped); + if (!row) { + return row; + } + await this.#hydrateDedicatedRelations(client, row, requested, spec); + return row; + } + + // Hydrate each requested dedicated-schema relation key onto `row` in place, honoring the caller's sub-select. 
+ async #hydrateDedicatedRelations( + client: RunOpsCapableClient, + row: Record, + requested: Record, + spec: DedicatedRelationSpec + ): Promise { + for (const key of Object.keys(requested)) { + const hydrator = spec[key]; + if (!hydrator) { + continue; + } + const subArgs = requested[key]; + row[key] = await hydrator(client, row, projectionOf(subArgs), this); + } } /** @@ -737,7 +2148,7 @@ export class PostgresRunStore implements RunStore { client: ReadClient | undefined ): { args: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude }; - prisma: ReadClient; + prisma: ReadClient | RunOpsCapableClient; } { const isProjection = typeof argsOrClient === "object" && diff --git a/internal-packages/run-store/src/PostgresRunStore.writeAtomicity.test.ts b/internal-packages/run-store/src/PostgresRunStore.writeAtomicity.test.ts new file mode 100644 index 00000000000..2d43fc90830 --- /dev/null +++ b/internal-packages/run-store/src/PostgresRunStore.writeAtomicity.test.ts @@ -0,0 +1,314 @@ +// Cross-DB WRITE ATOMICITY against the REAL dedicated split topology. +// +// Under the run-ops split, several engine operations that were atomic-by-`prisma.$transaction` in +// single-DB make TWO distinct RunStore writes (e.g. startAttempt + createExecutionSnapshot, or +// promotePendingVersionRuns + createExecutionSnapshot). When the run is ksuid (#new), `RoutingRunStore` +// routes each write to the NEW store but DROPS the caller's control-plane `tx` — so the two writes +// execute as independent auto-commit statements on the NEW DB, OUTSIDE any shared transaction. A crash +// between them leaves partial state (a run EXECUTING with no matching snapshot; promoted-but-no-snapshot). +// +// `heteroRunOpsPostgresTest` gives the REAL production split: prisma17 = a real `RunOpsPrismaClient` +// over the @internal/run-ops-database SUBSET schema (#new), prisma14 = the full control-plane schema on +// a SEPARATE physical PG container (#legacy). No mocks. 
//
// The first test EMPIRICALLY DEMONSTRATES the regression (two un-wrapped routed writes persist partial
// state on a mid-pair failure). The remaining tests prove `RoutingRunStore.runInTransaction(runId, fn)`
// wraps the co-resident multi-write unit in ONE `#new` transaction so a failure between the two writes
// rolls BOTH back — no partial state.

import { heteroRunOpsPostgresTest } from "@internal/testcontainers";
import type { PrismaClient } from "@trigger.dev/database";
import type { RunOpsPrismaClient } from "@internal/run-ops-database";
import { describe, expect } from "vitest";
import { PostgresRunStore } from "./PostgresRunStore.js";
import { RoutingRunStore } from "./runOpsStore.js";
import type { CreateRunInput, RunStore, RunStoreSchemaVariant } from "./types.js";

type AnyClient = PrismaClient | RunOpsPrismaClient;

// ownerEngine classifies by internal-id LENGTH: 25 chars → cuid → LEGACY, 27 chars → ksuid → NEW.
const CUID_25 = "c".repeat(25); // → LEGACY (#legacy / control-plane DB, full schema)
const KSUID_27 = "k".repeat(27); // → NEW (#new / dedicated run-ops DB, subset schema)

/**
 * Produce the organization/project/environment a test run hangs off.
 * Legacy: the three rows are created through `prisma`. Dedicated: nothing is written and
 * synthetic `{ id }` stubs derived from `suffix` are returned.
 */
async function seedEnvironment(
  prisma: AnyClient,
  schemaVariant: RunStoreSchemaVariant,
  suffix: string
) {
  if (schemaVariant !== "dedicated") {
    const controlPlane = prisma as PrismaClient;

    const organization = await controlPlane.organization.create({
      data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
    });
    const project = await controlPlane.project.create({
      data: {
        name: `Project ${suffix}`,
        slug: `project-${suffix}`,
        externalRef: `proj_${suffix}`,
        organizationId: organization.id,
      },
    });
    const environment = await controlPlane.runtimeEnvironment.create({
      data: {
        type: "DEVELOPMENT",
        slug: "dev",
        projectId: project.id,
        organizationId: organization.id,
        apiKey: `tr_dev_${suffix}`,
        pkApiKey: `pk_dev_${suffix}`,
        shortcode: `short_${suffix}`,
      },
    });

    return { organization, project, environment };
  }

  return {
    organization: { id: `org_${suffix}` },
    project: { id: `proj_${suffix}` },
    environment: { id: `env_${suffix}` },
  };
}

/** Build a fixed V2 `PENDING` run plus its initial `RUN_CREATED` snapshot for the given ids. */
function buildCreateRunInput(params: {
  runId: string;
  friendlyId: string;
  organizationId: string;
  projectId: string;
  runtimeEnvironmentId: string;
}): CreateRunInput {
  const { runId, friendlyId, organizationId, projectId, runtimeEnvironmentId } = params;

  return {
    data: {
      id: runId,
      engine: "V2",
      status: "PENDING",
      friendlyId,
      runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      organizationId,
      projectId,
      taskIdentifier: "my-task",
      payload: '{"hello":"world"}',
      payloadType: "application/json",
      traceContext: { trace: "ctx" },
      traceId: `trace_${runId}`,
      spanId: `span_${runId}`,
      runTags: [],
      queue: "task/my-task",
      isTest: false,
      taskEventStore: "taskEvent",
      depth: 0,
      createdAt: new Date("2024-01-01T00:00:00.000Z"),
    },
    snapshot: {
      engine: "V2",
      executionStatus: "RUN_CREATED",
      description: "Run was created",
      runStatus: "PENDING",
      environmentId: runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      projectId,
      organizationId,
    },
  };
}

/** Store over the run-ops client, used for both reads and writes, on the "dedicated" variant. */
function makeDedicatedStore(prisma17: RunOpsPrismaClient) {
  const client = prisma17 as never;
  return new PostgresRunStore({
    prisma: client,
    readOnlyPrisma: client,
    schemaVariant: "dedicated",
  });
}

/** Store over the full-schema client, used for both reads and writes, on the "legacy" variant. */
function makeLegacyStore(prisma14: PrismaClient) {
  const client = prisma14;
  return new PostgresRunStore({
    prisma: client,
    readOnlyPrisma: client,
    schemaVariant: "legacy",
  });
}

/** Wire one legacy and one dedicated store into a `RoutingRunStore`; all three are returned. */
function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) {
  const legacyStore = makeLegacyStore(prisma14);
  const newStore = makeDedicatedStore(prisma17);
  const router = new RoutingRunStore({ new: newStore, legacy: legacyStore });
  return { router, legacyStore, newStore };
}

// Seed a ksuid run on #new (its create nests the initial RUN_CREATED snapshot) and
return its ids. +async function seedKsuidRun( + router: RunStore, + prisma17: RunOpsPrismaClient, + suffix: string +): Promise<{ runId: string; env: { project: { id: string }; environment: { id: string } } }> { + const env = await seedEnvironment(prisma17, "dedicated", suffix); + const runId = `run_${KSUID_27}`; + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_${suffix}`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + }) + ); + return { runId, env }; +} + +const ATTEMPT_SELECT = { id: true, status: true, attemptNumber: true } as const; + +function snapshotInput( + runId: string, + env: { project: { id: string }; environment: { id: string } } +) { + return { + run: { id: runId, status: "EXECUTING" as const, attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING" as const, description: "Attempt created, starting" }, + environmentId: env.environment.id, + environmentType: "DEVELOPMENT" as const, + projectId: env.project.id, + organizationId: env.project.id, + }; +} + +describe("cross-DB write atomicity (startAttempt + createExecutionSnapshot)", () => { + // --------------------------------------------------------------------------------------------- + // RED demonstration: the BROKEN behaviour. Two separate routed writes (as the engine made them + // before the fix) on a ksuid run leave PARTIAL state on a mid-pair failure — the run is EXECUTING + // but no EXECUTING snapshot exists. This is the regression vs single-DB. 
+ // --------------------------------------------------------------------------------------------- + heteroRunOpsPostgresTest( + "BROKEN baseline: two un-wrapped routed writes persist partial state on a mid-pair failure", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const { runId, env } = await seedKsuidRun(router, prisma17, "broken_atomic"); + + // Simulate the OLD engine pattern: startAttempt then a failure BEFORE createExecutionSnapshot, + // each as an independent routed (auto-commit) write — no shared transaction. + await expect( + (async () => { + await router.startAttempt( + runId, + { attemptNumber: 1, executedAt: new Date(), isWarmStart: false }, + { select: ATTEMPT_SELECT } + ); + throw new Error("boom between writes"); + // eslint-disable-next-line no-unreachable + await router.createExecutionSnapshot(snapshotInput(runId, env)); + })() + ).rejects.toThrow("boom between writes"); + + // The first write was auto-committed: the run is EXECUTING but there is NO EXECUTING snapshot. + const run = await prisma17.taskRun.findFirstOrThrow({ where: { id: runId } }); + expect(run.status).toBe("EXECUTING"); // partial state PERSISTED — the bug + const execSnap = await prisma17.taskRunExecutionSnapshot.findFirst({ + where: { runId, executionStatus: "EXECUTING" }, + }); + expect(execSnap).toBeNull(); // no snapshot → run executing without a snapshot + } + ); + + // --------------------------------------------------------------------------------------------- + // FIX: runInTransaction wraps the co-resident multi-write unit in ONE #new transaction. A failure + // BETWEEN the two writes rolls the FIRST write back — no partial state. 
+ // --------------------------------------------------------------------------------------------- + heteroRunOpsPostgresTest( + "runInTransaction rolls back startAttempt when a failure is injected before the snapshot write (ksuid → #new)", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const { runId, env } = await seedKsuidRun(router, prisma17, "rollback_new"); + + await expect( + router.runInTransaction(runId, async (store, tx) => { + await store.startAttempt( + runId, + { attemptNumber: 1, executedAt: new Date(), isWarmStart: false }, + { select: ATTEMPT_SELECT }, + tx + ); + // Inject the failure AFTER the first write, BEFORE the snapshot write. + throw new Error("boom between writes"); + // eslint-disable-next-line no-unreachable + await store.createExecutionSnapshot(snapshotInput(runId, env), tx); + }) + ).rejects.toThrow("boom between writes"); + + // Both writes rolled back: run is still PENDING and no EXECUTING snapshot exists. 
+ const run = await prisma17.taskRun.findFirstOrThrow({ where: { id: runId } }); + expect(run.status).toBe("PENDING"); + expect(run.attemptNumber).toBeNull(); + const execSnap = await prisma17.taskRunExecutionSnapshot.findFirst({ + where: { runId, executionStatus: "EXECUTING" }, + }); + expect(execSnap).toBeNull(); + } + ); + + heteroRunOpsPostgresTest( + "runInTransaction commits BOTH writes atomically on success (ksuid → #new)", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const { runId, env } = await seedKsuidRun(router, prisma17, "commit_new"); + + const result = await router.runInTransaction(runId, async (store, tx) => { + const run = await store.startAttempt( + runId, + { attemptNumber: 1, executedAt: new Date(), isWarmStart: false }, + { select: ATTEMPT_SELECT }, + tx + ); + const snapshot = await store.createExecutionSnapshot(snapshotInput(runId, env), tx); + return { run, snapshot }; + }); + + expect(result.run.status).toBe("EXECUTING"); + expect(result.snapshot.executionStatus).toBe("EXECUTING"); + + // Both persisted on #new. + const run = await prisma17.taskRun.findFirstOrThrow({ where: { id: runId } }); + expect(run.status).toBe("EXECUTING"); + expect(run.attemptNumber).toBe(1); + const execSnap = await prisma17.taskRunExecutionSnapshot.findFirst({ + where: { runId, executionStatus: "EXECUTING" }, + }); + expect(execSnap).not.toBeNull(); + } + ); + + // The same atomic guarantee for a cuid run on #legacy — the owning store is #legacy and the inner + // writes share its transaction. 
+ heteroRunOpsPostgresTest( + "runInTransaction rolls back BOTH writes on a cuid run (#legacy)", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma14, "legacy", "rollback_leg"); + const runId = `run_${CUID_25}`; + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_rollback_leg`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + }) + ); + + await expect( + router.runInTransaction(runId, async (store, tx) => { + await store.startAttempt( + runId, + { attemptNumber: 1, executedAt: new Date(), isWarmStart: false }, + { select: ATTEMPT_SELECT }, + tx + ); + throw new Error("boom between writes"); + }) + ).rejects.toThrow("boom between writes"); + + const run = await prisma14.taskRun.findFirstOrThrow({ where: { id: runId } }); + expect(run.status).toBe("PENDING"); + expect(run.attemptNumber).toBeNull(); + } + ); +}); diff --git a/internal-packages/run-store/src/batchCompletionResidency.test.ts b/internal-packages/run-store/src/batchCompletionResidency.test.ts new file mode 100644 index 00000000000..dc9730b74d3 --- /dev/null +++ b/internal-packages/run-store/src/batchCompletionResidency.test.ts @@ -0,0 +1,287 @@ +// REGRESSION suite for the run-ops split "control-plane tx/client forwarded into a NEW-resident +// store" bug class on the BatchTaskRun write/probe path. When the router resolves the owning store +// to #new but forwards the caller's control-plane handle, #new issues its statement against the +// CONTROL-PLANE DB where the ksuid row does not exist → "No record was found" (update), wrong-DB row +// (create), or wrong count. Covers updateBatchTaskRun (commit 62ae880af), createBatchTaskRun and +// countBatchTaskRunItems (this sweep). `heteroRunOpsPostgresTest` is the REAL two-DB split topology +// (prisma17 = dedicated #new, prisma14 = legacy #legacy); NEVER mocked. 

import { heteroRunOpsPostgresTest } from "@internal/testcontainers";
import type { PrismaClient } from "@trigger.dev/database";
import type { RunOpsPrismaClient } from "@internal/run-ops-database";
import { describe, expect } from "vitest";
import { PostgresRunStore } from "./PostgresRunStore.js";
import { RoutingRunStore } from "./runOpsStore.js";
import type { RunStoreSchemaVariant } from "./types.js";

type AnyClient = PrismaClient | RunOpsPrismaClient;

// ownerEngine classifies by internal-id LENGTH (runOpsResidency.ts): 25 chars → cuid → LEGACY,
// 27 chars → ksuid → NEW.
const CUID_25 = "c".repeat(25); // → LEGACY (#legacy / prisma14, full schema)
const KSUID_27 = "k".repeat(27); // → NEW (#new / prisma17, dedicated subset schema)

/**
 * Produce the organization/project/environment a test batch hangs off.
 * Legacy: the three rows are created through `prisma`. Dedicated: nothing is written and
 * synthetic `{ id }` stubs derived from `suffix` are returned.
 */
async function seedEnvironment(
  prisma: AnyClient,
  schemaVariant: RunStoreSchemaVariant,
  suffix: string
) {
  if (schemaVariant !== "dedicated") {
    const controlPlane = prisma as PrismaClient;

    const organization = await controlPlane.organization.create({
      data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
    });
    const project = await controlPlane.project.create({
      data: {
        name: `Project ${suffix}`,
        slug: `project-${suffix}`,
        externalRef: `proj_${suffix}`,
        organizationId: organization.id,
      },
    });
    const environment = await controlPlane.runtimeEnvironment.create({
      data: {
        type: "DEVELOPMENT",
        slug: "dev",
        projectId: project.id,
        organizationId: organization.id,
        apiKey: `tr_dev_${suffix}`,
        pkApiKey: `pk_dev_${suffix}`,
        shortcode: `short_${suffix}`,
      },
    });

    return { organization, project, environment };
  }

  return {
    organization: { id: `org_${suffix}` },
    project: { id: `proj_${suffix}` },
    environment: { id: `env_${suffix}` },
  };
}

// BatchTaskRunItem.taskRunId has an FK into TaskRun on the dedicated schema, so seed the referenced
// run before creating an item that points at it.
+/** Inserts one PENDING V2 TaskRun row (id = runId, runtimeEnvironmentId = envId, friendlyId = `run_${runId}`) through the given run-ops client; organizationId and projectId are fixed placeholder strings. */ async function seedDedicatedRun(prisma17: RunOpsPrismaClient, envId: string, runId: string) { + await prisma17.taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: `run_${runId}`, + runtimeEnvironmentId: envId, + environmentType: "DEVELOPMENT", + organizationId: "org_cntitems_new", + projectId: "proj_cntitems_new", + taskIdentifier: "batch-task", + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: `t_${runId}`, + spanId: `s_${runId}`, + queue: "task/batch-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + }, + }); +} + +/** Builds a "dedicated"-variant PostgresRunStore that uses the run-ops client as both prisma and readOnlyPrisma (cast `as never` to fit the constructor's parameter type). */ function makeDedicatedStore(prisma17: RunOpsPrismaClient) { + return new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); +} + +/** Builds a "legacy"-variant PostgresRunStore that uses the same client as both prisma and readOnlyPrisma. */ function makeLegacyStore(prisma14: PrismaClient) { + return new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); +} + +// Real production split topology: #new = dedicated subset on prisma17, #legacy = full schema on +// prisma14 — two physically distinct DBs. +function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const legacyStore = makeLegacyStore(prisma14); + const newStore = makeDedicatedStore(prisma17); + return { + router: new RoutingRunStore({ new: newStore, legacy: legacyStore }), + legacyStore, + newStore, + }; +} + +describe("run-ops split — BatchTaskRun writes/probes must NOT forward the control-plane tx/client into NEW", () => { + // =========================================================================================== + // updateBatchTaskRun (commit 62ae880af) — the batch-completion residency regression. + // =========================================================================================== + + // The live `batchSystem.#tryCompleteBatch` shape: a ksuid batch on #new is updated to COMPLETED + // while the control-plane client is passed as `tx`. 
RED on the pre-62ae880af code (the router + // forwarded tx → #new ran the UPDATE on the control-plane DB → "No record was found for an + // update"); GREEN now (tx dropped for NEW → the row updates on #new's own DB). + heteroRunOpsPostgresTest( + "updateBatchTaskRun marks a ksuid batch on #new COMPLETED even when the control-plane client is passed as tx", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "updbatch_new"); + const batchId = `batch_${KSUID_27}`; // ksuid → #new + + await prisma17.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_upd_new", + runtimeEnvironmentId: env.environment.id, + runCount: 2, + status: "PROCESSING", + }, + }); + + // Pass the LEGACY (control-plane) client as `tx`, EXACTLY as #tryCompleteBatch does. + const updated = await router.updateBatchTaskRun( + { where: { id: batchId }, data: { status: "COMPLETED" }, select: { id: true } }, + prisma14 as never + ); + expect(updated.id).toBe(batchId); + + // The row on #new (its own DB) is genuinely COMPLETED — not a phantom update on the wrong DB. + const onNew = await prisma17.batchTaskRun.findUnique({ where: { id: batchId } }); + expect(onNew?.status).toBe("COMPLETED"); + } + ); + + // Control: a cuid batch on #legacy still updates through the router when the same (legacy) client + // is passed as tx — the tx IS forwarded for LEGACY (same physical DB), so atomicity is preserved.
+ heteroRunOpsPostgresTest( + "updateBatchTaskRun control: a cuid batch on #legacy still updates with the control-plane tx forwarded", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma14, "legacy", "updbatch_leg"); + const batchId = `batch_${CUID_25}`; // cuid → #legacy + + await prisma14.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_upd_leg", + runtimeEnvironmentId: env.environment.id, + runCount: 1, + status: "PROCESSING", + }, + }); + + const updated = await router.updateBatchTaskRun( + { where: { id: batchId }, data: { status: "COMPLETED" }, select: { id: true } }, + prisma14 as never + ); + expect(updated.id).toBe(batchId); + const onLegacy = await prisma14.batchTaskRun.findUnique({ where: { id: batchId } }); + expect(onLegacy?.status).toBe("COMPLETED"); + } + ); + + // =========================================================================================== + // createBatchTaskRun (this sweep) — same anti-pattern on the create path. + // =========================================================================================== + + // A ksuid batch routed to #new with a forwarded control-plane tx must still be created on #new's + // OWN DB, not the control-plane DB (which would strand the batch away from its co-resident child + // runs/items). Forwarding tx unconditionally would land the row on prisma14.
+ heteroRunOpsPostgresTest( + "createBatchTaskRun lands a ksuid batch on #new even when the control-plane client is passed as tx", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "crbatch_new"); + const batchId = `batch_${KSUID_27}`; // ksuid → #new + + const created = await router.createBatchTaskRun( + { + id: batchId, + friendlyId: "batch_cr_new", + runtimeEnvironmentId: env.environment.id, + runCount: 1, + }, + prisma14 as never // control-plane tx + ); + expect(created.id).toBe(batchId); + + // Resident on #new (its own DB), absent from #legacy — co-located with its ksuid child runs. + const onNew = await prisma17.batchTaskRun.findUnique({ where: { id: batchId } }); + expect(onNew).not.toBeNull(); + const onLegacy = await prisma14.batchTaskRun.findUnique({ where: { id: batchId } }); + expect(onLegacy).toBeNull(); + } + ); + + // Control: a cuid batch is created on #legacy with the same control-plane tx forwarded (same DB).
+ heteroRunOpsPostgresTest( + "createBatchTaskRun control: a cuid batch lands on #legacy with the control-plane tx forwarded", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma14, "legacy", "crbatch_leg"); + const batchId = `batch_${CUID_25}`; // cuid → #legacy + + const created = await router.createBatchTaskRun( + { + id: batchId, + friendlyId: "batch_cr_leg", + runtimeEnvironmentId: env.environment.id, + runCount: 1, + }, + prisma14 as never + ); + expect(created.id).toBe(batchId); + const onLegacy = await prisma14.batchTaskRun.findUnique({ where: { id: batchId } }); + expect(onLegacy).not.toBeNull(); + const onNew = await prisma17.batchTaskRun.findUnique({ where: { id: batchId } }); + expect(onNew).toBeNull(); + } + ); + + // =========================================================================================== + // countBatchTaskRunItems (this sweep) — same anti-pattern on a routed probe read. + // =========================================================================================== + + // A ksuid batch's items live on #new; counting them with the control-plane client forwarded would + // count on the wrong DB (→ 0). The routed store must read its OWN DB and return the real count.
+ heteroRunOpsPostgresTest( + "countBatchTaskRunItems counts a ksuid batch's items on #new even when the control-plane client is passed", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "cntitems_new"); + const batchId = `batch_${KSUID_27}`; // ksuid → #new + + await prisma17.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_cnt_new", + runtimeEnvironmentId: env.environment.id, + runCount: 2, + status: "PROCESSING", + }, + }); + const runA = `run_${KSUID_27.slice(0, -3)}cra`; + const runB = `run_${KSUID_27.slice(0, -3)}crb`; + await seedDedicatedRun(prisma17, env.environment.id, runA); + await seedDedicatedRun(prisma17, env.environment.id, runB); + await prisma17.batchTaskRunItem.create({ + data: { batchTaskRunId: batchId, taskRunId: runA, status: "COMPLETED" }, + }); + await prisma17.batchTaskRunItem.create({ + data: { batchTaskRunId: batchId, taskRunId: runB, status: "PENDING" }, + }); + + // Pass the LEGACY (control-plane) client; the routed #new store must ignore it and read its own DB. + expect( + await router.countBatchTaskRunItems({ batchTaskRunId: batchId }, prisma14 as never) + ).toBe(2); + expect( + await router.countBatchTaskRunItems( + { batchTaskRunId: batchId, status: "COMPLETED" }, + prisma14 as never + ) + ).toBe(1); + } + ); +}); diff --git a/internal-packages/run-store/src/clientCompat.test.ts b/internal-packages/run-store/src/clientCompat.test.ts new file mode 100644 index 00000000000..934d9fe8881 --- /dev/null +++ b/internal-packages/run-store/src/clientCompat.test.ts @@ -0,0 +1,17 @@ +// A TYPE-LEVEL test: it must COMPILE. It proves a RunOpsPrismaClient can back a +// PostgresRunStore that satisfies the RunStore interface, alongside the legacy client.
+import { expectTypeOf, it } from "vitest"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import type { RunStore } from "./types.js"; + +it("both clients can back a RunStore", () => { + // These are type-only assertions against RunStore (via the explicit type argument); no runtime DB needed. + const legacy = null as unknown as PrismaClient; + const dedicated = null as unknown as RunOpsPrismaClient; + const a: RunStore = new PostgresRunStore({ prisma: legacy, readOnlyPrisma: legacy }); + const b: RunStore = new PostgresRunStore({ prisma: dedicated, readOnlyPrisma: dedicated }); + expectTypeOf(a).toMatchTypeOf<RunStore>(); + expectTypeOf(b).toMatchTypeOf<RunStore>(); +}); diff --git a/internal-packages/run-store/src/index.ts b/internal-packages/run-store/src/index.ts index de9f7620d7c..81f88ed6d10 100644 --- a/internal-packages/run-store/src/index.ts +++ b/internal-packages/run-store/src/index.ts @@ -1,3 +1,3 @@ export * from "./types.js"; export * from "./PostgresRunStore.js"; -export * from "./NoopRunStore.js"; +export * from "./runOpsStore.js"; diff --git a/internal-packages/run-store/src/runOpsStore.flipWindowDuplicate.test.ts b/internal-packages/run-store/src/runOpsStore.flipWindowDuplicate.test.ts new file mode 100644 index 00000000000..7ee52506656 --- /dev/null +++ b/internal-packages/run-store/src/runOpsStore.flipWindowDuplicate.test.ts @@ -0,0 +1,183 @@ +// DOCUMENTS the bounded concurrent-flip-window cross-DB idempotency +// duplicate. This is a known, bounded (<=mintCache TTL, 30s default) edge, NOT a closed gap. +// +// During the flip window, two CONCURRENT same-(env, idempotencyKey, taskIdentifier) ROOT triggers can +// land on instances with DIVERGENT cached mint-kinds: the stale instance mints a cuid run on #legacy, +// the flipped instance a ksuid run on #new.
The dedup probe (probe-before-mint) only catches an +// ALREADY-COMMITTED run; two truly-simultaneous mints both miss, then both create. The per-DB unique +// constraint on (runtimeEnvironmentId, idempotencyKey, taskIdentifier) is PER PHYSICAL DB, so it +// cannot reject the second insert that lands on the OTHER DB. This test proves both creates SUCCEED +// (the duplicate is real) and the NEW-first read fan-out collapses subsequent reads to one run +// (the duplicate is bounded — see the cross-DB dedup tie-break test). A cross-DB write guard is +// intentionally not added here; that is a deliberate policy decision left to the operator. + +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { describe, expect } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import { RoutingRunStore } from "./runOpsStore.js"; +import type { CreateRunInput, RunStoreSchemaVariant } from "./types.js"; + +type AnyClient = PrismaClient | RunOpsPrismaClient; + +// ownerEngine classifies by internal-id length (no internal underscore): 25 -> cuid -> LEGACY, +// 27 -> ksuid -> NEW. 
+const cuidLegacy = (seed: string) => (seed + "c".repeat(25)).slice(0, 25); +const ksuidNew = (seed: string) => (seed + "k".repeat(27)).slice(0, 27); + +/** Returns the owning ids for a test environment: synthetic org_/proj_/env_ ids when schemaVariant is "dedicated", otherwise real Organization, Project and RuntimeEnvironment rows created through the given client. */ async function seedEnvironment( + prisma: AnyClient, + schemaVariant: RunStoreSchemaVariant, + suffix: string +) { + if (schemaVariant === "dedicated") { + return { + organization: { id: `org_${suffix}` }, + project: { id: `proj_${suffix}` }, + environment: { id: `env_${suffix}` }, + }; + } + const organization = await (prisma as PrismaClient).organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await (prisma as PrismaClient).project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await (prisma as PrismaClient).runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${suffix}`, + pkApiKey: `pk_dev_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +/** Builds a CreateRunInput for a PENDING V2 run plus its RUN_CREATED snapshot; idempotencyKeyExpiresAt is set 24 hours ahead of Date.now(). */ function buildCreateRunInput(params: { + runId: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; + idempotencyKey: string; + taskIdentifier: string; +}): CreateRunInput { + return { + data: { + id: params.runId, + engine: "V2", + status: "PENDING", + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT", + organizationId: params.organizationId, + projectId: params.projectId, + idempotencyKey: params.idempotencyKey, + idempotencyKeyExpiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000), + taskIdentifier: params.taskIdentifier, + payload: '{"hello":"world"}', + payloadType: "application/json", + context: { foo: "bar" }, + traceContext: { trace: "ctx" }, + traceId: `trace_${params.runId}`, + spanId: `span_${params.runId}`,
+ runTags: [], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT", + projectId: params.projectId, + organizationId: params.organizationId, + }, + }; +} + +/** Wires a RoutingRunStore over a "legacy"-variant store on prisma14 and a "dedicated"-variant store on prisma17. */ function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const legacyStore = new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); + const newStore = new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); + return new RoutingRunStore({ new: newStore, legacy: legacyStore }); +} + +describe("RoutingRunStore — mint-on-flip bounded concurrent-window cross-DB duplicate (DOCUMENTED, not guarded)", () => { + heteroRunOpsPostgresTest( + "two divergent-cache root mints of the SAME (env, key) BOTH succeed, landing one-per-DB (per-DB unique cannot catch it)", + async ({ prisma14, prisma17 }) => { + const router = makeSplitRouter(prisma14, prisma17); + // One logical environment shared across both physical DBs (same scalar envId on each). + const seed = await seedEnvironment(prisma14, "legacy", "flipwin"); + const environmentId = seed.environment.id; + const idempotencyKey = "flip-window-key"; + const taskIdentifier = "my-task"; + + const staleCuidRunId = cuidLegacy("rfl"); // stale instance mints cuid -> #legacy + const flippedKsuidRunId = ksuidNew("rfn"); // flipped instance mints ksuid -> #new + + // Both concurrent mints commit. The second does NOT throw a unique violation: the constraint is + // PER-DB and these land on different physical DBs.
+ await router.createRun( + buildCreateRunInput({ + runId: staleCuidRunId, + friendlyId: "run_flip_legacy", + organizationId: seed.organization.id, + projectId: seed.project.id, + runtimeEnvironmentId: environmentId, + idempotencyKey, + taskIdentifier, + }) + ); + await router.createRun( + buildCreateRunInput({ + runId: flippedKsuidRunId, + friendlyId: "run_flip_new", + organizationId: seed.organization.id, + projectId: seed.project.id, + runtimeEnvironmentId: environmentId, + idempotencyKey, + taskIdentifier, + }) + ); + + // The duplicate is REAL: a row for the same key physically exists on BOTH DBs. + expect(await prisma14.taskRun.findFirst({ where: { id: staleCuidRunId } })).not.toBeNull(); + expect(await prisma17.taskRun.findFirst({ where: { id: flippedKsuidRunId } })).not.toBeNull(); + + // The duplicate is BOUNDED: subsequent reads via the id-less probe collapse to exactly ONE run, + // deterministically the NEW one (NEW-first fan-out) — the same tie-break the cross-DB dedup + // read locks. So at most one of the two divergent mints is observable after the window closes.
+ const found = (await router.findRun({ + runtimeEnvironmentId: environmentId, + idempotencyKey, + taskIdentifier, + })) as Record<string, unknown> | null; + expect(found).not.toBeNull(); + expect(found!.id).toBe(flippedKsuidRunId); + } + ); +}); diff --git a/internal-packages/run-store/src/runOpsStore.forWaitpointCompletion.test.ts b/internal-packages/run-store/src/runOpsStore.forWaitpointCompletion.test.ts new file mode 100644 index 00000000000..f8a4b6fc153 --- /dev/null +++ b/internal-packages/run-store/src/runOpsStore.forWaitpointCompletion.test.ts @@ -0,0 +1,109 @@ +import { describe, expect, it } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import { RoutingRunStore } from "./runOpsStore.js"; +import type { RunStore } from "./types.js"; + +// forWaitpointCompletion is async: it picks a preferred store from the id-shape + pins, then +// PROBES findWaitpoint to resolve where the token ACTUALLY lives (drain can relocate a cuid +// waitpoint onto NEW, or a ksuid token can be pinned LEGACY), falling back to the other store. +// So the slots here are fakes whose only behaviour is "do I hold this waitpoint id?". +function fakeStore(slot: string, heldIds: Set<string>): RunStore { + return { + __slot: slot, + async findWaitpoint(args: { where?: { id?: string } }) { + const id = args.where?.id; + return id !== undefined && heldIds.has(id) ? ({ id } as never) : null; + }, + } as unknown as RunStore; +} + +const KSUID_ID = "waitpoint_" + "a".repeat(27); +const CUID_ID = "waitpoint_" + "a".repeat(25); +const UNCLASSIFIABLE_ID = "waitpoint_" + "a".repeat(26); + +// Both stores hold the id under test unless a case overrides, so the resolver returns the +// preferred store and the assertion is purely about the preference rule.
+/** Builds a RoutingRunStore over two fake stores and returns it with both fakes; each fake holds the ids in opts.newHolds / opts.legacyHolds, defaulting to all three test ids. */ function buildRouter(opts?: { newHolds?: string[]; legacyHolds?: string[] }): { + router: RoutingRunStore; + newStore: RunStore; + legacyStore: RunStore; +} { + const all = [KSUID_ID, CUID_ID, UNCLASSIFIABLE_ID]; + const newStore = fakeStore("new", new Set(opts?.newHolds ?? all)); + const legacyStore = fakeStore("legacy", new Set(opts?.legacyHolds ?? all)); + return { + router: new RoutingRunStore({ new: newStore, legacy: legacyStore }), + newStore, + legacyStore, + }; +} + +describe("RoutingRunStore.forWaitpointCompletion", () => { + it("resolves a ksuid waitpointId with no pins to the NEW slot", async () => { + const { router, newStore } = buildRouter(); + expect(await router.forWaitpointCompletion(KSUID_ID, { routeKind: "MANUAL" })).toBe(newStore); + }); + + it("resolves a cuid waitpointId with no pins to the LEGACY slot", async () => { + const { router, legacyStore } = buildRouter(); + expect(await router.forWaitpointCompletion(CUID_ID, { routeKind: "MANUAL" })).toBe(legacyStore); + }); + + it("pins to LEGACY when isCrossTreeIdempotency is true, even for a ksuid id", async () => { + const { router, legacyStore } = buildRouter(); + expect( + await router.forWaitpointCompletion(KSUID_ID, { + routeKind: "IDEMPOTENCY_REUSE", + isCrossTreeIdempotency: true, + }) + ).toBe(legacyStore); + }); + + it("pins to LEGACY when treeOwnerResidency is LEGACY, even for a ksuid id", async () => { + const { router, legacyStore } = buildRouter(); + expect( + await router.forWaitpointCompletion(KSUID_ID, { + routeKind: "MANUAL", + treeOwnerResidency: "LEGACY", + }) + ).toBe(legacyStore); + }); + + it("pins to LEGACY when hasLegacyParent is true, even for a ksuid id", async () => { + const { router, legacyStore } = buildRouter(); + expect( + await router.forWaitpointCompletion(KSUID_ID, { + routeKind: "RUN", + hasLegacyParent: true, + }) + ).toBe(legacyStore); + }); + + it("falls back to the OTHER store when the preferred store does not hold the token", async () => { + // ksuid id
prefers NEW, but the token actually lives on LEGACY (drain/relocation): the + // probe must fall through to LEGACY rather than route by id-shape alone and miss it. + const { router, legacyStore } = buildRouter({ newHolds: [], legacyHolds: [KSUID_ID] }); + expect(await router.forWaitpointCompletion(KSUID_ID, { routeKind: "MANUAL" })).toBe( + legacyStore + ); + }); + + it("resolves an unclassifiable id to LEGACY-preferred (never throws)", async () => { + // #classifySafe treats an unclassifiable id as LEGACY; with both stores empty the preferred + // (LEGACY) is returned. The completion path must not blow up on an id whose 26-char body is neither cuid (25) nor ksuid (27) length. + const { router, legacyStore } = buildRouter({ newHolds: [], legacyHolds: [] }); + expect(await router.forWaitpointCompletion(UNCLASSIFIABLE_ID, { routeKind: "MANUAL" })).toBe( + legacyStore + ); + }); +}); + +describe("PostgresRunStore.forWaitpointCompletion", () => { + it("returns the same store instance without classifying, even for an unclassifiable id", async () => { + // No prisma client touched: forWaitpointCompletion is a pure `return this`. + const store = new PostgresRunStore({} as never); + expect(await store.forWaitpointCompletion(UNCLASSIFIABLE_ID, { routeKind: "MANUAL" })).toBe( + store + ); + }); +}); diff --git a/internal-packages/run-store/src/runOpsStore.idempotencyDedup.test.ts b/internal-packages/run-store/src/runOpsStore.idempotencyDedup.test.ts new file mode 100644 index 00000000000..8e523660b72 --- /dev/null +++ b/internal-packages/run-store/src/runOpsStore.idempotencyDedup.test.ts @@ -0,0 +1,456 @@ +// Idempotency cross-DB dedup LOCK against the REAL two-physical-DB split. +// +// The trigger hot path dedupes before minting via the id-less probe +// `runStore.findRun({ runtimeEnvironmentId, idempotencyKey, taskIdentifier }, +// { include: { associatedWaitpoint: true } }, dedupClient)` +// (apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts).
The existing run may live +// on EITHER physical DB (a cuid run on #legacy minted before the org flipped to ksuid; a ksuid run on +// #new after). The PG unique key is PER-DB and cannot enforce cross-DB uniqueness, so dedup must be +// correct at the routing layer. RoutingRunStore.findRun drops the caller +// dedupClient and, for an id-less where, fans out NEW→LEGACY (#findRunUnrouted). +// Highest risk: `associatedWaitpoint` hydration — the scalar-only #new store strips the relation and +// rehydrates from Waitpoint.completedByTaskRunId, whereas #legacy uses the Prisma include; the andWait +// idempotent hit reads existingRun.associatedWaitpoint. + +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { describe, expect } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import { RoutingRunStore } from "./runOpsStore.js"; +import type { + CreateRunInput, + RunAssociatedWaitpointInput, + RunStoreSchemaVariant, +} from "./types.js"; + +type AnyClient = PrismaClient | RunOpsPrismaClient; + +// ownerEngine classifies by internal-id LENGTH after stripping a single leading `_`: +// 25 chars → cuid → LEGACY, 27 chars → ksuid → NEW. So a classifiable id +// must carry NO internal underscore. These mint a distinct id of the right length from a short seed. +function cuidLegacy(seed: string): string { + return (seed + "c".repeat(25)).slice(0, 25); // 25 chars, no underscore → LEGACY; pads with "c" and truncates, so seeds differing only by trailing "c" collide +} +function ksuidNew(seed: string): string { + return (seed + "k".repeat(27)).slice(0, 27); // 27 chars, no underscore → NEW; pads with "k" and truncates, so seeds differing only by trailing "k" collide +} + +// On the dedicated subset there are no Organization/Project/RuntimeEnvironment models (the run-ops +// rows carry FK-free scalar ids), so we mint synthetic owning ids. On legacy we seed the real rows +// the kept FKs require.
+/** Returns the owning ids for a test environment: synthetic org_/proj_/env_ ids when schemaVariant is "dedicated", otherwise real Organization, Project and RuntimeEnvironment rows created through the given client. */ async function seedEnvironment( + prisma: AnyClient, + schemaVariant: RunStoreSchemaVariant, + suffix: string +) { + if (schemaVariant === "dedicated") { + return { + organization: { id: `org_${suffix}` }, + project: { id: `proj_${suffix}` }, + environment: { id: `env_${suffix}` }, + }; + } + const organization = await (prisma as PrismaClient).organization.create({ + data: { title: `Org ${suffix}`, slug: `org-${suffix}` }, + }); + const project = await (prisma as PrismaClient).project.create({ + data: { + name: `Project ${suffix}`, + slug: `project-${suffix}`, + externalRef: `proj_${suffix}`, + organizationId: organization.id, + }, + }); + const environment = await (prisma as PrismaClient).runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${suffix}`, + pkApiKey: `pk_dev_${suffix}`, + shortcode: `short_${suffix}`, + }, + }); + return { organization, project, environment }; +} + +/** Builds a PENDING "RUN"-type waitpoint input whose idempotencyKey is derived from the id (`wpidem_<id>`) and flagged as not user-provided. */ function buildAssociatedWaitpoint(params: { + id: string; + friendlyId: string; + projectId: string; + environmentId: string; +}): RunAssociatedWaitpointInput { + return { + id: params.id, + friendlyId: params.friendlyId, + type: "RUN", + status: "PENDING", + idempotencyKey: `wpidem_${params.id}`, + userProvidedIdempotencyKey: false, + projectId: params.projectId, + environmentId: params.environmentId, + }; +} + +/** Builds a CreateRunInput for a PENDING V2 run plus its RUN_CREATED snapshot; idempotencyKeyExpiresAt is 24 hours ahead of Date.now() and associatedWaitpoint is passed through unchanged (may be undefined). */ function buildCreateRunInput(params: { + runId: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; + idempotencyKey: string; + taskIdentifier: string; + associatedWaitpoint?: RunAssociatedWaitpointInput; +}): CreateRunInput { + return { + data: { + id: params.runId, + engine: "V2", + status: "PENDING", + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT", + organizationId: params.organizationId, + projectId: params.projectId, + idempotencyKey:
params.idempotencyKey, + idempotencyKeyExpiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000), + taskIdentifier: params.taskIdentifier, + payload: '{"hello":"world"}', + payloadType: "application/json", + context: { foo: "bar" }, + traceContext: { trace: "ctx" }, + traceId: `trace_${params.runId}`, + spanId: `span_${params.runId}`, + runTags: [], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT", + projectId: params.projectId, + organizationId: params.organizationId, + }, + associatedWaitpoint: params.associatedWaitpoint, + }; +} + +/** Builds a "dedicated"-variant PostgresRunStore that uses the run-ops client as both prisma and readOnlyPrisma (cast `as never` to fit the constructor's parameter type). */ function makeDedicatedStore(prisma17: RunOpsPrismaClient) { + return new PostgresRunStore({ + prisma: prisma17 as never, + readOnlyPrisma: prisma17 as never, + schemaVariant: "dedicated", + }); +} + +/** Builds a "legacy"-variant PostgresRunStore that uses the same client as both prisma and readOnlyPrisma. */ function makeLegacyStore(prisma14: PrismaClient) { + return new PostgresRunStore({ + prisma: prisma14, + readOnlyPrisma: prisma14, + schemaVariant: "legacy", + }); +} + +// The REAL production split topology: #new = dedicated subset on prisma17, #legacy = full schema on +// prisma14. Two physically-distinct DBs, dedicated schema on #new. +function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const legacyStore = makeLegacyStore(prisma14); + const newStore = makeDedicatedStore(prisma17); + return { + router: new RoutingRunStore({ new: newStore, legacy: legacyStore }), + legacyStore, + newStore, + }; +} + +// The EXACT dedup probe the trigger hot path issues: id-less +// where keyed on (runtimeEnvironmentId, idempotencyKey, taskIdentifier), include associatedWaitpoint.
+function dedupProbe( + router: RoutingRunStore, + params: { runtimeEnvironmentId: string; idempotencyKey: string; taskIdentifier: string } +) { + return router.findRun( + { + runtimeEnvironmentId: params.runtimeEnvironmentId, + idempotencyKey: params.idempotencyKey, + taskIdentifier: params.taskIdentifier, + }, + { include: { associatedWaitpoint: true } } + ); +} + +describe("RoutingRunStore — cross-DB idempotency dedup probe", () => { + // the matching run + its associated waitpoint live on #legacy (cuid, full schema). The + // probe fans out NEW (miss) → LEGACY (hit) and must hydrate the waitpoint via the legacy include. + heteroRunOpsPostgresTest( + "a cuid run on #legacy is found by the id-less probe with associatedWaitpoint hydrated", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma14, "legacy", "cg2_a"); + const runId = cuidLegacy("ral"); // 25 chars → LEGACY home + const waitpointId = cuidLegacy("wal"); + const idempotencyKey = "cg2-key-a"; + const taskIdentifier = "my-task"; + + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_friendly_cg2_a`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier, + associatedWaitpoint: buildAssociatedWaitpoint({ + id: waitpointId, + friendlyId: `waitpoint_cg2_a`, + projectId: env.project.id, + environmentId: env.environment.id, + }), + }) + ); + + // It must NOT have landed on #new (the cuid id routes to LEGACY). 
+ expect(await prisma17.taskRun.findFirst({ where: { id: runId } })).toBeNull(); + expect(await prisma14.taskRun.findFirst({ where: { id: runId } })).not.toBeNull(); + + const found = (await dedupProbe(router, { + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier, + })) as Record | null; + + expect(found).not.toBeNull(); + expect(found!.id).toBe(runId); + expect(found!.idempotencyKey).toBe(idempotencyKey); + // The load-bearing assertion: the andWait idempotent hit reads existingRun.associatedWaitpoint. + expect(found!.associatedWaitpoint).not.toBeNull(); + expect(found!.associatedWaitpoint.id).toBe(waitpointId); + expect(found!.associatedWaitpoint.type).toBe("RUN"); + expect(found!.associatedWaitpoint.completedByTaskRunId).toBe(runId); + } + ); + + // the matching run + its associated waitpoint live on #new (ksuid, dedicated subset). The + // probe hits the NEW leg first; the SCALAR-ONLY store must strip the `associatedWaitpoint` relation + // and re-hydrate it from `Waitpoint.completedByTaskRunId`. 
+ heteroRunOpsPostgresTest( + "a ksuid run on #new is found by the id-less probe with associatedWaitpoint hydrated from scalar", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "cg2_b"); + const runId = ksuidNew("rbn"); // 27 chars → NEW home + const waitpointId = ksuidNew("wbn"); + const idempotencyKey = "cg2-key-b"; + const taskIdentifier = "my-task"; + + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_friendly_cg2_b`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier, + associatedWaitpoint: buildAssociatedWaitpoint({ + id: waitpointId, + friendlyId: `waitpoint_cg2_b`, + projectId: env.project.id, + environmentId: env.environment.id, + }), + }) + ); + + // It must NOT have landed on #legacy (the ksuid id routes to NEW). + expect(await prisma14.taskRun.findFirst({ where: { id: runId } })).toBeNull(); + expect(await prisma17.taskRun.findFirst({ where: { id: runId } })).not.toBeNull(); + + const found = (await dedupProbe(router, { + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier, + })) as Record | null; + + expect(found).not.toBeNull(); + expect(found!.id).toBe(runId); + expect(found!.idempotencyKey).toBe(idempotencyKey); + expect(found!.associatedWaitpoint).not.toBeNull(); + expect(found!.associatedWaitpoint.id).toBe(waitpointId); + expect(found!.associatedWaitpoint.type).toBe("RUN"); + expect(found!.associatedWaitpoint.completedByTaskRunId).toBe(runId); + } + ); + + // duplicate-guard contract: a run with the SAME (env, idempotencyKey, taskIdentifier) + // exists on BOTH DBs. The per-DB unique constraint allows one row each (it cannot enforce cross-DB + // uniqueness); the probe MUST still resolve to exactly ONE run, deterministically the NEW (ksuid) + // one per #findRunUnrouted (NEW-first). 
The duplicate itself is prevented upstream by + // probe-before-mint plus the per-DB unique constraint; this locks the read tie-break contract. + heteroRunOpsPostgresTest( + "the same (env, key) on BOTH DBs resolves deterministically to the NEW run", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + // ONE logical environment id shared by both DBs (the run-ops envId is the same scalar on each). + const legacySeed = await seedEnvironment(prisma14, "legacy", "cg2_c"); + const environmentId = legacySeed.environment.id; + const idempotencyKey = "cg2-key-c"; + const taskIdentifier = "my-task"; + + const legacyRunId = cuidLegacy("rcl"); // cuid → LEGACY + const newRunId = ksuidNew("rcn"); // ksuid → NEW + const legacyWaitpointId = cuidLegacy("wcl"); + const newWaitpointId = ksuidNew("wcn"); + + await router.createRun( + buildCreateRunInput({ + runId: legacyRunId, + friendlyId: `run_friendly_cg2_c_l`, + organizationId: legacySeed.organization.id, + projectId: legacySeed.project.id, + runtimeEnvironmentId: environmentId, + idempotencyKey, + taskIdentifier, + associatedWaitpoint: buildAssociatedWaitpoint({ + id: legacyWaitpointId, + friendlyId: `waitpoint_cg2_c_l`, + projectId: legacySeed.project.id, + environmentId, + }), + }) + ); + await router.createRun( + buildCreateRunInput({ + // #new is the dedicated subset (FK-free scalar ids), so the same environmentId scalar is + // valid there with no owning rows needed. + runId: newRunId, + friendlyId: `run_friendly_cg2_c_n`, + organizationId: legacySeed.organization.id, + projectId: legacySeed.project.id, + runtimeEnvironmentId: environmentId, + idempotencyKey, + taskIdentifier, + associatedWaitpoint: buildAssociatedWaitpoint({ + id: newWaitpointId, + friendlyId: `waitpoint_cg2_c_n`, + projectId: legacySeed.project.id, + environmentId, + }), + }) + ); + + // Sanity: both physical DBs really do carry a row for this key. 
+ expect(await prisma14.taskRun.findFirst({ where: { id: legacyRunId } })).not.toBeNull(); + expect(await prisma17.taskRun.findFirst({ where: { id: newRunId } })).not.toBeNull(); + + const found = (await dedupProbe(router, { + runtimeEnvironmentId: environmentId, + idempotencyKey, + taskIdentifier, + })) as Record | null; + + // Exactly ONE run, deterministically the NEW one (NEW-first fan-out), with its + // own DB's associated waitpoint hydrated. + expect(found).not.toBeNull(); + expect(found!.id).toBe(newRunId); + expect(found!.associatedWaitpoint).not.toBeNull(); + expect(found!.associatedWaitpoint.id).toBe(newWaitpointId); + } + ); + + // Negative: no row on either DB → null (so the trigger path proceeds to mint a fresh run). + heteroRunOpsPostgresTest( + "miss: an unknown (env, key) returns null from the cross-DB probe", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma14, "legacy", "cg2_miss"); + + const found = await dedupProbe(router, { + runtimeEnvironmentId: env.environment.id, + idempotencyKey: "cg2-key-does-not-exist", + taskIdentifier: "my-task", + }); + + expect(found).toBeNull(); + } + ); + + // Standalone idempotent hit (no associatedWaitpoint): the include key must still be PRESENT in the + // result and be null, on BOTH DB homes — the andWait path lazily creates the waitpoint when this is + // falsy, so a MISSING key (undefined) vs null must not differ. 
+ heteroRunOpsPostgresTest( + "standalone: a run with no associatedWaitpoint hydrates the include key as null on #new", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma17, "dedicated", "cg2_sa_n"); + const runId = ksuidNew("rsan"); // ksuid → NEW + const idempotencyKey = "cg2-key-sa-n"; + + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_friendly_cg2_sa_n`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier: "my-task", + // no associatedWaitpoint + }) + ); + + const found = (await dedupProbe(router, { + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier: "my-task", + })) as Record | null; + + expect(found).not.toBeNull(); + expect(found!.id).toBe(runId); + expect("associatedWaitpoint" in found!).toBe(true); + expect(found!.associatedWaitpoint).toBeNull(); + } + ); + + heteroRunOpsPostgresTest( + "standalone: a run with no associatedWaitpoint hydrates the include key as null on #legacy", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedEnvironment(prisma14, "legacy", "cg2_sa_l"); + const runId = cuidLegacy("rsal"); // cuid → LEGACY + const idempotencyKey = "cg2-key-sa-l"; + + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: `run_friendly_cg2_sa_l`, + organizationId: env.organization.id, + projectId: env.project.id, + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier: "my-task", + }) + ); + + const found = (await dedupProbe(router, { + runtimeEnvironmentId: env.environment.id, + idempotencyKey, + taskIdentifier: "my-task", + })) as Record | null; + + expect(found).not.toBeNull(); + expect(found!.id).toBe(runId); + expect("associatedWaitpoint" in found!).toBe(true); + expect(found!.associatedWaitpoint).toBeNull(); 
+ } + ); +}); diff --git a/internal-packages/run-store/src/runOpsStore.mixedResidency.test.ts b/internal-packages/run-store/src/runOpsStore.mixedResidency.test.ts new file mode 100644 index 00000000000..f3b5d164655 --- /dev/null +++ b/internal-packages/run-store/src/runOpsStore.mixedResidency.test.ts @@ -0,0 +1,900 @@ +// MIXED-RESIDENCY MATRIX — systematic LOCK that every RoutingRunStore fan-out / partition / merge / +// dedup method behaves correctly when cuid (#legacy) AND ksuid (#new) data COEXIST in the SAME call, +// against the REAL two-physical-DB split (heteroRunOpsPostgresTest: prisma14 = full/legacy on PG14, +// prisma17 = RunOpsPrismaClient / dedicated subset on PG17). NEVER mocked. +// +// Existing tests exercise these methods one residency at a time or for a single specific bug. This +// file is the cross-residency matrix: each case seeds BOTH a cuid row on #legacy AND a ksuid row on +// #new in one environment, then drives the wired router and asserts the merge/partition is correct. +// The matrix MUST go RED if a fan-out leg is dropped or a NEW-wins dedup regresses (see the reverted +// mutation probes recorded in the task report). + +import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { describe, expect } from "vitest"; +import { PostgresRunStore } from "./PostgresRunStore.js"; +import { RoutingRunStore } from "./runOpsStore.js"; +import type { CreateRunInput, RunStoreSchemaVariant } from "./types.js"; + +type AnyClient = PrismaClient | RunOpsPrismaClient; + +// ownerEngine classifies by internal-id LENGTH after stripping a leading `_` +// (runOpsResidency.ts): 25 chars (no internal underscore) → cuid → LEGACY, 27 chars → ksuid → NEW. +// These mint a distinct classifiable id of the right length from a short seed. 
/**
 * Mint a 25-character id from `seed`, right-padded with "c" (and truncated if the seed is longer).
 * 25 chars → LEGACY (#legacy / prisma14).
 */
function cuidLegacy(seed: string): string {
  const legacyIdLength = 25;
  return seed.padEnd(legacyIdLength, "c").slice(0, legacyIdLength);
}

/**
 * Mint a 27-character id from `seed`, right-padded with "k" (and truncated if the seed is longer).
 * 27 chars → NEW (#new / prisma17).
 */
function ksuidNew(seed: string): string {
  const newIdLength = 27;
  return seed.padEnd(newIdLength, "k").slice(0, newIdLength);
}

/**
 * Produce the organization / project / environment a test hangs its rows off.
 *
 * The dedicated subset has no Organization/Project/RuntimeEnvironment models (run-ops rows carry
 * FK-free scalar ids), so for "dedicated" only synthetic ids derived from `suffix` are returned and
 * `prisma` is not touched. For any other variant the real owning rows are created, because the
 * legacy schema keeps the FKs that require them.
 */
async function seedEnvironment(
  prisma: AnyClient,
  schemaVariant: RunStoreSchemaVariant,
  suffix: string
) {
  if (schemaVariant !== "dedicated") {
    const fullClient = prisma as PrismaClient;

    const org = await fullClient.organization.create({
      data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
    });
    const proj = await fullClient.project.create({
      data: {
        name: `Project ${suffix}`,
        slug: `project-${suffix}`,
        externalRef: `proj_${suffix}`,
        organizationId: org.id,
      },
    });
    const env = await fullClient.runtimeEnvironment.create({
      data: {
        type: "DEVELOPMENT",
        slug: "dev",
        projectId: proj.id,
        organizationId: org.id,
        apiKey: `tr_dev_${suffix}`,
        pkApiKey: `pk_dev_${suffix}`,
        shortcode: `short_${suffix}`,
      },
    });

    return { organization: org, project: proj, environment: env };
  }

  // Dedicated subset: nothing to insert, just hand back synthetic owning ids.
  return {
    organization: { id: `org_${suffix}` },
    project: { id: `proj_${suffix}` },
    environment: { id: `env_${suffix}` },
  };
}

/**
 * Build a `CreateRunInput` (run row + initial snapshot) for a test run.
 *
 * Defaults: status "PENDING", taskIdentifier "my-task", spanId `span_<runId>`, createdAt
 * 2024-01-01T00:00:00.000Z. `batchId` is only set when truthy; a truthy `idempotencyKey` also sets
 * an expiry 24 hours from now.
 */
function buildCreateRunInput(params: {
  runId: string;
  friendlyId: string;
  organizationId: string;
  projectId: string;
  runtimeEnvironmentId: string;
  taskIdentifier?: string;
  status?: "PENDING" | "EXECUTING";
  spanId?: string;
  batchId?: string;
  createdAt?: Date;
  idempotencyKey?: string;
}): CreateRunInput {
  const { runId, friendlyId, organizationId, projectId, runtimeEnvironmentId } = params;
  const { batchId, idempotencyKey } = params;
  // The run row and its first snapshot must agree on the status.
  const runStatus = params.status ?? "PENDING";

  const optionalBatch = batchId ? { batchId } : {};
  const optionalIdempotency = idempotencyKey
    ? {
        idempotencyKey,
        idempotencyKeyExpiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000),
      }
    : {};

  return {
    data: {
      id: runId,
      engine: "V2",
      status: runStatus,
      friendlyId,
      runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      organizationId,
      projectId,
      taskIdentifier: params.taskIdentifier ?? "my-task",
      payload: '{"hello":"world"}',
      payloadType: "application/json",
      context: { foo: "bar" },
      traceContext: { trace: "ctx" },
      traceId: `trace_${runId}`,
      spanId: params.spanId ?? `span_${runId}`,
      runTags: [],
      queue: "task/my-task",
      isTest: false,
      taskEventStore: "taskEvent",
      depth: 0,
      createdAt: params.createdAt ?? new Date("2024-01-01T00:00:00.000Z"),
      ...optionalBatch,
      ...optionalIdempotency,
    },
    snapshot: {
      engine: "V2",
      executionStatus: "RUN_CREATED",
      description: "Run was created",
      runStatus,
      environmentId: runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      projectId,
      organizationId,
    },
  };
}

/**
 * Insert a waitpoint row directly through the given client.
 *
 * Despite the name, `status` may be overridden (defaults to "PENDING"); `type` defaults to
 * "MANUAL". The `completedBy*` columns are only written when a truthy value is supplied.
 */
async function seedPendingWaitpoint(
  prisma: AnyClient,
  params: {
    id: string;
    friendlyId: string;
    projectId: string;
    environmentId: string;
    type?: "MANUAL" | "RUN" | "DATETIME";
    status?: "PENDING" | "COMPLETED";
    completedByTaskRunId?: string;
    completedByBatchId?: string;
  }
) {
  const { id, friendlyId, projectId, environmentId } = params;
  const { completedByTaskRunId, completedByBatchId } = params;

  const completedByRun = completedByTaskRunId ? { completedByTaskRunId } : {};
  const completedByBatch = completedByBatchId ? { completedByBatchId } : {};

  const client = prisma as PrismaClient;
  return client.waitpoint.create({
    data: {
      id,
      friendlyId,
      type: params.type ?? "MANUAL",
      status: params.status ?? "PENDING",
      idempotencyKey: `idem_${id}`,
      userProvidedIdempotencyKey: false,
      projectId,
      environmentId,
      ...completedByRun,
      ...completedByBatch,
    },
  });
}

/** Store over the dedicated-subset client; the same client serves reads and writes. */
function makeDedicatedStore(prisma17: RunOpsPrismaClient): PostgresRunStore {
  const client = prisma17 as never;
  return new PostgresRunStore({
    prisma: client,
    readOnlyPrisma: client,
    schemaVariant: "dedicated",
  });
}

/** Store over the full legacy-schema client; the same client serves reads and writes. */
function makeLegacyStore(prisma14: PrismaClient): PostgresRunStore {
  return new PostgresRunStore({
    prisma: prisma14,
    readOnlyPrisma: prisma14,
    schemaVariant: "legacy",
  });
}

// The REAL production split topology: #new = dedicated subset on prisma17, #legacy = full schema on
// prisma14. Two physically-distinct DBs, dedicated subset schema on #new.
/** Wire both stores into a `RoutingRunStore` and return the router together with each leg. */
function makeSplitRouter(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) {
  const legacyStore = makeLegacyStore(prisma14);
  const newStore = makeDedicatedStore(prisma17);
  const router = new RoutingRunStore({ new: newStore, legacy: legacyStore });
  return { router, legacyStore, newStore };
}

// Seed ONE logical environment whose scalar env/project/org ids are shared by both physical DBs (the
// run-ops scalar ids are identical on each), with real owning rows on #legacy and synthetic ids on
// #new. Returns the shared scalar ids used by every mixed-residency seed.
+async function seedSharedEnv(prisma14: PrismaClient, suffix: string) { + const legacy = await seedEnvironment(prisma14, "legacy", suffix); + return { + organizationId: legacy.organization.id, + projectId: legacy.project.id, + runtimeEnvironmentId: legacy.environment.id, + environmentId: legacy.environment.id, + }; +} + +describe("RoutingRunStore — mixed-residency matrix (cuid #legacy + ksuid #new coexisting)", () => { + // ── Case 1: findRuns by a MIXED bounded id-set (#findRunsByIdSet, runOpsStore.ts:294) ── + // A list-hydrate id set spans cuid (legacy) + ksuid (new) ids plus a ksuid id absent from legacy. + // Both resident runs returned; take/skip applied GLOBALLY post-merge; orderBy honored; the absent + // ksuid short-circuits (never probed on LEGACY, :309). + heteroRunOpsPostgresTest( + "case 1: findRuns by a mixed id-set returns both DBs' runs, ordered, take/skip global", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m1"); + + const legacyId = cuidLegacy("m1l"); // cuid → #legacy + const newId = ksuidNew("m1n"); // ksuid → #new + const ghostKsuid = ksuidNew("m1g"); // ksuid, NEVER created → tests the LEGACY short-circuit + + await router.createRun( + buildCreateRunInput({ + runId: legacyId, + friendlyId: "run_m1_legacy", + createdAt: new Date("2024-01-02T00:00:00.000Z"), + ...env, + }) + ); + await router.createRun( + buildCreateRunInput({ + runId: newId, + friendlyId: "run_m1_new", + createdAt: new Date("2024-01-01T00:00:00.000Z"), + ...env, + }) + ); + + // Physical residency sanity: each landed on its own DB only. 
+ expect(await prisma14.taskRun.findUnique({ where: { id: legacyId } })).not.toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: legacyId } })).toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: newId } })).not.toBeNull(); + expect(await prisma14.taskRun.findUnique({ where: { id: newId } })).toBeNull(); + + // Full merge, ordered by createdAt asc → newId (Jan 1) before legacyId (Jan 2). + const all = await router.findRuns({ + where: { id: { in: [legacyId, newId, ghostKsuid] } }, + select: { id: true, createdAt: true }, + orderBy: { createdAt: "asc" }, + }); + expect(all.map((r) => r.id)).toEqual([newId, legacyId]); + + // take=1 after the merge → only the first (newId). Proves take is applied GLOBALLY, not per-leg + // (a per-leg take=1 would return one row from EACH DB → both ids). + const firstOnly = await router.findRuns({ + where: { id: { in: [legacyId, newId, ghostKsuid] } }, + select: { id: true }, + orderBy: { createdAt: "asc" }, + take: 1, + }); + expect(firstOnly.map((r) => r.id)).toEqual([newId]); + + // skip=1 take=1 → the second (legacyId). + const second = await router.findRuns({ + where: { id: { in: [legacyId, newId, ghostKsuid] } }, + select: { id: true }, + orderBy: { createdAt: "asc" }, + skip: 1, + take: 1, + }); + expect(second.map((r) => r.id)).toEqual([legacyId]); + } + ); + + // ── Case 1b: NEW-wins on id collision in #findRunsByIdSet ── + // The copy→fence window can leave the same id on both DBs. The id-set path queries NEW first; an id + // already found on NEW must NOT be re-fetched from LEGACY, so the NEW copy wins. + heteroRunOpsPostgresTest( + "case 1b: findRuns by id-set with a colliding id resolves to the NEW copy", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m1b"); + + // A cuid id (LEGACY id-shape) that exists on BOTH DBs with a distinguishing field. 
+ const collidingId = cuidLegacy("m1b"); + await router.createRun( + buildCreateRunInput({ runId: collidingId, friendlyId: "run_m1b_legacy", ...env }) + ); // → #legacy (cuid) + // Force the same id onto #new with a different taskIdentifier so we can tell the copies apart. + await prisma17.taskRun.create({ + data: { + id: collidingId, + engine: "V2", + status: "PENDING", + friendlyId: "run_m1b_new", + runtimeEnvironmentId: env.environmentId, + environmentType: "DEVELOPMENT", + organizationId: env.organizationId, + projectId: env.projectId, + taskIdentifier: "new-copy-wins", + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: "t", + spanId: "s", + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + }, + }); + + const rows = await router.findRuns({ + where: { id: { in: [collidingId] } }, + select: { id: true, taskIdentifier: true }, + }); + expect(rows).toHaveLength(1); // deduped, not double-reported + expect((rows[0] as any).taskIdentifier).toBe("new-copy-wins"); // NEW wins + } + ); + + // ── Case 2: findRuns by an OPEN predicate (#findRunsOpen, runOpsStore.ts:319) ── + // No id set → query BOTH stores, union, dedup by id (NEW wins). Filter by a shared scalar + // (runtimeEnvironmentId + status) that matches rows on both DBs. + heteroRunOpsPostgresTest( + "case 2: findRuns by an open predicate unions rows from both DBs (NEW-wins dedup)", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m2"); + + const legacyId = cuidLegacy("m2l"); + const newId = ksuidNew("m2n"); + await router.createRun( + buildCreateRunInput({ + runId: legacyId, + friendlyId: "run_m2_legacy", + status: "EXECUTING", + ...env, + }) + ); + await router.createRun( + buildCreateRunInput({ runId: newId, friendlyId: "run_m2_new", status: "EXECUTING", ...env }) + ); + // A PENDING run on each DB that must be FILTERED OUT by the status predicate. 
+ await router.createRun( + buildCreateRunInput({ + runId: cuidLegacy("m2lp"), + friendlyId: "run_m2_legacy_pending", + status: "PENDING", + ...env, + }) + ); + await router.createRun( + buildCreateRunInput({ + runId: ksuidNew("m2np"), + friendlyId: "run_m2_new_pending", + status: "PENDING", + ...env, + }) + ); + + const executing = await router.findRuns({ + where: { runtimeEnvironmentId: env.environmentId, status: "EXECUTING" }, + select: { id: true }, + orderBy: { id: "asc" }, + }); + expect(executing.map((r) => r.id).sort()).toEqual([legacyId, newId].sort()); + } + ); + + // ── Case 3: expireRunsBatch with a MIXED id list (runOpsStore.ts:474) ── + // Partitions ksuid→NEW / cuid→LEGACY; each leg called only when non-empty; counts summed; each row + // updated on its OWN DB only. + heteroRunOpsPostgresTest( + "case 3: expireRunsBatch partitions a mixed id list per-DB and sums the count", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m3"); + + const legacyId = cuidLegacy("m3l"); + const newId = ksuidNew("m3n"); + await router.createRun( + buildCreateRunInput({ runId: legacyId, friendlyId: "run_m3_legacy", ...env }) + ); + await router.createRun( + buildCreateRunInput({ runId: newId, friendlyId: "run_m3_new", ...env }) + ); + + const now = new Date("2024-03-03T00:00:00.000Z"); + const count = await router.expireRunsBatch([legacyId, newId], { + error: { type: "STRING_ERROR", raw: "expired" }, + now, + }); + expect(count).toBe(2); // one updated on each DB, summed + + // Each row is EXPIRED on its OWN DB only. 
+ expect((await prisma14.taskRun.findUnique({ where: { id: legacyId } }))?.status).toBe( + "EXPIRED" + ); + expect((await prisma17.taskRun.findUnique({ where: { id: newId } }))?.status).toBe("EXPIRED"); + } + ); + + // ── Case 4: clearIdempotencyKey fan-out arm (byFriendlyIds, runOpsStore.ts:358) ── + // byFriendlyIds spans mixed residency → fan out to both, sum the count, each row cleared on its home. + heteroRunOpsPostgresTest( + "case 4: clearIdempotencyKey byFriendlyIds clears across both DBs and sums the count", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m4"); + + const legacyId = cuidLegacy("m4l"); + const newId = ksuidNew("m4n"); + await router.createRun( + buildCreateRunInput({ + runId: legacyId, + friendlyId: "run_m4_legacy", + idempotencyKey: "m4-key-legacy", + ...env, + }) + ); + await router.createRun( + buildCreateRunInput({ + runId: newId, + friendlyId: "run_m4_new", + idempotencyKey: "m4-key-new", + ...env, + }) + ); + + const { count } = await router.clearIdempotencyKey({ + byFriendlyIds: ["run_m4_legacy", "run_m4_new"], + }); + expect(count).toBe(2); // one cleared on each DB, summed + + expect((await prisma14.taskRun.findUnique({ where: { id: legacyId } }))?.idempotencyKey).toBe( + null + ); + expect((await prisma17.taskRun.findUnique({ where: { id: newId } }))?.idempotencyKey).toBe( + null + ); + } + ); + + // ── Case 5: countPendingWaitpoints scattered across both DBs (runOpsStore.ts:731) ── + // A run's pending waitpoints can be split across both stores mid-drain → count on each and sum. 
+ heteroRunOpsPostgresTest( + "case 5: countPendingWaitpoints sums PENDING waitpoints scattered across both DBs", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m5"); + + const legacyWp = cuidLegacy("m5l"); // PENDING on #legacy + const newWp = ksuidNew("m5n"); // PENDING on #new + const completedWp = ksuidNew("m5c"); // COMPLETED on #new → must NOT be counted + await seedPendingWaitpoint(prisma14, { + id: legacyWp, + friendlyId: "wp_m5_legacy", + projectId: env.projectId, + environmentId: env.environmentId, + }); + await seedPendingWaitpoint(prisma17, { + id: newWp, + friendlyId: "wp_m5_new", + projectId: env.projectId, + environmentId: env.environmentId, + }); + await seedPendingWaitpoint(prisma17, { + id: completedWp, + friendlyId: "wp_m5_completed", + projectId: env.projectId, + environmentId: env.environmentId, + status: "COMPLETED", + }); + + // Both PENDING ones counted (one per DB); the COMPLETED one excluded. + expect(await router.countPendingWaitpoints([legacyWp, newWp, completedWp])).toBe(2); + } + ); + + // ── Case 6: findManyWaitpoints { id: { in: [...mixed...] } } (runOpsStore.ts:793) ── + // Merge waitpoints from both DBs for a mixed id set. 
+ heteroRunOpsPostgresTest( + "case 6: findManyWaitpoints merges a mixed id set from both DBs", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m6"); + + const legacyWp = cuidLegacy("m6l"); + const newWp = ksuidNew("m6n"); + await seedPendingWaitpoint(prisma14, { + id: legacyWp, + friendlyId: "wp_m6_legacy", + projectId: env.projectId, + environmentId: env.environmentId, + }); + await seedPendingWaitpoint(prisma17, { + id: newWp, + friendlyId: "wp_m6_new", + projectId: env.projectId, + environmentId: env.environmentId, + }); + + const found = await router.findManyWaitpoints({ where: { id: { in: [legacyWp, newWp] } } }); + expect(found.map((w) => w.id).sort()).toEqual([legacyWp, newWp].sort()); + } + ); + + // ── Case 8: findExecutionSnapshot / findManyExecutionSnapshots OPEN (no runId) where ── + // A by-snapshot-id-only lookup (snapshot ids are non-classifiable cuids) must fan out NEW→LEGACY + // (findExecutionSnapshot, :675) / merge both (findManyExecutionSnapshots, :688). Seed a snapshot on + // EACH DB (one ksuid run on #new, one cuid run on #legacy) and read with a no-runId where. 
+ heteroRunOpsPostgresTest( + "case 8: findExecutionSnapshot/findManyExecutionSnapshots with an open where reach both DBs", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m8"); + + const legacyRun = cuidLegacy("m8l"); + const newRun = ksuidNew("m8n"); + await router.createRun( + buildCreateRunInput({ runId: legacyRun, friendlyId: "run_m8_legacy", ...env }) + ); + await router.createRun( + buildCreateRunInput({ runId: newRun, friendlyId: "run_m8_new", ...env }) + ); + + const snapEnv = { + environmentId: env.environmentId, + environmentType: "DEVELOPMENT" as const, + projectId: env.projectId, + organizationId: env.organizationId, + }; + const legacySnap = await router.createExecutionSnapshot({ + run: { id: legacyRun, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING", description: "m8 legacy snap" }, + ...snapEnv, + }); + const newSnap = await router.createExecutionSnapshot({ + run: { id: newRun, status: "EXECUTING", attemptNumber: 1 }, + snapshot: { executionStatus: "EXECUTING", description: "m8 new snap" }, + ...snapEnv, + }); + + // findExecutionSnapshot with a no-runId where targeting the LEGACY snapshot id: NEW miss → LEGACY hit. + const foundLegacy = await router.findExecutionSnapshot({ where: { id: legacySnap.id } }); + expect(foundLegacy?.id).toBe(legacySnap.id); + // And the NEW snapshot id resolves on the NEW leg. + const foundNew = await router.findExecutionSnapshot({ where: { id: newSnap.id } }); + expect(foundNew?.id).toBe(newSnap.id); + + // findManyExecutionSnapshots open where (both ids) merges both DBs. 
+ const many = await router.findManyExecutionSnapshots({ + where: { id: { in: [legacySnap.id, newSnap.id] } }, + }); + expect(many.map((s) => s.id).sort()).toEqual([legacySnap.id, newSnap.id].sort()); + } + ); + + // ── Case 9a: findRun with an UNCLASSIFIABLE where (spanId) on a #legacy run (#findRunUnrouted, :213) ── + // A ksuid run on #new and a cuid run on #legacy each carry a distinct spanId. A spanId where can't + // be id-classified → fan out NEW-first then LEGACY. The legacy-resident run must be found. + heteroRunOpsPostgresTest( + "case 9a: findRun by spanId fans out and finds a #legacy run (NEW miss → LEGACY hit)", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m9a"); + + const legacyRun = cuidLegacy("m9al"); + const newRun = ksuidNew("m9an"); + await router.createRun( + buildCreateRunInput({ + runId: legacyRun, + friendlyId: "run_m9a_legacy", + spanId: "span_m9a_legacy", + ...env, + }) + ); + await router.createRun( + buildCreateRunInput({ + runId: newRun, + friendlyId: "run_m9a_new", + spanId: "span_m9a_new", + ...env, + }) + ); + + const onLegacy = (await router.findRun( + { spanId: "span_m9a_legacy" }, + { select: { id: true } } + )) as Record | null; + expect(onLegacy?.id).toBe(legacyRun); + + const onNew = (await router.findRun( + { spanId: "span_m9a_new" }, + { select: { id: true } } + )) as Record | null; + expect(onNew?.id).toBe(newRun); + } + ); + + // ── Case 9b: findRunOrThrow with an UNCLASSIFIABLE where (spanId) on a #legacy run (:593) ── + // The throwing twin must match findRun's fan-out: an unclassifiable where whose only matching run + // lives on #legacy must NOT throw. A NEW-only fallback would miss the legacy run and throw. 
+ heteroRunOpsPostgresTest( + "case 9b: findRunOrThrow by spanId fans out and finds a #legacy run without throwing", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m9b"); + + const legacyRun = cuidLegacy("m9bl"); + const newRun = ksuidNew("m9bn"); + await router.createRun( + buildCreateRunInput({ + runId: legacyRun, + friendlyId: "run_m9b_legacy", + spanId: "span_m9b_legacy", + ...env, + }) + ); + await router.createRun( + buildCreateRunInput({ + runId: newRun, + friendlyId: "run_m9b_new", + spanId: "span_m9b_new", + ...env, + }) + ); + + const onLegacy = (await router.findRunOrThrow( + { spanId: "span_m9b_legacy" }, + { select: { id: true } } + )) as Record; + expect(onLegacy.id).toBe(legacyRun); + + const onNew = (await router.findRunOrThrow( + { spanId: "span_m9b_new" }, + { select: { id: true } } + )) as Record; + expect(onNew.id).toBe(newRun); + } + ); + + // ── Case 7: findManyTaskRunWaitpoints with edges whose relations STRADDLE DBs (runOpsStore.ts:876) ── + // An edge co-locates with its RUN, but its `waitpoint`/`taskRun` relations can live on the OTHER DB + // (a cuid token blocking a ksuid run, and vice versa). The per-leg scalar query is stripped of the + // relation keys; the router re-hydrates `waitpoint`/`taskRun` across BOTH DBs. Exercises BOTH + // straddle directions in one read by querying both edges via { taskRunId: { in } }. + heteroRunOpsPostgresTest( + "case 7: findManyTaskRunWaitpoints rehydrates waitpoint/taskRun relations across both DBs (both straddle directions)", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m7"); + + // Direction A: ksuid run on #new, blocked on a cuid token that lives ONLY on #legacy. Edge on #new. 
+ const newRun = ksuidNew("m7nr"); + const legacyToken = cuidLegacy("m7lt"); + await router.createRun( + buildCreateRunInput({ runId: newRun, friendlyId: "run_m7_new", ...env }) + ); + await seedPendingWaitpoint(prisma14, { + id: legacyToken, + friendlyId: "wp_m7_legacy_token", + projectId: env.projectId, + environmentId: env.environmentId, + }); + // Write the edge on #new (the run's DB) directly — the cuid token is absent from #new, so the + // edge's `waitpoint` must be re-hydrated from #legacy. + await prisma17.$executeRawUnsafe( + `INSERT INTO "TaskRunWaitpoint" ("id","taskRunId","waitpointId","projectId","createdAt","updatedAt") VALUES (gen_random_uuid(),'${newRun}','${legacyToken}','${env.projectId}',NOW(),NOW())` + ); + + // Direction B: cuid run on #legacy, blocked on a ksuid token mirrored onto BOTH DBs (drain + // window). The #legacy copy is a STALE placeholder (PENDING) that satisfies the legacy edge FK; + // the AUTHORITATIVE #new copy is COMPLETED. Edge on #legacy. Hydration re-resolves cross-DB and + // NEW-wins the dedup → the edge's waitpoint must read the #new (COMPLETED) copy, not the local mirror. + const legacyRun = cuidLegacy("m7lr"); + const newToken = ksuidNew("m7nt"); + await router.createRun( + buildCreateRunInput({ runId: legacyRun, friendlyId: "run_m7_legacy", ...env }) + ); + await seedPendingWaitpoint(prisma14, { + id: newToken, + friendlyId: "wp_m7_legacy_mirror", + projectId: env.projectId, + environmentId: env.environmentId, + status: "PENDING", + }); + await seedPendingWaitpoint(prisma17, { + id: newToken, + friendlyId: "wp_m7_new_token", + projectId: env.projectId, + environmentId: env.environmentId, + status: "COMPLETED", + }); + await prisma14.$executeRawUnsafe( + `INSERT INTO "TaskRunWaitpoint" ("id","taskRunId","waitpointId","projectId","createdAt","updatedAt") VALUES (gen_random_uuid(),'${legacyRun}','${newToken}','${env.projectId}',NOW(),NOW())` + ); + + // Edges sanity: each edge lives on its run's DB only. 
+ expect(await prisma17.taskRunWaitpoint.count({ where: { taskRunId: newRun } })).toBe(1); + expect(await prisma14.taskRunWaitpoint.count({ where: { taskRunId: newRun } })).toBe(0); + expect(await prisma14.taskRunWaitpoint.count({ where: { taskRunId: legacyRun } })).toBe(1); + expect(await prisma17.taskRunWaitpoint.count({ where: { taskRunId: legacyRun } })).toBe(0); + + // One read spanning both runs: both edges returned (deduped by id), and each edge's `waitpoint` + // + `taskRun` re-hydrated from whichever DB holds them. + const edges = (await router.findManyTaskRunWaitpoints({ + where: { taskRunId: { in: [newRun, legacyRun] } }, + select: { + id: true, + taskRunId: true, + waitpointId: true, + waitpoint: { select: { id: true, status: true } }, + taskRun: { select: { id: true } }, + }, + })) as Array>; + + expect(edges).toHaveLength(2); + const byRun = new Map(edges.map((e) => [e.taskRunId as string, e])); + + // Direction A edge: waitpoint hydrated from #legacy (cuid token), taskRun is the #new run. + const aEdge = byRun.get(newRun)!; + expect(aEdge.waitpoint?.id).toBe(legacyToken); + expect(aEdge.waitpoint?.status).toBe("PENDING"); + expect(aEdge.taskRun?.id).toBe(newRun); + + // Direction B edge: waitpoint hydrated from the AUTHORITATIVE #new copy (COMPLETED), proving the + // relation was re-resolved cross-DB and NEW won the dedup over the stale local #legacy mirror. + const bEdge = byRun.get(legacyRun)!; + expect(bEdge.waitpoint?.id).toBe(newToken); + expect(bEdge.waitpoint?.status).toBe("COMPLETED"); + expect(bEdge.taskRun?.id).toBe(legacyRun); + } + ); + + // ── Case 7b: the "blocking waitpoint not found on either DB" HARD ERROR (runOpsStore.ts:917) ── + // An edge whose `waitpointId` resolves on NEITHER DB must throw rather than leave a null status that + // would strand (hang) or wrongly unblock the run. 
+ heteroRunOpsPostgresTest( + "case 7b: findManyTaskRunWaitpoints throws when a blocking waitpoint is on neither DB", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m7b"); + + const newRun = ksuidNew("m7br"); + const ghostToken = ksuidNew("m7bg"); // never created on either DB + await router.createRun( + buildCreateRunInput({ runId: newRun, friendlyId: "run_m7b_new", ...env }) + ); + await prisma17.$executeRawUnsafe( + `INSERT INTO "TaskRunWaitpoint" ("id","taskRunId","waitpointId","projectId","createdAt","updatedAt") VALUES (gen_random_uuid(),'${newRun}','${ghostToken}','${env.projectId}',NOW(),NOW())` + ); + + await expect( + router.findManyTaskRunWaitpoints({ + where: { taskRunId: newRun }, + select: { id: true, waitpointId: true, waitpoint: { select: { status: true } } }, + }) + ).rejects.toThrow(/not found on either run-ops DB/); + } + ); + + // ── Case 10: findBatchTaskRunById / findBatchTaskRunByFriendlyId NEW-then-LEGACY probe (:1124,:1137) ── + // A batch resident on #legacy AND a ksuid-id batch landed on #new (the control-plane window mints + // cuid ids, but a ksuid batch resides on #new) are BOTH found via the probe, regardless of id-shape. 
+ heteroRunOpsPostgresTest( + "case 10: findBatchTaskRunById/byFriendlyId probe NEW then LEGACY and find batches on either DB", + async ({ prisma14, prisma17 }) => { + const { router } = makeSplitRouter(prisma14, prisma17); + const env = await seedSharedEnv(prisma14, "m10"); + + const legacyBatch = cuidLegacy("m10l"); // cuid → #legacy + const newBatch = ksuidNew("m10n"); // ksuid → #new + await prisma14.batchTaskRun.create({ + data: { + id: legacyBatch, + friendlyId: "batch_m10_legacy", + runtimeEnvironmentId: env.environmentId, + runCount: 1, + status: "PROCESSING", + }, + }); + await prisma17.batchTaskRun.create({ + data: { + id: newBatch, + friendlyId: "batch_m10_new", + runtimeEnvironmentId: env.environmentId, + runCount: 1, + status: "PROCESSING", + }, + }); + + // by id: each found on its own DB via the NEW-then-LEGACY probe. + expect((await router.findBatchTaskRunById(legacyBatch))?.id).toBe(legacyBatch); + expect((await router.findBatchTaskRunById(newBatch))?.id).toBe(newBatch); + + // by friendlyId (env-scoped): same probe order, both resolved. + expect( + (await router.findBatchTaskRunByFriendlyId("batch_m10_legacy", env.environmentId))?.id + ).toBe(legacyBatch); + expect( + (await router.findBatchTaskRunByFriendlyId("batch_m10_new", env.environmentId))?.id + ).toBe(newBatch); + } + ); + + // ── Case 11a: updateManyWaitpoints with a NO-ID (batch) where fans out to both and sums (:822) ── + // A batch where (no single routable id, e.g. completedByTaskRunId IS NULL + status PENDING) must + // apply on BOTH DBs and sum the count. 
+  heteroRunOpsPostgresTest(
+    "case 11a: updateManyWaitpoints with a no-id where updates both DBs and sums the count",
+    async ({ prisma14, prisma17 }) => {
+      const { router } = makeSplitRouter(prisma14, prisma17);
+      const sharedEnv = await seedSharedEnv(prisma14, "m11a");
+      const { projectId, environmentId } = sharedEnv;
+
+      // One PENDING waitpoint per database, both in the same project.
+      const legacyWaitpointId = cuidLegacy("m11al");
+      const newWaitpointId = ksuidNew("m11an");
+      await seedPendingWaitpoint(prisma14, {
+        id: legacyWaitpointId,
+        friendlyId: "wp_m11a_legacy",
+        projectId,
+        environmentId,
+      });
+      await seedPendingWaitpoint(prisma17, {
+        id: newWaitpointId,
+        friendlyId: "wp_m11a_new",
+        projectId,
+        environmentId,
+      });
+
+      // The where clause names no waitpoint id, so the update has to be applied on both sides.
+      const updated = await router.updateManyWaitpoints({
+        where: { status: "PENDING", projectId },
+        data: { status: "COMPLETED" },
+      });
+      expect(updated.count).toBe(2); // one per DB, summed
+
+      // Read each row back from the database it was seeded on.
+      const legacyRow = await prisma14.waitpoint.findUnique({ where: { id: legacyWaitpointId } });
+      expect(legacyRow?.status).toBe("COMPLETED");
+      const newRow = await prisma17.waitpoint.findUnique({ where: { id: newWaitpointId } });
+      expect(newRow?.status).toBe("COMPLETED");
+    }
+  );
+
+  // ── Case 11b: deleteManyTaskRunWaitpoints by taskRunId fans out to both and sums (:944) ──
+  // A run's edges can straddle DBs mid-drain; a delete keyed by taskRunId (not waitpointId) must
+  // delete from BOTH DBs and sum the count.
+  heteroRunOpsPostgresTest(
+    "case 11b: deleteManyTaskRunWaitpoints by taskRunId deletes edges on both DBs and sums",
+    async ({ prisma14, prisma17 }) => {
+      const { router } = makeSplitRouter(prisma14, prisma17);
+      const sharedEnv = await seedSharedEnv(prisma14, "m11b");
+
+      // A single logical run id with one edge on each DB — the straddle the fan-out exists for.
+      // On #legacy the edge is FK-bound, so the run and the waitpoint are seeded there first;
+      // on #new the edge is FK-free (unnest path) and is written on its own.
+      const runId = ksuidNew("m11br");
+      const legacyWaitpointId = cuidLegacy("m11bt");
+      const edgeInsertSql = (waitpointId: string) =>
+        `INSERT INTO "TaskRunWaitpoint" ("id","taskRunId","waitpointId","projectId","createdAt","updatedAt") VALUES (gen_random_uuid(),'${runId}','${waitpointId}','${sharedEnv.projectId}',NOW(),NOW())`;
+
+      await router.createRun(buildCreateRunInput({ runId, friendlyId: "run_m11b", ...sharedEnv }));
+
+      // #legacy prerequisites for the FK-bound edge: a mirror run row and the waitpoint.
+      await prisma14.taskRun.create({
+        data: {
+          id: runId,
+          engine: "V2",
+          status: "PENDING",
+          friendlyId: "run_m11b_legacy_mirror",
+          runtimeEnvironmentId: sharedEnv.environmentId,
+          environmentType: "DEVELOPMENT",
+          organizationId: sharedEnv.organizationId,
+          projectId: sharedEnv.projectId,
+          taskIdentifier: "my-task",
+          payload: "{}",
+          payloadType: "application/json",
+          traceContext: {},
+          traceId: "t",
+          spanId: "s_m11b",
+          queue: "task/my-task",
+          isTest: false,
+          taskEventStore: "taskEvent",
+          depth: 0,
+        },
+      });
+      await seedPendingWaitpoint(prisma14, {
+        id: legacyWaitpointId,
+        friendlyId: "wp_m11b_legacy",
+        projectId: sharedEnv.projectId,
+        environmentId: sharedEnv.environmentId,
+      });
+      await prisma14.$executeRawUnsafe(edgeInsertSql(legacyWaitpointId));
+
+      // #new edge pointing at a ksuid waitpoint that has no local row — drain straddle.
+      const newWaitpointId = ksuidNew("m11bn");
+      await prisma17.$executeRawUnsafe(edgeInsertSql(newWaitpointId));
+
+      // Precondition: exactly one edge per DB.
+      const legacyEdgesBefore = await prisma14.taskRunWaitpoint.count({
+        where: { taskRunId: runId },
+      });
+      expect(legacyEdgesBefore).toBe(1);
+      const newEdgesBefore = await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } });
+      expect(newEdgesBefore).toBe(1);
+
+      const deleted = await router.deleteManyTaskRunWaitpoints({ where: { taskRunId: runId } });
+      expect(deleted.count).toBe(2); // one edge deleted on each DB, summed
+
+      // Postcondition: no edge for the run remains on either DB.
+      const legacyEdgesAfter = await prisma14.taskRunWaitpoint.count({
+        where: { taskRunId: runId },
+      });
+      expect(legacyEdgesAfter).toBe(0);
+      const newEdgesAfter = await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } });
+      expect(newEdgesAfter).toBe(0);
+    }
+  );
+});
diff --git a/internal-packages/run-store/src/runOpsStore.readAfterWrite.test.ts b/internal-packages/run-store/src/runOpsStore.readAfterWrite.test.ts
new file mode 100644
index 00000000000..a32ac9a12d5
--- /dev/null
+++ b/internal-packages/run-store/src/runOpsStore.readAfterWrite.test.ts
@@ -0,0 +1,317 @@
+// RED→GREEN repro for the run-ops split READ-AFTER-WRITE hole:
+// RoutingRunStore.findRun/findRunOrThrow dropped the caller's client and always routed the read to
+// the owning store's REPLICA (readOnlyPrisma). Read-after-write callers
+// (api.v1.sessions / api.v1.tasks.$taskId.trigger) deliberately pass the control-plane WRITER
+// (`prisma`) to read back a run they just committed and beat replica lag. Routed to the lagging
+// replica the read returned null → "Triggered run X not found" → HTTP 500.
+//
+// The fix keys on the passed client's IDENTITY: a WRITER (has `$transaction`) means read-your-writes
+// → route to the OWNING store's own writer (findRunOnPrimary), for BOTH residencies, WITHOUT leaking
+// a control-plane client into a NEW-DB query (each store reads its OWN writer). A replica / nothing
+// keeps the default (owning store's replica).
+//
+// `heteroRunOpsPostgresTest` gives a REAL split topology: prisma17 = RunOpsPrismaClient over the
+// dedicated subset schema (#new / 5434), prisma14 = full legacy schema on a SEPARATE physical PG
+// container (#legacy / control-plane). NEVER mocked. Replica lag is simulated by backing each store's
+// `readOnlyPrisma` with a recording proxy whose taskRun reads return EMPTY (a lagging replica has not
+// yet seen the fresh row) while recording that it was hit — so a replica-routed read MISSES and a
+// writer-routed read FINDS. Seeds/writes always go through the real writer.
+
+import { heteroRunOpsPostgresTest } from "@internal/testcontainers";
+import type { PrismaClient } from "@trigger.dev/database";
+import type { RunOpsPrismaClient } from "@internal/run-ops-database";
+import { describe, expect } from "vitest";
+import { PostgresRunStore } from "./PostgresRunStore.js";
+import { RoutingRunStore } from "./runOpsStore.js";
+
+type AnyClient = PrismaClient | RunOpsPrismaClient;
+
+// ownerEngine classifies by internal-id LENGTH: 25 chars → cuid → LEGACY, 27 → ksuid → NEW.
+const CUID_25 = "c".repeat(25); // → LEGACY (#legacy / prisma14, full schema)
+const KSUID_27 = "k".repeat(27); // → NEW (#new / prisma17, dedicated subset schema)
+
+/**
+ * Wraps a real client in a recording "replica" that has NOT yet caught up.
+ *
+ * Only `taskRun.findFirst`, `taskRun.findMany` and `taskRun.findFirstOrThrow` are intercepted:
+ * they record the hit and then return `null` / `[]` / throw, so a replica-routed read misses the
+ * just-written row. Every other property — including any other `taskRun` method — forwards to the
+ * real client unchanged.
+ *
+ * @param real - The real client to wrap; the returned `client` keeps its static type.
+ * @returns The proxied client plus `wasHit()`, which is true iff an intercepted taskRun read ran.
+ */
+function laggingReplica<C extends AnyClient>(real: C): { client: C; wasHit: () => boolean } {
+  let hit = false;
+  const laggingTaskRun = new Proxy((real as any).taskRun, {
+    get(target, prop) {
+      if (prop === "findFirst" || prop === "findMany") {
+        return async () => {
+          hit = true;
+          return prop === "findMany" ? [] : null;
+        };
+      }
+      if (prop === "findFirstOrThrow") {
+        return async () => {
+          hit = true;
+          throw new Error("lagging replica: row not visible");
+        };
+      }
+      return (target as any)[prop];
+    },
+  });
+  const client = new Proxy(real, {
+    get(target, prop) {
+      if (prop === "taskRun") {
+        return laggingTaskRun;
+      }
+      return (target as any)[prop];
+    },
+  }) as C;
+  return { client, wasHit: () => hit };
+}
+
+/**
+ * Creates an organization, a project and a DEVELOPMENT runtime environment on the full (legacy)
+ * schema. `suffix` is embedded in every slug / key written here.
+ */
+async function seedEnvironmentLegacy(prisma: PrismaClient, suffix: string) {
+  const organization = await prisma.organization.create({
+    data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
+  });
+  const project = await prisma.project.create({
+    data: {
+      name: `Project ${suffix}`,
+      slug: `project-${suffix}`,
+      externalRef: `proj_${suffix}`,
+      organizationId: organization.id,
+    },
+  });
+  const environment = await prisma.runtimeEnvironment.create({
+    data: {
+      type: "DEVELOPMENT",
+      slug: "dev",
+      projectId: project.id,
+      organizationId: organization.id,
+      apiKey: `tr_dev_${suffix}`,
+      pkApiKey: `pk_dev_${suffix}`,
+      shortcode: `short_${suffix}`,
+    },
+  });
+  return { organization, project, environment };
+}
+
+/**
+ * Dedicated-schema counterpart of `seedEnvironmentLegacy`: writes nothing and only mints
+ * synthetic ids, returned in the same `{ organization, project, environment }` shape.
+ */
+function seedEnvironmentDedicated(suffix: string) {
+  return {
+    organization: { id: `org_${suffix}` },
+    project: { id: `proj_${suffix}` },
+    environment: { id: `env_${suffix}` },
+  };
+}
+
+/**
+ * Builds the `data` payload for a direct `taskRun.create`: a PENDING V2 run of "my-task" in a
+ * DEVELOPMENT environment, with trace/span ids derived from the run id.
+ */
+function taskRunData(opts: {
+  id: string;
+  friendlyId: string;
+  organizationId: string;
+  projectId: string;
+  runtimeEnvironmentId: string;
+}) {
+  return {
+    id: opts.id,
+    engine: "V2" as const,
+    status: "PENDING" as const,
+    friendlyId: opts.friendlyId,
+    runtimeEnvironmentId: opts.runtimeEnvironmentId,
+    environmentType: "DEVELOPMENT" as const,
+    organizationId: opts.organizationId,
+    projectId: opts.projectId,
+    taskIdentifier: "my-task",
+    payload: "{}",
+    payloadType: "application/json",
+    traceContext: {},
+    traceId: `trace_${opts.id}`,
+    spanId: `span_${opts.id}`,
+    queue: "task/my-task",
+    isTest: false,
+    taskEventStore: "taskEvent",
+    depth: 0,
+  };
+}
+
+describe("run-ops split — read-after-write reads the OWNING store's WRITER, not its lagging replica", () => {
+  // (a) LEGACY-resident (cuid) run: the run was just committed to the control-plane writer; the
+  // control-plane replica lags. Passing the control-plane WRITER as the read-your-writes client must
+  // resolve the run via the owning (legacy) writer, NOT the replica.
+  heteroRunOpsPostgresTest(
+    "LEGACY cuid: read-after-write via the control-plane WRITER finds the fresh run despite replica lag",
+    async ({ prisma14, prisma17 }) => {
+      const legacyReplica = laggingReplica(prisma14);
+      const legacyStore = new PostgresRunStore({
+        prisma: prisma14,
+        readOnlyPrisma: legacyReplica.client,
+        schemaVariant: "legacy",
+      });
+      const newStore = new PostgresRunStore({
+        prisma: prisma17 as never,
+        readOnlyPrisma: prisma17 as never,
+        schemaVariant: "dedicated",
+      });
+      const router = new RoutingRunStore({ new: newStore, legacy: legacyStore });
+
+      const seed = await seedEnvironmentLegacy(prisma14, "raw_leg");
+      const runId = `run_${CUID_25}`; // cuid → LEGACY
+      await prisma14.taskRun.create({
+        data: taskRunData({
+          id: runId,
+          friendlyId: "run_raw_leg",
+          organizationId: seed.organization.id,
+          projectId: seed.project.id,
+          runtimeEnvironmentId: seed.environment.id,
+        }),
+      });
+
+      // FAIL-BEFORE proof: a plain replica read (no client) hits the lagging replica → miss.
+      const viaReplica = await router.findRun(
+        { id: runId },
+        { select: { friendlyId: true } }
+        // no client → default replica
+      );
+      expect(viaReplica).toBeNull();
+      expect(legacyReplica.wasHit()).toBe(true);
+
+      // PASS-AFTER: read-your-writes with the control-plane WRITER resolves the fresh run.
+      // A fresh recording replica is used so `wasHit()` reflects this read only.
+      const legacyReplica2 = laggingReplica(prisma14);
+      const legacyStore2 = new PostgresRunStore({
+        prisma: prisma14,
+        readOnlyPrisma: legacyReplica2.client,
+        schemaVariant: "legacy",
+      });
+      const router2 = new RoutingRunStore({ new: newStore, legacy: legacyStore2 });
+      const viaWriter = await router2.findRun(
+        { id: runId },
+        { select: { friendlyId: true } },
+        prisma14 // control-plane WRITER → read-your-writes
+      );
+      expect(viaWriter).not.toBeNull();
+      expect((viaWriter as { friendlyId: string }).friendlyId).toBe("run_raw_leg");
+      // The read hit the WRITER, never the replica.
+      expect(legacyReplica2.wasHit()).toBe(false);
+
+      // findRunOrThrow: same behavior — writer resolves, replica would have thrown.
+      const legacyReplica3 = laggingReplica(prisma14);
+      const legacyStore3 = new PostgresRunStore({
+        prisma: prisma14,
+        readOnlyPrisma: legacyReplica3.client,
+        schemaVariant: "legacy",
+      });
+      const router3 = new RoutingRunStore({ new: newStore, legacy: legacyStore3 });
+      const orThrow = await router3.findRunOrThrow(
+        { id: runId },
+        { select: { friendlyId: true } },
+        prisma14
+      );
+      expect((orThrow as { friendlyId: string }).friendlyId).toBe("run_raw_leg");
+      expect(legacyReplica3.wasHit()).toBe(false);
+    }
+  );
+
+  // (b) NEW-resident (ksuid) run: born on the NEW DB (5434). The NEW replica lags. Passing the NEW
+  // WRITER as the read-your-writes client must resolve the run via the NEW writer, NOT its replica —
+  // and (proving the constraint that motivated the original client-drop) the control-plane writer is
+  // never leaked into the NEW query: each store reads its OWN writer.
+  heteroRunOpsPostgresTest(
+    "NEW ksuid: read-after-write via the NEW WRITER finds the fresh run despite NEW replica lag",
+    async ({ prisma14, prisma17 }) => {
+      const newReplica = laggingReplica(prisma17);
+      const newStore = new PostgresRunStore({
+        prisma: prisma17 as never,
+        readOnlyPrisma: newReplica.client as never,
+        schemaVariant: "dedicated",
+      });
+      const legacyStore = new PostgresRunStore({
+        prisma: prisma14,
+        readOnlyPrisma: prisma14,
+        schemaVariant: "legacy",
+      });
+      const router = new RoutingRunStore({ new: newStore, legacy: legacyStore });
+
+      const seed = seedEnvironmentDedicated("raw_new");
+      const runId = `run_${KSUID_27}`; // ksuid → NEW
+      await prisma17.taskRun.create({
+        data: taskRunData({
+          id: runId,
+          friendlyId: "run_raw_new",
+          organizationId: seed.organization.id,
+          projectId: seed.project.id,
+          runtimeEnvironmentId: seed.environment.id,
+        }),
+      });
+
+      // FAIL-BEFORE proof: a plain replica read hits the lagging NEW replica → miss.
+      const viaReplica = await router.findRun({ id: runId }, { select: { friendlyId: true } });
+      expect(viaReplica).toBeNull();
+      expect(newReplica.wasHit()).toBe(true);
+
+      // PASS-AFTER: read-your-writes with the NEW WRITER resolves the fresh run on the NEW DB.
+      const newReplica2 = laggingReplica(prisma17);
+      const newStore2 = new PostgresRunStore({
+        prisma: prisma17 as never,
+        readOnlyPrisma: newReplica2.client as never,
+        schemaVariant: "dedicated",
+      });
+      const router2 = new RoutingRunStore({ new: newStore2, legacy: legacyStore });
+      const viaWriter = await router2.findRun(
+        { id: runId },
+        { select: { friendlyId: true } },
+        prisma17 as never // NEW WRITER → read-your-writes
+      );
+      expect(viaWriter).not.toBeNull();
+      expect((viaWriter as { friendlyId: string }).friendlyId).toBe("run_raw_new");
+      // The read hit the NEW WRITER, never the NEW replica.
+      expect(newReplica2.wasHit()).toBe(false);
+
+      // Even passing the LEGACY (control-plane) WRITER as the read-your-writes signal resolves the
+      // ksuid run: the router routes by residency to the NEW store's OWN writer, never forwarding the
+      // control-plane client into the NEW DB. (This is the exact live shape — sessions/trigger pass
+      // the control-plane `prisma`, and the run may be NEW-resident under split-ON.)
+      const newReplica3 = laggingReplica(prisma17);
+      const newStore3 = new PostgresRunStore({
+        prisma: prisma17 as never,
+        readOnlyPrisma: newReplica3.client as never,
+        schemaVariant: "dedicated",
+      });
+      const router3 = new RoutingRunStore({ new: newStore3, legacy: legacyStore });
+      const viaControlPlaneWriter = await router3.findRun(
+        { id: runId },
+        { select: { friendlyId: true } },
+        prisma14 // control-plane WRITER (writer identity) — router routes to NEW's own writer
+      );
+      expect((viaControlPlaneWriter as { friendlyId: string }).friendlyId).toBe("run_raw_new");
+      expect(newReplica3.wasHit()).toBe(false);
+    }
+  );
+
+  // Guard: a plain replica read (no client, or a replica client) still routes to the replica — the
+  // fix must not turn every read into a primary read (which would defeat replica offload).
+  heteroRunOpsPostgresTest(
+    "plain reads still route to the replica (no read-your-writes escalation)",
+    async ({ prisma14, prisma17 }) => {
+      const legacyReplica = laggingReplica(prisma14);
+      const legacyStore = new PostgresRunStore({
+        prisma: prisma14,
+        readOnlyPrisma: legacyReplica.client,
+        schemaVariant: "legacy",
+      });
+      const newStore = new PostgresRunStore({
+        prisma: prisma17 as never,
+        readOnlyPrisma: prisma17 as never,
+        schemaVariant: "dedicated",
+      });
+      const router = new RoutingRunStore({ new: newStore, legacy: legacyStore });
+
+      const seed = await seedEnvironmentLegacy(prisma14, "plain_leg");
+      const runId = `run_${CUID_25}`;
+      await prisma14.taskRun.create({
+        data: taskRunData({
+          id: runId,
+          friendlyId: "run_plain_leg",
+          organizationId: seed.organization.id,
+          projectId: seed.project.id,
+          runtimeEnvironmentId: seed.environment.id,
+        }),
+      });
+
+      await router.findRun({ id: runId }, { select: { friendlyId: true } });
+      // No writer passed → the read went to the replica, exactly as before the fix.
+      expect(legacyReplica.wasHit()).toBe(true);
+    }
+  );
+});
diff --git a/internal-packages/run-store/src/runOpsStore.snapshots.test.ts b/internal-packages/run-store/src/runOpsStore.snapshots.test.ts
new file mode 100644
index 00000000000..564afa08131
--- /dev/null
+++ b/internal-packages/run-store/src/runOpsStore.snapshots.test.ts
@@ -0,0 +1,363 @@
+// RunStore run-ops persistence — snapshots, against the REAL dedicated split topology.
+//
+// `heteroRunOpsPostgresTest` gives prisma14 = the full control-plane schema (#legacy) and
+// prisma17 = a real `RunOpsPrismaClient` over the @internal/run-ops-database SUBSET schema (#new).
+// These were previously on the weaker `heteroPostgresTest` (full schema on BOTH sides), which could
+// not catch dedicated-subset behaviour differences — the entire point of the split.
+// On the subset there are no Organization/Project/RuntimeEnvironment models and no implicit M2M
+// join tables (`_completedWaitpoints` is the explicit `CompletedWaitpoint` model), so the snapshot
+// store must behave identically whether backed by the legacy implicit M2M or the dedicated explicit
+// join.
+//
+// The assertions still compare the store's behaviour across the two physical DBs (control-plane vs
+// dedicated): a snapshot created + read through the store yields the same observable result on both.
+
+import { heteroRunOpsPostgresTest, HETERO_PINNED_ICU_COLLATION } from "@internal/testcontainers";
+import type { PrismaClient } from "@trigger.dev/database";
+import type { RunOpsPrismaClient } from "@internal/run-ops-database";
+import { describe, expect } from "vitest";
+import { PostgresRunStore } from "./PostgresRunStore.js";
+import type { CreateRunInput, RunStoreSchemaVariant } from "./types.js";
+
+type AnyClient = PrismaClient | RunOpsPrismaClient;
+
+// Seeds (or fabricates) the owning organization / project / environment for a run.
+// "dedicated": the subset schema has no such models and the run-ops rows carry FK-free scalar ids,
+// so nothing is written and synthetic ids are returned.
+// Otherwise: the real rows the kept FKs require are created on the full schema.
+async function seedEnvironment(
+  prisma: AnyClient,
+  schemaVariant: RunStoreSchemaVariant,
+  slugSuffix: string
+) {
+  const isDedicated = schemaVariant === "dedicated";
+  if (isDedicated) {
+    return {
+      organization: { id: `org_${slugSuffix}` },
+      project: { id: `proj_${slugSuffix}` },
+      environment: { id: `env_${slugSuffix}` },
+    };
+  }
+
+  // Legacy path: narrow once instead of casting at every call site.
+  const legacyClient = prisma as PrismaClient;
+  const organization = await legacyClient.organization.create({
+    data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` },
+  });
+  const project = await legacyClient.project.create({
+    data: {
+      name: `Project ${slugSuffix}`,
+      slug: `project-${slugSuffix}`,
+      externalRef: `proj_${slugSuffix}`,
+      organizationId: organization.id,
+    },
+  });
+  const environment = await legacyClient.runtimeEnvironment.create({
+    data: {
+      type: "DEVELOPMENT",
+      slug: "dev",
+      projectId: project.id,
+      organizationId: organization.id,
+      apiKey: `tr_dev_${slugSuffix}`,
+      pkApiKey: `pk_dev_${slugSuffix}`,
+      shortcode: `short_${slugSuffix}`,
+    },
+  });
+  return { organization, project, environment };
+}
+
+// ownerEngine classifies by internal-id LENGTH after stripping a single leading `_`: 27 chars
+// → ksuid → NEW (#new / dedicated run-ops DB subset), 25 chars → cuid → LEGACY (#legacy / full schema).
+const KSUID_27 = "k".repeat(27); // → NEW residency, exercises the dedicated store
+const CUID_25 = "c".repeat(25); // → LEGACY residency, exercises the full-schema store
+
+/**
+ * Builds a `CreateRunInput` for a PENDING V2 run in a DEVELOPMENT environment, together with its
+ * initial RUN_CREATED snapshot ("Run was created"). Everything not taken from `params` is a fixed
+ * literal, so two runs built with the same params differ only in the ids supplied.
+ */
+function buildCreateRunInput(params: {
+  runId: string;
+  friendlyId: string;
+  taskIdentifier: string;
+  organizationId: string;
+  projectId: string;
+  runtimeEnvironmentId: string;
+}): CreateRunInput {
+  return {
+    data: {
+      id: params.runId,
+      engine: "V2",
+      status: "PENDING",
+      friendlyId: params.friendlyId,
+      runtimeEnvironmentId: params.runtimeEnvironmentId,
+      environmentType: "DEVELOPMENT",
+      organizationId: params.organizationId,
+      projectId: params.projectId,
+      taskIdentifier: params.taskIdentifier,
+      payload: '{"hello":"world"}',
+      payloadType: "application/json",
+      context: { foo: "bar" },
+      traceContext: { trace: "ctx" },
+      traceId: "trace_1",
+      spanId: "span_1",
+      runTags: ["alpha", "beta"],
+      queue: "task/my-task",
+      isTest: false,
+      taskEventStore: "taskEvent",
+      depth: 0,
+      createdAt: new Date("2024-01-01T00:00:00.000Z"),
+    },
+    snapshot: {
+      engine: "V2",
+      executionStatus: "RUN_CREATED",
+      description: "Run was created",
+      runStatus: "PENDING",
+      environmentId: params.runtimeEnvironmentId,
+      environmentType: "DEVELOPMENT",
+      projectId: params.projectId,
+      organizationId: params.organizationId,
+    },
+  };
+}
+
+/**
+ * Inserts a PENDING, MANUAL waitpoint directly through the given client. The idempotency key is
+ * derived from the waitpoint id.
+ */
+async function seedPendingWaitpoint(
+  prisma: AnyClient,
+  params: { id: string; friendlyId: string; projectId: string; environmentId: string }
+) {
+  return (prisma as PrismaClient).waitpoint.create({
+    data: {
+      id: params.id,
+      friendlyId: params.friendlyId,
+      type: "MANUAL",
+      status: "PENDING",
+      idempotencyKey: `idem_${params.id}`,
+      userProvidedIdempotencyKey: false,
+      projectId: params.projectId,
+      environmentId: params.environmentId,
+    },
+  });
+}
+
+/** Builds a `PostgresRunStore` that uses the same client for both writes and reads. */
+function makeStore(prisma: AnyClient, schemaVariant: RunStoreSchemaVariant) {
+  return new PostgresRunStore({
+    prisma: prisma as never,
+    readOnlyPrisma: prisma as never,
+    schemaVariant,
+  });
+}
+
+// Strip the prisma-managed / per-DB id fields so two rows born on different physical DBs
+// (legacy full schema vs dedicated subset) compare field-for-field for behavioural parity.
+// Works on a shallow copy; the input row is left untouched.
+function normalizeSnapshot(row: Record<string, unknown>) {
+  const r = { ...row };
+  delete r.id;
+  delete r.runId;
+  delete r.previousSnapshotId;
+  delete r.createdAt;
+  delete r.updatedAt;
+  delete r.environmentId;
+  delete r.projectId;
+  delete r.organizationId;
+  return r;
+}
+
+describe("RunStore run-ops persistence — snapshots", () => {
+  // an identical run + ≥2 snapshots (one invalid, one valid) seeded on #legacy (full schema)
+  // and #new (dedicated subset) yield a deep-equal `findLatestExecutionSnapshot` row, and it is the
+  // valid one — proving the dedicated store's group-A hydration does not perturb the scalar columns.
+  heteroRunOpsPostgresTest(
+    "snapshot findLatest is behaviourally identical across #legacy and #new",
+    async ({ prisma14, prisma17 }) => {
+      const seed = async (
+        prisma: AnyClient,
+        schemaVariant: RunStoreSchemaVariant,
+        runId: string,
+        suffix: string
+      ) => {
+        const store = makeStore(prisma, schemaVariant);
+        const env = await seedEnvironment(prisma, schemaVariant, suffix);
+        await store.createRun(
+          buildCreateRunInput({
+            runId,
+            friendlyId: `run_friendly_latest_${suffix}`,
+            taskIdentifier: "my-task",
+            organizationId: env.organization.id,
+            projectId: env.project.id,
+            runtimeEnvironmentId: env.environment.id,
+          })
+        );
+
+        const ids = {
+          environmentId: env.environment.id,
+          environmentType: "DEVELOPMENT" as const,
+          projectId: env.project.id,
+          organizationId: env.organization.id,
+        };
+
+        // An invalid snapshot (error set) that must NOT be returned by findLatest.
+        await store.createExecutionSnapshot({
+          run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+          snapshot: { executionStatus: "EXECUTING", description: "invalid one" },
+          error: "boom",
+          ...ids,
+        });
+        // The valid snapshot created last — this is the one findLatest must return.
+        const valid = await store.createExecutionSnapshot({
+          run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+          snapshot: { executionStatus: "EXECUTING_WITH_WAITPOINTS", description: "valid latest" },
+          ...ids,
+        });
+        return { store, validId: valid.id };
+      };
+
+      const legacyRunId = `run_${CUID_25}`; // → #legacy (full schema)
+      const newRunId = `run_${KSUID_27}`; // → #new (dedicated subset)
+      const seed14 = await seed(prisma14, "legacy", legacyRunId, "sa14");
+      const seed17 = await seed(prisma17, "dedicated", newRunId, "sa17");
+
+      const latest14 = await seed14.store.findLatestExecutionSnapshot(legacyRunId);
+      const latest17 = await seed17.store.findLatestExecutionSnapshot(newRunId);
+
+      expect(latest14).not.toBeNull();
+      expect(latest17).not.toBeNull();
+      // The valid snapshot wins over the earlier invalid one.
+      expect(latest14!.id).toBe(seed14.validId);
+      expect(latest17!.id).toBe(seed17.validId);
+      expect(latest14!.isValid).toBe(true);
+      expect(latest14!.description).toBe("valid latest");
+      expect(latest17!.isValid).toBe(true);
+      expect(latest17!.description).toBe("valid latest");
+
+      // Compare the persisted columns (drop relation arrays + per-DB ids). The dedicated store
+      // hydrates `completedWaitpoints` from the explicit CompletedWaitpoint join, the legacy store
+      // from the implicit M2M — both stripped here, leaving the scalar columns to compare.
+      const strip = (
+        row: NonNullable<Awaited<ReturnType<PostgresRunStore["findLatestExecutionSnapshot"]>>>
+      ) => {
+        const { completedWaitpoints, checkpoint, ...rest } = row;
+        return normalizeSnapshot(rest as Record<string, unknown>);
+      };
+      expect(strip(latest14!)).toEqual(strip(latest17!));
+    }
+  );
+
+  // completedWaitpoints round-trips through the join (implicit `_completedWaitpoints` on legacy,
+  // explicit `CompletedWaitpoint` on the dedicated subset), and the derived completedWaitpointOrder
+  // preserves the supplied index order, on both stores.
+  heteroRunOpsPostgresTest(
+    "completedWaitpoints round-trip preserves order across #legacy and #new",
+    async ({ prisma14, prisma17 }) => {
+      const run = async (
+        prisma: AnyClient,
+        schemaVariant: RunStoreSchemaVariant,
+        runId: string,
+        suffix: string
+      ) => {
+        const store = makeStore(prisma, schemaVariant);
+        const env = await seedEnvironment(prisma, schemaVariant, suffix);
+        await store.createRun(
+          buildCreateRunInput({
+            runId,
+            friendlyId: `run_friendly_cw_${suffix}`,
+            taskIdentifier: "my-task",
+            organizationId: env.organization.id,
+            projectId: env.project.id,
+            runtimeEnvironmentId: env.environment.id,
+          })
+        );
+
+        const w1 = `wp_${suffix}_1`;
+        const w2 = `wp_${suffix}_2`;
+        await seedPendingWaitpoint(prisma, {
+          id: w1,
+          friendlyId: `waitpoint_${suffix}_1`,
+          projectId: env.project.id,
+          environmentId: env.environment.id,
+        });
+        await seedPendingWaitpoint(prisma, {
+          id: w2,
+          friendlyId: `waitpoint_${suffix}_2`,
+          projectId: env.project.id,
+          environmentId: env.environment.id,
+        });
+
+        const snapshot = await store.createExecutionSnapshot({
+          run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+          snapshot: {
+            executionStatus: "EXECUTING_WITH_WAITPOINTS",
+            description: "with waitpoints",
+          },
+          completedWaitpoints: [
+            { id: w1, index: 0 },
+            { id: w2, index: 1 },
+          ],
+          environmentId: env.environment.id,
+          environmentType: "DEVELOPMENT",
+          projectId: env.project.id,
+          organizationId: env.organization.id,
+        });
+
+        const joinIds = await store.findSnapshotCompletedWaitpointIds(snapshot.id);
+        return { w1, w2, joinIds, order: snapshot.completedWaitpointOrder };
+      };
+
+      const r14 = await run(prisma14, "legacy", `run_${CUID_25}`, "sb14");
+      const r17 = await run(prisma17, "dedicated", `run_${KSUID_27}`, "sb17");
+
+      // The join links the snapshot to both waitpoints (set-equal) on both stores.
+      expect([...r14.joinIds].sort()).toEqual([r14.w1, r14.w2].sort());
+      expect([...r17.joinIds].sort()).toEqual([r17.w1, r17.w2].sort());
+
+      // The derived order column reflects the supplied index order, identically per store.
+      expect(r14.order).toEqual([r14.w1, r14.w2]);
+      expect(r17.order).toEqual([r17.w1, r17.w2]);
+    }
+  );
+
+  // a collation-sensitive ORDER BY over a text column pinned to the shared ICU collation
+  // (`und-x-icu`, present on both the #legacy container and the #new container) returns the
+  // identical sequence of snapshot descriptions on #legacy and #new. The pin keeps the comparison a
+  // proof of the split rather than of a default-collation difference between the two DBs.
+  heteroRunOpsPostgresTest(
+    "snapshot ORDER BY pinned to the shared ICU collation is identical across #legacy and #new",
+    async ({ prisma14, prisma17 }) => {
+      const descriptions = ["Zebra", "apple", "Apple", "éclair", "banana", "_underscore"];
+
+      const seed = async (
+        prisma: AnyClient,
+        schemaVariant: RunStoreSchemaVariant,
+        runId: string,
+        suffix: string
+      ) => {
+        const store = makeStore(prisma, schemaVariant);
+        const env = await seedEnvironment(prisma, schemaVariant, suffix);
+        await store.createRun(
+          buildCreateRunInput({
+            runId,
+            friendlyId: `run_friendly_order_${suffix}`,
+            taskIdentifier: "my-task",
+            organizationId: env.organization.id,
+            projectId: env.project.id,
+            runtimeEnvironmentId: env.environment.id,
+          })
+        );
+        for (const description of descriptions) {
+          await store.createExecutionSnapshot({
+            run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
+            snapshot: { executionStatus: "EXECUTING", description },
+            environmentId: env.environment.id,
+            environmentType: "DEVELOPMENT",
+            projectId: env.project.id,
+            organizationId: env.organization.id,
+          });
+        }
+      };
+
+      await seed(prisma14, "legacy", `run_${CUID_25}`, "sc14");
+      await seed(prisma17, "dedicated", `run_${KSUID_27}`, "sc17");
+
+      // The initial "Run was created" snapshot written by createRun is filtered out, so only the
+      // descriptions seeded above are ordered and compared.
+      const orderedDescriptions = async (client: AnyClient) => {
+        const rows = await (client as PrismaClient).$queryRawUnsafe<{ description: string }[]>(
+          `SELECT "description" FROM "TaskRunExecutionSnapshot" WHERE "description" != 'Run was created' ORDER BY "description" COLLATE "${HETERO_PINNED_ICU_COLLATION}" ASC`
+        );
+        return rows.map((r) => r.description);
+      };
+
+      const ordered14 = await orderedDescriptions(prisma14);
+      const ordered17 = await orderedDescriptions(prisma17);
+
+      expect(ordered14).toEqual(ordered17);
+      expect(ordered14).toHaveLength(descriptions.length);
+    }
+  );
+});
diff --git a/internal-packages/run-store/src/runOpsStore.test.ts b/internal-packages/run-store/src/runOpsStore.test.ts
new file mode 100644
index 00000000000..5980a5c24e7
--- /dev/null
+++ b/internal-packages/run-store/src/runOpsStore.test.ts
@@ -0,0 +1,2120 @@
+import { heteroPostgresTest, heteroRunOpsPostgresTest } from "@internal/testcontainers";
+import type { PrismaClient, TaskRunStatus } from "@trigger.dev/database";
+import type { RunOpsPrismaClient } from "@internal/run-ops-database";
+import { describe, expect } from "vitest";
+import { PostgresRunStore } from "./PostgresRunStore.js";
+import { RoutingRunStore } from "./runOpsStore.js";
+import type { CreateRunInput } from "./types.js";
+
+// 25-char internal id → cuid → LEGACY; 27-char internal id → ksuid → NEW.
+const CUID_25 = "c".repeat(25);
+const KSUID_27 = "k".repeat(27);
+
+// Creates the organization → project → DEVELOPMENT environment chain a run row hangs off.
+// `slugSuffix` is embedded in every slug / key written here.
+async function seedEnvironment(prisma: PrismaClient, slugSuffix: string) {
+  const org = await prisma.organization.create({
+    data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` },
+  });
+
+  const proj = await prisma.project.create({
+    data: {
+      name: `Project ${slugSuffix}`,
+      slug: `project-${slugSuffix}`,
+      externalRef: `proj_${slugSuffix}`,
+      organizationId: org.id,
+    },
+  });
+
+  const env = await prisma.runtimeEnvironment.create({
+    data: {
+      type: "DEVELOPMENT",
+      slug: "dev",
+      projectId: proj.id,
+      organizationId: org.id,
+      apiKey: `tr_dev_${slugSuffix}`,
+      pkApiKey: `pk_dev_${slugSuffix}`,
+      shortcode: `short_${slugSuffix}`,
+    },
+  });
+
+  return { organization: org, project: proj, environment: env };
+}
+
+// Builds the input for a PENDING V2 run plus its initial RUN_CREATED snapshot. Only the ids and
+// the task identifier come from `params`; every other column is a fixed literal.
+function buildCreateRunInput(params: {
+  runId: string;
+  friendlyId: string;
+  taskIdentifier: string;
+  organizationId: string;
+  projectId: string;
+  runtimeEnvironmentId: string;
+}): CreateRunInput {
+  const { runId, friendlyId, taskIdentifier, organizationId, projectId, runtimeEnvironmentId } =
+    params;
+
+  const input: CreateRunInput = {
+    data: {
+      id: runId,
+      engine: "V2",
+      status: "PENDING",
+      friendlyId,
+      runtimeEnvironmentId,
+      environmentType: "DEVELOPMENT",
+      organizationId,
+      projectId,
+      taskIdentifier,
+      payload: '{"hello":"world"}',
+      payloadType: "application/json",
+      context: { foo: "bar" },
+      traceContext: { trace: "ctx" },
+      traceId: "trace_1",
+      spanId: "span_1",
+      runTags: ["alpha", "beta"],
+      queue: "task/my-task",
+      isTest: false,
+      taskEventStore: "taskEvent",
+      depth: 0,
+      createdAt: new Date("2024-01-01T00:00:00.000Z"),
+    },
+    snapshot: {
+      engine: "V2",
+      executionStatus: "RUN_CREATED",
+      description: "Run was created",
+      runStatus: "PENDING",
+      environmentId: runtimeEnvironmentId,
+      environmentType: "DEVELOPMENT",
+      projectId,
+      organizationId,
+    },
+  };
+  return input;
+}
+
+// Strip the
prisma-managed/connection-volatile fields so two rows born on different +// physical DBs can be compared field-for-field for cross-version byte-identity. +function normalizeRow(row: Record) { + const { id, createdAt, updatedAt, ...rest } = row as { + id: unknown; + createdAt: unknown; + updatedAt: unknown; + } & Record; + return rest; +} + +describe("RoutingRunStore (TaskRun-core)", () => { + // Test A: identical CreateRunInput through a PostgresRunStore over PG14 and over PG17 + // yields deep-equal persisted rows (cross-version byte-identity). + heteroPostgresTest( + "TaskRun create/find round-trip is byte-identical across PG14 and PG17", + async ({ prisma14, prisma17 }) => { + const store14 = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const store17 = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + + const seed14 = await seedEnvironment(prisma14, "a14"); + const seed17 = await seedEnvironment(prisma17, "a17"); + + const runId = "run_roundtrip_1"; + await store14.createRun( + buildCreateRunInput({ + runId, + friendlyId: "run_friendly_1", + taskIdentifier: "my-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + await store17.createRun( + buildCreateRunInput({ + runId, + friendlyId: "run_friendly_1", + taskIdentifier: "my-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + + const row14 = await store14.findRun({ id: runId }, prisma14); + const row17 = await store17.findRun({ id: runId }, prisma17); + + expect(row14).not.toBeNull(); + expect(row17).not.toBeNull(); + + // Drop env/project/org ids (differ per DB seed) plus the prisma-managed fields, + // then assert every remaining persisted column is identical across versions. 
+ const strip = (row: Record) => { + const r = { ...normalizeRow(row) }; + delete r.runtimeEnvironmentId; + delete r.projectId; + delete r.organizationId; + return r; + }; + expect(strip(row14 as Record)).toEqual( + strip(row17 as Record) + ); + // The payload / JSON / array / scalar columns specifically survive byte-identically. + expect(row14!.payload).toBe('{"hello":"world"}'); + expect(row17!.payload).toBe('{"hello":"world"}'); + expect(row14!.runTags).toEqual(["alpha", "beta"]); + expect(row17!.runTags).toEqual(["alpha", "beta"]); + expect(row14!.traceContext).toEqual({ trace: "ctx" }); + expect(row17!.traceContext).toEqual({ trace: "ctx" }); + expect(row14!.createdAt.toISOString()).toBe(row17!.createdAt.toISOString()); + } + ); + + // Test B: a collation-sensitive ORDER BY pinned to the shared ICU collation returns + // the identical sequence on PG14 and PG17 (keyset-cursor / pagination parity guard). + heteroPostgresTest( + "ORDER BY pinned to the shared ICU collation is identical across PG14 and PG17", + async ({ prisma14, prisma17, pinnedCollation }) => { + const store14 = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const store17 = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const seed14 = await seedEnvironment(prisma14, "b14"); + const seed17 = await seedEnvironment(prisma17, "b17"); + + // Mixed-case, punctuated, accented values where C-locale vs ICU sort differs. 
+ const friendlyIds = [ + "run_Zebra", + "run_apple", + "run_Apple", + "run_éclair", + "run_banana", + "run__underscore", + ]; + + let n = 0; + for (const fid of friendlyIds) { + const idSuffix = `${n++}`; + await store14.createRun( + buildCreateRunInput({ + runId: `run_b14_${idSuffix}`, + friendlyId: fid, + taskIdentifier: fid, + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + await store17.createRun( + buildCreateRunInput({ + runId: `run_b17_${idSuffix}`, + friendlyId: fid, + taskIdentifier: fid, + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + } + + // Prisma `orderBy` cannot express an explicit COLLATE, so prove column-level + // collation parity via $queryRaw with the pinned ICU collation on each client. + const orderedFriendlyIds = async (client: PrismaClient) => { + const rows = await client.$queryRawUnsafe<{ friendlyId: string }[]>( + `SELECT "friendlyId" FROM "TaskRun" ORDER BY "friendlyId" COLLATE "${pinnedCollation}" ASC` + ); + return rows.map((r) => r.friendlyId); + }; + + const ordered14 = await orderedFriendlyIds(prisma14); + const ordered17 = await orderedFriendlyIds(prisma17); + + expect(ordered14).toEqual(ordered17); + expect(ordered14).toHaveLength(friendlyIds.length); + } + ); + + // Test C: the router writes new runs to NEW and routes existing-id finds by residency. 
+ heteroPostgresTest( + "RoutingRunStore selects the underlying store by residency", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "c14"); + const seed17 = await seedEnvironment(prisma17, "c17"); + + // (i) createRun lands on NEW, never on LEGACY. + const bornId = "run_born_on_new"; + await router.createRun( + buildCreateRunInput({ + runId: bornId, + friendlyId: "run_born", + taskIdentifier: "my-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + expect(await prisma17.taskRun.findUnique({ where: { id: bornId } })).not.toBeNull(); + expect(await prisma14.taskRun.findUnique({ where: { id: bornId } })).toBeNull(); + + // (ii) seed a cuid-length (LEGACY) row on the legacy DB and a ksuid-length (NEW) row on + // the new DB, then prove residency selection via ownerEngine length classification. 
+ const legacyRunId = `run_${CUID_25}`; + const newRunId = `run_${KSUID_27}`; + await legacyStore.createRun( + buildCreateRunInput({ + runId: legacyRunId, + friendlyId: "run_legacy", + taskIdentifier: "legacy-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + await newStore.createRun( + buildCreateRunInput({ + runId: newRunId, + friendlyId: "run_new", + taskIdentifier: "new-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + + const legacyFound = await router.findRun({ id: legacyRunId }); + const newFound = await router.findRun({ id: newRunId }); + + expect(legacyFound?.id).toBe(legacyRunId); + expect(legacyFound?.taskIdentifier).toBe("legacy-task"); + expect(newFound?.id).toBe(newRunId); + expect(newFound?.taskIdentifier).toBe("new-task"); + + // The LEGACY-residency id must NOT resolve from the NEW store, and vice versa. + expect(await newStore.findRun({ id: legacyRunId })).toBeNull(); + expect(await legacyStore.findRun({ id: newRunId })).toBeNull(); + } + ); + + // Test C2: create routes by the MINTED id-kind, not hardcoded NEW. + // A cuid (LEGACY) child must be physically created on LEGACY, never NEW. + heteroPostgresTest( + "createRun routes by minted residency: a cuid child is born on LEGACY", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "c2_14"); + + // A cuid-length (LEGACY-residency) child id — e.g. an inherited-residency child of a legacy parent. 
+ const legacyChildId = `run_${CUID_25}`; + await router.createRun( + buildCreateRunInput({ + runId: legacyChildId, + friendlyId: "run_legacy_child", + taskIdentifier: "legacy-child-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + + // Born on LEGACY, NOT on NEW. + expect(await prisma14.taskRun.findUnique({ where: { id: legacyChildId } })).not.toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: legacyChildId } })).toBeNull(); + } + ); + + // Test C4: write routing is pure id-shape — a cuid run's writes go to LEGACY. + heteroPostgresTest( + "writes route by id-shape (LEGACY for cuid)", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const legacyId = `run_${CUID_25}`; + const seed14 = await seedEnvironment(prisma14, "c4_14"); + await legacyStore.createRun( + buildCreateRunInput({ + runId: legacyId, + friendlyId: "run_legacy_write", + taskIdentifier: "legacy-write-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + + const result = await router.updateMetadata( + legacyId, + { + metadata: '{"y":2}', + metadataVersion: { increment: 1 }, + updatedAt: new Date("2024-02-02T00:00:00.000Z"), + }, + {} + ); + expect(result.count).toBe(1); + const onLegacy = await prisma14.taskRun.findUnique({ where: { id: legacyId } }); + expect(onLegacy?.metadata).toBe('{"y":2}'); + } + ); + + // Test D: single-DB / passthrough — both slots are the same store over one client. 
+ heteroPostgresTest("single-DB binds one client (passthrough)", async ({ prisma14, prisma17 }) => { + const store = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const router = new RoutingRunStore({ new: store, legacy: store }); + + const seed = await seedEnvironment(prisma14, "d14"); + + // Use a ksuid-length (NEW-residency) id to exercise the route; in single-DB both + // slots are the same store, so the round-trip must succeed on the one client. + const runId = `run_${KSUID_27}`; + await router.createRun( + buildCreateRunInput({ + runId, + friendlyId: "run_passthrough", + taskIdentifier: "passthrough-task", + organizationId: seed.organization.id, + projectId: seed.project.id, + runtimeEnvironmentId: seed.environment.id, + }) + ); + + const found = await router.findRun({ id: runId }); + expect(found?.id).toBe(runId); + expect(found?.taskIdentifier).toBe("passthrough-task"); + + // The single client is the only one that holds the row; the second fixture DB was + // never touched by the router (no second connection opened). + expect(await prisma14.taskRun.findUnique({ where: { id: runId } })).not.toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: runId } })).toBeNull(); + }); +}); + +describe("BatchTaskRun group", () => { + function batchCreateData(params: { + id: string; + friendlyId: string; + runtimeEnvironmentId: string; + runCount: number; + }) { + return { + id: params.id, + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + runCount: params.runCount, + runIds: [] as string[], + payload: '{"hello":"world"}', + payloadType: "application/json", + options: { foo: "bar" }, + batchVersion: "runengine:v1", + }; + } + + // Create/find/update round-trip on PostgresRunStore, asserted byte-identical across + // PG14 and PG17 (the text[] runIds array + JSON payload/options survive cross-version). 
+ heteroPostgresTest( + "BatchTaskRun create/find/update round-trip is byte-identical across PG14 and PG17", + async ({ prisma14, prisma17 }) => { + const store14 = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const store17 = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + + const seed14 = await seedEnvironment(prisma14, "batcha14"); + const seed17 = await seedEnvironment(prisma17, "batcha17"); + + const batchId = "batch_roundtrip_1"; + const created14 = await store14.createBatchTaskRun( + batchCreateData({ + id: batchId, + friendlyId: "batch_friendly_1", + runtimeEnvironmentId: seed14.environment.id, + runCount: 2, + }) + ); + const created17 = await store17.createBatchTaskRun( + batchCreateData({ + id: batchId, + friendlyId: "batch_friendly_1", + runtimeEnvironmentId: seed17.environment.id, + runCount: 2, + }) + ); + + // create returns the full default row (the onBatchTaskRunCreated event shape). + expect(created14.runCount).toBe(2); + expect(created14.batchVersion).toBe("runengine:v1"); + expect(created14.runIds).toEqual([]); + + // find defaults to the primary client (worker reads the just-written row). + const found14 = await store14.findBatchTaskRunById(batchId); + const found17 = await store17.findBatchTaskRunById(batchId); + expect(found14?.id).toBe(batchId); + expect(found17?.id).toBe(batchId); + + const strip = (row: Record) => { + const { id, createdAt, updatedAt, runtimeEnvironmentId, ...rest } = row; + return rest; + }; + expect(strip(found14 as Record)).toEqual( + strip(found17 as Record) + ); + expect(found14!.payload).toBe('{"hello":"world"}'); + expect(found17!.payload).toBe('{"hello":"world"}'); + expect(found14!.options).toEqual({ foo: "bar" }); + expect(found17!.options).toEqual({ foo: "bar" }); + + // update pushes runIds + increments processingJobsCount; the select narrows the row. 
+ const updated14 = await store14.updateBatchTaskRun({ + where: { id: batchId }, + data: { runIds: { push: ["run_a", "run_b"] }, processingJobsCount: { increment: 2 } }, + select: { processingJobsCount: true, runCount: true }, + }); + const updated17 = await store17.updateBatchTaskRun({ + where: { id: batchId }, + data: { runIds: { push: ["run_a", "run_b"] }, processingJobsCount: { increment: 2 } }, + select: { processingJobsCount: true, runCount: true }, + }); + expect(updated14).toEqual({ processingJobsCount: 2, runCount: 2 }); + expect(updated17).toEqual({ processingJobsCount: 2, runCount: 2 }); + + // the runIds array survived the push on both versions. + const reread14 = await store14.findBatchTaskRunById(batchId); + const reread17 = await store17.findBatchTaskRunById(batchId); + expect(reread14!.runIds).toEqual(["run_a", "run_b"]); + expect(reread17!.runIds).toEqual(["run_a", "run_b"]); + } + ); + + // Unclassifiable id falls back to NEW; find probe also hits NEW first, so round-trip stays on NEW. + heteroPostgresTest( + "RoutingRunStore routes BatchTaskRun create/find/update to NEW", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed17 = await seedEnvironment(prisma17, "batchb17"); + + const batchId = "batch_born_on_new"; + await router.createBatchTaskRun( + batchCreateData({ + id: batchId, + friendlyId: "batch_born", + runtimeEnvironmentId: seed17.environment.id, + runCount: 1, + }) + ); + + // born on NEW, never on LEGACY. + expect(await prisma17.batchTaskRun.findUnique({ where: { id: batchId } })).not.toBeNull(); + expect(await prisma14.batchTaskRun.findUnique({ where: { id: batchId } })).toBeNull(); + + // find + update route to NEW as well. 
+ expect((await router.findBatchTaskRunById(batchId))?.id).toBe(batchId); + const updated = await router.updateBatchTaskRun({ + where: { id: batchId }, + data: { runIds: { push: ["run_x"] }, processingJobsCount: { increment: 1 } }, + select: { processingJobsCount: true, runCount: true }, + }); + expect(updated).toEqual({ processingJobsCount: 1, runCount: 1 }); + expect(await prisma14.batchTaskRun.findUnique({ where: { id: batchId } })).toBeNull(); + } + ); + + // Single-DB passthrough: both slots are the same store over one client. + heteroPostgresTest( + "single-DB binds one client for BatchTaskRun (passthrough)", + async ({ prisma14, prisma17 }) => { + const store = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const router = new RoutingRunStore({ new: store, legacy: store }); + + const seed = await seedEnvironment(prisma14, "batchd14"); + + const batchId = "batch_passthrough"; + await router.createBatchTaskRun( + batchCreateData({ + id: batchId, + friendlyId: "batch_passthrough", + runtimeEnvironmentId: seed.environment.id, + runCount: 1, + }) + ); + + expect((await router.findBatchTaskRunById(batchId))?.id).toBe(batchId); + expect(await prisma14.batchTaskRun.findUnique({ where: { id: batchId } })).not.toBeNull(); + expect(await prisma17.batchTaskRun.findUnique({ where: { id: batchId } })).toBeNull(); + } + ); + + heteroPostgresTest( + "findBatchTaskRunById routes ksuid→NEW and cuid→LEGACY", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "p8a_cuid14"); + const seed17 = await seedEnvironment(prisma17, "p8a_ksuid17"); + + await newStore.createBatchTaskRun( + batchCreateData({ + id: KSUID_27, + friendlyId: "batch_ksuid_p8a", + 
runtimeEnvironmentId: seed17.environment.id, + runCount: 1, + }) + ); + await legacyStore.createBatchTaskRun( + batchCreateData({ + id: CUID_25, + friendlyId: "batch_cuid_p8a", + runtimeEnvironmentId: seed14.environment.id, + runCount: 1, + }) + ); + + expect((await router.findBatchTaskRunById(KSUID_27))?.id).toBe(KSUID_27); + expect((await router.findBatchTaskRunById(CUID_25))?.id).toBe(CUID_25); + } + ); + + heteroPostgresTest( + "updateBatchTaskRun routes cuid→LEGACY and ksuid→NEW", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "p8a_upd14"); + const seed17 = await seedEnvironment(prisma17, "p8a_upd17"); + + const ksuidBatchId = `${KSUID_27.slice(0, -1)}u`; + const cuidBatchId = `${CUID_25.slice(0, -1)}u`; + + await newStore.createBatchTaskRun( + batchCreateData({ + id: ksuidBatchId, + friendlyId: "batch_ksuid_upd", + runtimeEnvironmentId: seed17.environment.id, + runCount: 2, + }) + ); + await legacyStore.createBatchTaskRun( + batchCreateData({ + id: cuidBatchId, + friendlyId: "batch_cuid_upd", + runtimeEnvironmentId: seed14.environment.id, + runCount: 2, + }) + ); + + const updNew = await router.updateBatchTaskRun({ + where: { id: ksuidBatchId }, + data: { processingJobsCount: { increment: 1 } }, + select: { processingJobsCount: true, runCount: true }, + }); + expect(updNew).toEqual({ processingJobsCount: 1, runCount: 2 }); + expect(await prisma14.batchTaskRun.findUnique({ where: { id: ksuidBatchId } })).toBeNull(); + + const updLegacy = await router.updateBatchTaskRun({ + where: { id: cuidBatchId }, + data: { processingJobsCount: { increment: 1 } }, + select: { processingJobsCount: true, runCount: true }, + }); + expect(updLegacy).toEqual({ processingJobsCount: 1, 
runCount: 2 }); + expect(await prisma17.batchTaskRun.findUnique({ where: { id: cuidBatchId } })).toBeNull(); + } + ); + + heteroPostgresTest( + "findBatchTaskRunById({ include: { items: true } }) returns BatchTaskRunItems", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed17 = await seedEnvironment(prisma17, "p8a_inc17"); + const ksuidBatchId = `${KSUID_27.slice(0, -2)}in`; + + await newStore.createBatchTaskRun( + batchCreateData({ + id: ksuidBatchId, + friendlyId: "batch_inc_p8a", + runtimeEnvironmentId: seed17.environment.id, + runCount: 1, + }) + ); + + const runId = `${KSUID_27.slice(0, -3)}run`; + await prisma17.taskRun.create({ + data: { + id: runId, + engine: "V2", + status: "PENDING", + friendlyId: `run_${runId}`, + runtimeEnvironmentId: seed17.environment.id, + environmentType: "DEVELOPMENT", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + taskIdentifier: "inc-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t1", + spanId: "s1", + queue: "task/inc-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + }, + }); + await prisma17.batchTaskRunItem.create({ + data: { batchTaskRunId: ksuidBatchId, taskRunId: runId, status: "PENDING" }, + }); + + const withItems = await router.findBatchTaskRunById(ksuidBatchId, { + include: { items: true }, + }); + expect(withItems?.items).toBeDefined(); + expect(withItems?.items?.length).toBe(1); + expect(withItems?.items?.[0]?.taskRunId).toBe(runId); + } + ); + + heteroPostgresTest( + "createBatchTaskRun routes ksuid→NEW and cuid→LEGACY", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const 
newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "p8c_cuid14"); + const seed17 = await seedEnvironment(prisma17, "p8c_ksuid17"); + + const ksuidBatchId = `${KSUID_27.slice(0, -2)}c1`; + const cuidBatchId = `${CUID_25.slice(0, -2)}c1`; + + await router.createBatchTaskRun( + batchCreateData({ + id: ksuidBatchId, + friendlyId: "batch_p8c_ksuid", + runtimeEnvironmentId: seed17.environment.id, + runCount: 1, + }) + ); + expect( + await prisma17.batchTaskRun.findUnique({ where: { id: ksuidBatchId } }) + ).not.toBeNull(); + expect(await prisma14.batchTaskRun.findUnique({ where: { id: ksuidBatchId } })).toBeNull(); + + await router.createBatchTaskRun( + batchCreateData({ + id: cuidBatchId, + friendlyId: "batch_p8c_cuid", + runtimeEnvironmentId: seed14.environment.id, + runCount: 1, + }) + ); + expect(await prisma14.batchTaskRun.findUnique({ where: { id: cuidBatchId } })).not.toBeNull(); + expect(await prisma17.batchTaskRun.findUnique({ where: { id: cuidBatchId } })).toBeNull(); + } + ); + + // Probe: a ksuid-id batch physically resident on LEGACY (written by batchTriggerV3 raw + // to the control-plane) must be found; strict id-routing (ksuid→NEW only) would miss it. 
+ heteroPostgresTest( + "findBatchTaskRunById probe finds ksuid-id batch resident on LEGACY (cross-residency)", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "p8c_probe14"); + const seed17 = await seedEnvironment(prisma17, "p8c_probe17"); + + const ksuidOnLegacy = `${KSUID_27.slice(0, -2)}pl`; + await legacyStore.createBatchTaskRun( + batchCreateData({ + id: ksuidOnLegacy, + friendlyId: "batch_p8c_probe_legacy", + runtimeEnvironmentId: seed14.environment.id, + runCount: 1, + }) + ); + expect( + await prisma14.batchTaskRun.findUnique({ where: { id: ksuidOnLegacy } }) + ).not.toBeNull(); + expect(await prisma17.batchTaskRun.findUnique({ where: { id: ksuidOnLegacy } })).toBeNull(); + + expect((await router.findBatchTaskRunById(ksuidOnLegacy))?.id).toBe(ksuidOnLegacy); + + const ksuidOnNew = `${KSUID_27.slice(0, -2)}pn`; + await newStore.createBatchTaskRun( + batchCreateData({ + id: ksuidOnNew, + friendlyId: "batch_p8c_probe_new", + runtimeEnvironmentId: seed17.environment.id, + runCount: 1, + }) + ); + expect((await router.findBatchTaskRunById(ksuidOnNew))?.id).toBe(ksuidOnNew); + } + ); + + // A BATCH-completion waitpoint (cuid own-id, `completedByBatchId` = ksuid batch on NEW) must be + // born on NEW alongside its batch. On the control-plane DB (prisma14) the Waitpoint→BatchTaskRun + // FK is enforced, so routing by the waitpoint's own cuid id-shape would land it on LEGACY and + // FK-violate against the absent batch. The dedicated run-ops schema carries `completedByBatchId` as a scalar. 
+ heteroRunOpsPostgresTest( + "createWaitpoint co-locates a BATCH-completion waitpoint with its batch on NEW", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ + prisma: prisma17 as any, + readOnlyPrisma: prisma17 as any, + }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + // The ksuid batch lives on NEW only — never on the control-plane DB. + const batchId = `${KSUID_27.slice(0, -2)}bw`; + await prisma17.batchTaskRun.create({ + data: { + id: batchId, + friendlyId: "batch_wp_residency", + runtimeEnvironmentId: "synthetic-env-id", + runCount: 1, + payload: "{}", + payloadType: "application/json", + batchVersion: "runengine:v1", + }, + }); + + // The waitpoint's OWN id is a cuid (→ would route to LEGACY by id-shape), but it points at + // the NEW-resident batch. It must follow the batch, not its own id. + const cuidWp = `waitpoint_${CUID_25}`; + await router.createWaitpoint({ + data: { + id: cuidWp, + friendlyId: "waitpoint_batch_residency", + type: "BATCH", + idempotencyKey: batchId, + userProvidedIdempotencyKey: false, + completedByBatchId: batchId, + projectId: "synthetic-project-id", + environmentId: "synthetic-env-id", + }, + }); + + // Lands on NEW (no FK, co-resident with the batch); never on the control-plane DB + // (where the create would have FK-violated). + expect(await prisma17.waitpoint.findUnique({ where: { id: cuidWp } })).not.toBeNull(); + expect(await prisma14.waitpoint.findUnique({ where: { id: cuidWp } })).toBeNull(); + + // And the batch-keyed lookup (batchSystem.unblockRunForBatch) still finds it cross-DB. 
+ const byBatch = await router.findWaitpoint({ where: { completedByBatchId: batchId } }); + expect(byBatch?.id).toBe(cuidWp); + }, + 120_000 + ); +}); + +// Regression locks: the router must execute every routed op on the OWNING store's own +// client and route reads by friendlyId — never on the caller-forwarded client (callers +// pass the control-plane client, which is the wrong physical DB once a run lives in NEW). +describe("RoutingRunStore cross-DB client + friendlyId routing (regression)", () => { + // A create routed to NEW must land on NEW even when the caller forwards the LEGACY + // client as `tx` (the webapp passes its control-plane client there). If the router + // forwarded it, the ksuid run would be written through the legacy connection. + heteroPostgresTest( + "createRun ignores a forwarded wrong-DB tx and lands the run on its owning store", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + const seed17 = await seedEnvironment(prisma17, "txnew17"); + + const newRunId = KSUID_27; + await router.createRun( + buildCreateRunInput({ + runId: newRunId, + friendlyId: `run_${KSUID_27}`, + taskIdentifier: "tx-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }), + // Forwarded LEGACY client — must be ignored in favour of the NEW store's own client. + prisma14 + ); + + expect(await prisma17.taskRun.findUnique({ where: { id: newRunId } })).not.toBeNull(); + expect(await prisma14.taskRun.findUnique({ where: { id: newRunId } })).toBeNull(); + } + ); + + // findRun keyed on friendlyId (the common presenter case) must route to the owning + // store by residency — friendlyIds classify identically to internal ids. 
+ heteroPostgresTest( + "findRun routes by friendlyId to the owning store", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + const seed14 = await seedEnvironment(prisma14, "fid14"); + const seed17 = await seedEnvironment(prisma17, "fid17"); + + const legacyFriendly = `run_${CUID_25}`; + const newFriendly = `run_${KSUID_27}`; + await legacyStore.createRun( + buildCreateRunInput({ + runId: CUID_25, + friendlyId: legacyFriendly, + taskIdentifier: "legacy-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + await newStore.createRun( + buildCreateRunInput({ + runId: KSUID_27, + friendlyId: newFriendly, + taskIdentifier: "new-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + + expect((await router.findRun({ friendlyId: legacyFriendly }))?.id).toBe(CUID_25); + expect((await router.findRun({ friendlyId: newFriendly }))?.id).toBe(KSUID_27); + } + ); + + // A routed write (updateMetadata) must mutate the run on its owning store, ignoring a + // forwarded wrong-DB client — otherwise the write targets the legacy DB and silently + // no-ops (count 0) against a NEW-resident run. 
+ heteroPostgresTest( + "a routed write ignores a forwarded wrong-DB tx and hits the owning store", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + const seed17 = await seedEnvironment(prisma17, "wr17"); + + await newStore.createRun( + buildCreateRunInput({ + runId: KSUID_27, + friendlyId: `run_${KSUID_27}`, + taskIdentifier: "write-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + + const result = await router.updateMetadata( + KSUID_27, + { metadata: '{"x":1}', metadataVersion: { increment: 1 }, updatedAt: new Date() }, + {}, + // Forwarded LEGACY client — must be ignored. + prisma14 + ); + + expect(result.count).toBe(1); + const row = await prisma17.taskRun.findUnique({ + where: { id: KSUID_27 }, + select: { metadata: true }, + }); + expect(row?.metadata).toBe('{"x":1}'); + } + ); +}); + +describe("RoutingRunStore.findRuns split-mode fan-out + drain", () => { + // Internal-id convention (matches the file): `run_` + a 25-char body (cuid → LEGACY) or + // a 27-char body (ksuid → NEW). The classifier strips `run_` then keys on body length. + const legacyId = (suffix: string) => `run_${"c".repeat(25 - suffix.length)}${suffix}`; + const newId = (suffix: string) => `run_${"k".repeat(27 - suffix.length)}${suffix}`; + + async function createRunOn( + store: PostgresRunStore, + seed: Awaited>, + opts: { + id: string; + friendlyId: string; + taskIdentifier?: string; + createdAt?: Date; + status?: TaskRunStatus; + spanId?: string; + } + ) { + const input = buildCreateRunInput({ + runId: opts.id, + friendlyId: opts.friendlyId, + taskIdentifier: opts.taskIdentifier ?? 
"my-task", + organizationId: seed.organization.id, + projectId: seed.project.id, + runtimeEnvironmentId: seed.environment.id, + }); + if (opts.spanId) input.data.spanId = opts.spanId; + if (opts.createdAt) input.data.createdAt = opts.createdAt; + if (opts.status) { + input.data.status = opts.status; + input.snapshot.runStatus = opts.status; + } + await store.createRun(input); + } + + // A bounded id set spanning both DBs must return BOTH residencies (the runs-list bug: + // the old stub returned NEW only, dropping every legacy run). + heteroPostgresTest("id-set fans out across NEW and LEGACY", async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + const seed14 = await seedEnvironment(prisma14, "fo14"); + const seed17 = await seedEnvironment(prisma17, "fo17"); + const lId = legacyId("1"); + const nId = newId("1"); + await createRunOn(legacyStore, seed14, { id: lId, friendlyId: "run_fo_l1" }); + await createRunOn(newStore, seed17, { id: nId, friendlyId: "run_fo_n1" }); + + const rows = (await router.findRuns({ + where: { id: { in: [lId, nId] } }, + select: { id: true }, + })) as Array<{ id: string }>; + expect(rows.map((r) => r.id).sort()).toEqual([lId, nId].sort()); + }); + + // Fan-out preserved after drain removal; onLegacyRead is no longer an accepted option. 
+  heteroPostgresTest(
+    "fan-out spans NEW+LEGACY with no drain seam",
+    async ({ prisma14, prisma17 }) => {
+      // One Postgres-backed store per physical database, fronted by the router under test.
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "nd14");
+      const newSeed = await seedEnvironment(prisma17, "nd17");
+
+      // Seed exactly one run on each side.
+      const legacyRunId = legacyId("x");
+      const newRunId = newId("x");
+      await createRunOn(storeOnLegacy, legacySeed, { id: legacyRunId, friendlyId: "run_nd_l" });
+      await createRunOn(storeOnNew, newSeed, { id: newRunId, friendlyId: "run_nd_n" });
+
+      const found = (await routed.findRuns({
+        where: { id: { in: [legacyRunId, newRunId] } },
+        select: { id: true },
+      })) as Array<{ id: string }>;
+
+      // Cross-store ordering is not asserted here, so compare sorted id lists.
+      const foundIds = found.map(({ id }) => id).sort();
+      const wantedIds = [legacyRunId, newRunId].sort();
+      expect(foundIds).toEqual(wantedIds);
+
+      // Compile-time lock: passing the removed option must be a type error. The directive
+      // only covers the next line, so the constructor call stays on a single line.
+      // @ts-expect-error onLegacyRead has been removed from RoutingRunStore options
+      void new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy, onLegacyRead: () => {} });
+    }
+  );
+
+  // A run present on BOTH DBs (the copy->fence migration window) must be returned ONCE,
+  // and the NEW copy wins.
+  heteroPostgresTest(
+    "id-set dedupes a run present on both DBs, preferring NEW",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "dd14");
+      const newSeed = await seedEnvironment(prisma17, "dd17");
+
+      // The SAME id is written to both databases; only taskIdentifier/friendlyId tell the
+      // two copies apart.
+      const sharedId = legacyId("9");
+      await createRunOn(storeOnLegacy, legacySeed, {
+        id: sharedId,
+        friendlyId: "run_dd_l",
+        taskIdentifier: "from-legacy",
+      });
+      await createRunOn(storeOnNew, newSeed, {
+        id: sharedId,
+        friendlyId: "run_dd_n",
+        taskIdentifier: "from-new",
+      });
+
+      const found = (await routed.findRuns({
+        where: { id: { in: [sharedId] } },
+        select: { id: true, taskIdentifier: true },
+      })) as Array<{ id: string; taskIdentifier: string }>;
+
+      // Exactly one row comes back, and it is the NEW copy.
+      expect(found).toHaveLength(1);
+      const [survivor] = found;
+      expect(survivor!.taskIdentifier).toBe("from-new");
+    }
+  );
+
+  // An open predicate (no id set) unions both DBs and dedupes by id (NEW wins).
+  heteroPostgresTest(
+    "open predicate unions both DBs and dedupes by id",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "op14");
+      const newSeed = await seedEnvironment(prisma17, "op17");
+
+      // Every run shares one taskIdentifier so a single open predicate matches them all.
+      const sharedTask = "open-shared-task";
+      const legacyOnlyId = legacyId("a");
+      const newOnlyId = newId("a");
+      const onBothId = legacyId("b");
+
+      await createRunOn(storeOnLegacy, legacySeed, {
+        id: legacyOnlyId,
+        friendlyId: "run_o_l",
+        taskIdentifier: sharedTask,
+      });
+      await createRunOn(storeOnNew, newSeed, {
+        id: newOnlyId,
+        friendlyId: "run_o_n",
+        taskIdentifier: sharedTask,
+      });
+      // The duplicated id gets a distinct friendlyId per database so the winner is visible.
+      await createRunOn(storeOnLegacy, legacySeed, {
+        id: onBothId,
+        friendlyId: "run_o_dl",
+        taskIdentifier: sharedTask,
+      });
+      await createRunOn(storeOnNew, newSeed, {
+        id: onBothId,
+        friendlyId: "run_o_dn",
+        taskIdentifier: sharedTask,
+      });
+
+      const found = (await routed.findRuns({
+        where: { taskIdentifier: sharedTask },
+        select: { id: true, friendlyId: true },
+      })) as Array<{ id: string; friendlyId: string }>;
+
+      // Three distinct ids (the duplicate collapses to one row) ...
+      const foundIds = found.map(({ id }) => id).sort();
+      const wantedIds = [legacyOnlyId, newOnlyId, onBothId].sort();
+      expect(foundIds).toEqual(wantedIds);
+
+      // ... and the surviving duplicate is the row that was written to NEW.
+      const duplicateRow = found.find(({ id }) => id === onBothId);
+      expect(duplicateRow?.friendlyId).toBe("run_o_dn");
+    }
+  );
+
+  // orderBy + take are re-imposed across the MERGED set, not per-DB: the global top-2 by
+  // createdAt desc interleaves the two databases.
+  heteroPostgresTest(
+    "re-imposes orderBy and take across the merged set",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "ot14");
+      const newSeed = await seedEnvironment(prisma17, "ot17");
+
+      const sharedTask = "order-take-task";
+      const newestLegacy = legacyId("p");
+      const newestNew = newId("p");
+      const olderLegacy = legacyId("q");
+      const oldestNew = newId("q");
+
+      // Newest-first fixture table; the two databases alternate so a per-DB top-2 would
+      // differ from the global top-2.
+      const fixtures = [
+        {
+          store: storeOnLegacy,
+          seed: legacySeed,
+          id: newestLegacy,
+          friendlyId: "run_ot_a",
+          createdAt: "2024-03-04T00:00:00.000Z",
+        },
+        {
+          store: storeOnNew,
+          seed: newSeed,
+          id: newestNew,
+          friendlyId: "run_ot_b",
+          createdAt: "2024-03-03T00:00:00.000Z",
+        },
+        {
+          store: storeOnLegacy,
+          seed: legacySeed,
+          id: olderLegacy,
+          friendlyId: "run_ot_c",
+          createdAt: "2024-03-02T00:00:00.000Z",
+        },
+        {
+          store: storeOnNew,
+          seed: newSeed,
+          id: oldestNew,
+          friendlyId: "run_ot_d",
+          createdAt: "2024-03-01T00:00:00.000Z",
+        },
+      ];
+
+      // Inserted one at a time, in table order.
+      for (const fixture of fixtures) {
+        await createRunOn(fixture.store, fixture.seed, {
+          id: fixture.id,
+          friendlyId: fixture.friendlyId,
+          taskIdentifier: sharedTask,
+          createdAt: new Date(fixture.createdAt),
+        });
+      }
+
+      const page = (await routed.findRuns({
+        where: { taskIdentifier: sharedTask },
+        orderBy: { createdAt: "desc" },
+        take: 2,
+        select: { id: true },
+      })) as Array<{ id: string }>;
+
+      // Order matters here: the two newest rows, one from each database.
+      const pageIds = page.map(({ id }) => id);
+      expect(pageIds).toEqual([newestLegacy, newestNew]);
+    }
+  );
+
+  // A bounded id set containing a LEGACY-resident cuid run is returned via the UNCONDITIONAL
+  // LEGACY probe. Routing is pure id-shape (no isMigrated predicate), so a cuid id NEW misses
+  // is always probed on LEGACY — where, with no migration, it always lives.
+  heteroPostgresTest(
+    "id-set returns a not-migrated LEGACY-resident cuid run via the unconditional probe (no isMigrated)",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "unc14");
+      // NEW gets an environment but no run: the lookup there has to miss.
+      await seedEnvironment(prisma17, "unc17");
+
+      // 25-char body -> cuid -> LEGACY-resident
+      const cuidRunId = legacyId("u");
+      await createRunOn(storeOnLegacy, legacySeed, { id: cuidRunId, friendlyId: "run_unc" });
+
+      const found = (await routed.findRuns({
+        where: { id: { in: [cuidRunId] } },
+        select: { id: true },
+      })) as Array<{ id: string }>;
+
+      // The cuid run lives only on LEGACY; without a probe-skip it MUST be returned.
+      const foundIds = found.map(({ id }) => id);
+      expect(foundIds).toEqual([cuidRunId]);
+    }
+  );
+
+  // An id-set combined with orderBy + take must page across the MERGED set, not per-store:
+  // the global top-2 by createdAt desc interleaves both databases.
+  heteroPostgresTest(
+    "id-set with orderBy + take pages globally across both DBs",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "it14");
+      const newSeed = await seedEnvironment(prisma17, "it17");
+
+      const newestLegacy = legacyId("r");
+      const newestNew = newId("r");
+      const olderLegacy = legacyId("s");
+      const oldestNew = newId("s");
+
+      // Newest-first fixture table, alternating databases (no taskIdentifier override here).
+      const fixtures = [
+        {
+          store: storeOnLegacy,
+          seed: legacySeed,
+          id: newestLegacy,
+          friendlyId: "run_it_a",
+          createdAt: "2024-04-04T00:00:00.000Z",
+        },
+        {
+          store: storeOnNew,
+          seed: newSeed,
+          id: newestNew,
+          friendlyId: "run_it_b",
+          createdAt: "2024-04-03T00:00:00.000Z",
+        },
+        {
+          store: storeOnLegacy,
+          seed: legacySeed,
+          id: olderLegacy,
+          friendlyId: "run_it_c",
+          createdAt: "2024-04-02T00:00:00.000Z",
+        },
+        {
+          store: storeOnNew,
+          seed: newSeed,
+          id: oldestNew,
+          friendlyId: "run_it_d",
+          createdAt: "2024-04-01T00:00:00.000Z",
+        },
+      ];
+
+      // Inserted one at a time, in table order.
+      for (const fixture of fixtures) {
+        await createRunOn(fixture.store, fixture.seed, {
+          id: fixture.id,
+          friendlyId: fixture.friendlyId,
+          createdAt: new Date(fixture.createdAt),
+        });
+      }
+
+      const page = (await routed.findRuns({
+        where: { id: { in: [newestLegacy, newestNew, olderLegacy, oldestNew] } },
+        orderBy: { createdAt: "desc" },
+        take: 2,
+        select: { id: true },
+      })) as Array<{ id: string }>;
+
+      // Order matters: the global top-2 is one LEGACY row followed by one NEW row.
+      const pageIds = page.map(({ id }) => id);
+      expect(pageIds).toEqual([newestLegacy, newestNew]);
+    }
+  );
+
+  // A findRun whose `where` is NOT residency-classifiable (e.g. by spanId — the span-detail
+  // pane) must fan out and find a LEGACY-resident run, not default to NEW and miss it.
+  heteroPostgresTest(
+    "findRun by an unclassifiable where (spanId) finds a LEGACY-resident run",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "sp14");
+      const newSeed = await seedEnvironment(prisma17, "sp17");
+
+      // One run per database, each with its own span id.
+      const runOnLegacy = legacyId("s");
+      await createRunOn(storeOnLegacy, legacySeed, {
+        id: runOnLegacy,
+        friendlyId: "run_span_legacy",
+        spanId: "span_on_legacy",
+      });
+      const runOnNew = newId("s");
+      await createRunOn(storeOnNew, newSeed, {
+        id: runOnNew,
+        friendlyId: "run_span_new",
+        spanId: "span_on_new",
+      });
+
+      // Both lookups use the same projection; the where clause carries only a span id.
+      const projection = { select: { id: true, spanId: true } };
+
+      const viaLegacySpan = (await routed.findRun({ spanId: "span_on_legacy" }, projection)) as {
+        id: string;
+      } | null;
+      expect(viaLegacySpan?.id).toBe(runOnLegacy);
+
+      const viaNewSpan = (await routed.findRun({ spanId: "span_on_new" }, projection)) as {
+        id: string;
+      } | null;
+      expect(viaNewSpan?.id).toBe(runOnNew);
+    }
+  );
+
+  // A waitpoint can live on NEW with a LEGACY-classified (cuid) id — e.g. a migrated run's
+  // waitpoint. forWaitpointCompletion must resolve to the store that actually holds it, not
+  // route by id-shape and miss it (which leaves the blocked run stuck forever).
+  heteroPostgresTest(
+    "forWaitpointCompletion resolves to the store holding the waitpoint, not its id-shape",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+      const newSeed = await seedEnvironment(prisma17, "wpc17");
+
+      // classifies LEGACY by id-shape
+      const waitpointId = `waitpoint_${"c".repeat(25)}`;
+
+      // Written straight to the NEW database, bypassing the router, so the row's residency
+      // deliberately disagrees with its id shape.
+      const waitpointRow = {
+        id: waitpointId,
+        friendlyId: "waitpoint_wpc_x",
+        type: "MANUAL",
+        idempotencyKey: "wpc-key",
+        userProvidedIdempotencyKey: false,
+        projectId: newSeed.project.id,
+        environmentId: newSeed.environment.id,
+      } as const;
+      await prisma17.waitpoint.create({ data: waitpointRow });
+
+      // The store handed back must be one that can actually read the row.
+      const resolved = await routed.forWaitpointCompletion(waitpointId, { routeKind: "MANUAL" });
+      const located = await resolved.findWaitpoint({ where: { id: waitpointId } });
+      expect(located?.id).toBe(waitpointId);
+    }
+  );
+
+  // A waitpoint must be born on the same DB as its run (cuid → LEGACY, ksuid → NEW) so that
+  // completion and the blocking edge — which already routes by run id — line up. A cuid
+  // waitpoint landing on NEW is the regression that strands a non-opted org's wait forever.
+ heteroPostgresTest( + "createWaitpoint co-locates a waitpoint with its run by id-shape", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + const seed14 = await seedEnvironment(prisma14, "wpco14"); + const seed17 = await seedEnvironment(prisma17, "wpco17"); + + const cuidWp = `waitpoint_${CUID_25}`; + await router.createWaitpoint({ + data: { + id: cuidWp, + friendlyId: "waitpoint_co_c", + type: "MANUAL", + idempotencyKey: "co-key-c", + userProvidedIdempotencyKey: false, + projectId: seed14.project.id, + environmentId: seed14.environment.id, + }, + }); + expect(await prisma14.waitpoint.findUnique({ where: { id: cuidWp } })).not.toBeNull(); + expect(await prisma17.waitpoint.findUnique({ where: { id: cuidWp } })).toBeNull(); + + const ksuidWp = `waitpoint_${KSUID_27}`; + await router.createWaitpoint({ + data: { + id: ksuidWp, + friendlyId: "waitpoint_co_k", + type: "MANUAL", + idempotencyKey: "co-key-k", + userProvidedIdempotencyKey: false, + projectId: seed17.project.id, + environmentId: seed17.environment.id, + }, + }); + expect(await prisma17.waitpoint.findUnique({ where: { id: ksuidWp } })).not.toBeNull(); + expect(await prisma14.waitpoint.findUnique({ where: { id: ksuidWp } })).toBeNull(); + } + ); +}); + +// Fan-out over the two DISTINCT generated schemas. +// prisma17 is RunOpsPrismaClient (subset schema, no control-plane tables). 
+describe("RoutingRunStore.findRuns cross-DB fan-out over distinct schemas", () => { + const legacyId = (suffix: string) => `run_${"c".repeat(25 - suffix.length)}${suffix}`; + const newId = (suffix: string) => `run_${"k".repeat(27 - suffix.length)}${suffix}`; + + heteroRunOpsPostgresTest( + "id-set fans out across NEW (RunOpsPrismaClient) and LEGACY (PrismaClient) distinct schemas", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ + prisma: prisma17 as any, + readOnlyPrisma: prisma17 as any, + }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "t10_14"); + + const cuidId = legacyId("t10"); + await legacyStore.createRun( + buildCreateRunInput({ + runId: cuidId, + friendlyId: "run_t10_legacy", + taskIdentifier: "t10-legacy-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + + // NEW side has no control-plane tables and no associatedWaitpoint relation; + // seed the TaskRun row directly with synthetic scalar ids. 
+ const ksuidId = newId("t10"); + await prisma17.taskRun.create({ + data: { + id: ksuidId, + engine: "V2", + status: "PENDING", + friendlyId: "run_t10_new", + runtimeEnvironmentId: "synthetic-env-id", + environmentType: "DEVELOPMENT", + organizationId: "synthetic-org-id", + projectId: "synthetic-project-id", + taskIdentifier: "t10-new-task", + payload: '{"hello":"world"}', + payloadType: "application/json", + context: { foo: "bar" }, + traceContext: { trace: "ctx" }, + traceId: "trace_t10", + spanId: "span_t10", + runTags: [], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-06-01T00:00:00.000Z"), + }, + }); + + const rows = (await router.findRuns({ + where: { id: { in: [cuidId, ksuidId] } }, + select: { id: true }, + })) as Array<{ id: string }>; + + expect(rows.map((r) => r.id).sort()).toEqual([cuidId, ksuidId].sort()); + }, + 120_000 + ); +}); + +describe("RoutingRunStore write-path fan-outs", () => { + const legacyId = (suffix: string) => `run_${"c".repeat(25 - suffix.length)}${suffix}`; + const newId = (suffix: string) => `run_${"k".repeat(27 - suffix.length)}${suffix}`; + + async function createRunWithKey( + store: PostgresRunStore, + seed: Awaited>, + opts: { id: string; friendlyId: string; idempotencyKey?: string } + ) { + const input = buildCreateRunInput({ + runId: opts.id, + friendlyId: opts.friendlyId, + taskIdentifier: "idem-task", + organizationId: seed.organization.id, + projectId: seed.project.id, + runtimeEnvironmentId: seed.environment.id, + }); + if (opts.idempotencyKey) input.data.idempotencyKey = opts.idempotencyKey; + await store.createRun(input); + } + + // clearIdempotencyKey byFriendlyIds fans out to both DBs and sums counts. 
+  heteroPostgresTest(
+    "clearIdempotencyKey byFriendlyIds fans out across NEW+LEGACY and sums count",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "ci_fo14");
+      const newSeed = await seedEnvironment(prisma17, "ci_fo17");
+
+      // One keyed run per database.
+      const legacyRunId = legacyId("ci1");
+      const newRunId = newId("ci1");
+      await createRunWithKey(storeOnLegacy, legacySeed, {
+        id: legacyRunId,
+        friendlyId: "run_ci_legacy",
+        idempotencyKey: "key-legacy",
+      });
+      await createRunWithKey(storeOnNew, newSeed, {
+        id: newRunId,
+        friendlyId: "run_ci_new",
+        idempotencyKey: "key-new",
+      });
+
+      // A single call names friendly ids that live on different databases.
+      const cleared = await routed.clearIdempotencyKey({
+        byFriendlyIds: ["run_ci_legacy", "run_ci_new"],
+      });
+
+      // The reported count is the sum over both stores ...
+      expect(cleared.count).toBe(2);
+
+      // ... and each row's key is gone when read back from its own database.
+      const keyColumn = { idempotencyKey: true } as const;
+      const legacyRow = await prisma14.taskRun.findUnique({
+        where: { id: legacyRunId },
+        select: keyColumn,
+      });
+      expect(legacyRow?.idempotencyKey).toBeNull();
+
+      const newRow = await prisma17.taskRun.findUnique({
+        where: { id: newRunId },
+        select: keyColumn,
+      });
+      expect(newRow?.idempotencyKey).toBeNull();
+    }
+  );
+
+  // clearIdempotencyKey byPredicate fans out to both DBs and sums counts.
+  heteroPostgresTest(
+    "clearIdempotencyKey byPredicate fans out across NEW+LEGACY and sums count",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "cp_fo14");
+      const newSeed = await seedEnvironment(prisma17, "cp_fo17");
+
+      const commonKey = "shared-idem-key";
+      const newSideTask = "shared-task";
+      const legacyRunId = legacyId("cp1");
+      const newRunId = newId("cp1");
+
+      // LEGACY run goes through the helper, which creates it under task "idem-task".
+      await createRunWithKey(storeOnLegacy, legacySeed, {
+        id: legacyRunId,
+        friendlyId: "run_cp_legacy",
+        idempotencyKey: commonKey,
+      });
+
+      // Override taskIdentifier to match the predicate.
+      const newRunInput = buildCreateRunInput({
+        runId: newRunId,
+        friendlyId: "run_cp_new",
+        taskIdentifier: newSideTask,
+        organizationId: newSeed.organization.id,
+        projectId: newSeed.project.id,
+        runtimeEnvironmentId: newSeed.environment.id,
+      });
+      newRunInput.data.idempotencyKey = commonKey;
+      await storeOnNew.createRun(newRunInput);
+
+      // byPredicate matches on (idempotencyKey, taskIdentifier, runtimeEnvironmentId).
+      // We target each DB's env separately to keep counts predictable (1 hit per DB).
+      const legacyPredicate = {
+        idempotencyKey: commonKey,
+        taskIdentifier: "idem-task",
+        runtimeEnvironmentId: legacySeed.environment.id,
+      };
+      const newPredicate = {
+        idempotencyKey: commonKey,
+        taskIdentifier: newSideTask,
+        runtimeEnvironmentId: newSeed.environment.id,
+      };
+
+      // Both clears are started together, LEGACY-targeted call first.
+      const [legacyOutcome, newOutcome] = await Promise.all([
+        routed.clearIdempotencyKey({ byPredicate: legacyPredicate }),
+        routed.clearIdempotencyKey({ byPredicate: newPredicate }),
+      ]);
+
+      // Each predicate call fans out to both stores; only the matching DB has a hit.
+      expect(legacyOutcome.count).toBe(1);
+      expect(newOutcome.count).toBe(1);
+    }
+  );
+
+  // expireRunsBatch with mixed ksuid+cuid ids partitions across both DBs and sums.
+  heteroPostgresTest(
+    "expireRunsBatch with mixed ids partitions across NEW+LEGACY and sums count",
+    async ({ prisma14, prisma17 }) => {
+      const storeOnLegacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
+      const storeOnNew = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
+      const routed = new RoutingRunStore({ new: storeOnNew, legacy: storeOnLegacy });
+
+      const legacySeed = await seedEnvironment(prisma14, "exp_14");
+      const newSeed = await seedEnvironment(prisma17, "exp_17");
+
+      const legacyRunId = legacyId("ex1");
+      const newRunId = newId("ex1");
+
+      // One run per database, created directly on its own store.
+      await storeOnLegacy.createRun(
+        buildCreateRunInput({
+          runId: legacyRunId,
+          friendlyId: "run_exp_l",
+          taskIdentifier: "expire-task",
+          organizationId: legacySeed.organization.id,
+          projectId: legacySeed.project.id,
+          runtimeEnvironmentId: legacySeed.environment.id,
+        })
+      );
+      await storeOnNew.createRun(
+        buildCreateRunInput({
+          runId: newRunId,
+          friendlyId: "run_exp_n",
+          taskIdentifier: "expire-task",
+          organizationId: newSeed.organization.id,
+          projectId: newSeed.project.id,
+          runtimeEnvironmentId: newSeed.environment.id,
+        })
+      );
+
+      // A single batch call carries ids of both shapes.
+      const expiry = {
+        error: { type: "STRING_ERROR" as const, raw: "ttl expired" },
+        now: new Date("2024-05-01T00:00:00.000Z"),
+      };
+      const expiredCount = await routed.expireRunsBatch([legacyRunId, newRunId], expiry);
+
+      // The returned number covers both databases ...
+      expect(expiredCount).toBe(2);
+
+      // ... and each run reads back as EXPIRED from its own database.
+      const statusColumn = { status: true } as const;
+      const legacyRow = await prisma14.taskRun.findUnique({
+        where: { id: legacyRunId },
+        select: statusColumn,
+      });
+      expect(legacyRow?.status).toBe("EXPIRED");
+
+      const newRow = await prisma17.taskRun.findUnique({
+        where: { id: newRunId },
+        select: statusColumn,
+      });
+      expect(newRow?.status).toBe("EXPIRED");
+    }
+  );
+
+  // all-ksuid batch goes only to NEW; LEGACY store is not called with an empty list.
+ heteroPostgresTest( + "expireRunsBatch all-ksuid batch skips LEGACY (no empty IN query)", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + let legacyCalled = false; + const spyLegacy: RunStore = new Proxy(legacyStore, { + get(target, prop) { + if (prop === "expireRunsBatch") { + return (...args: unknown[]) => { + legacyCalled = true; + return (target as any).expireRunsBatch(...args); + }; + } + return (target as any)[prop]; + }, + }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: spyLegacy }); + const seed17 = await seedEnvironment(prisma17, "ks_17"); + + const nId = newId("kb1"); + await newStore.createRun( + buildCreateRunInput({ + runId: nId, + friendlyId: "run_ks_n", + taskIdentifier: "ksuid-only-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + + const expireData = { + error: { type: "STRING_ERROR" as const, raw: "ttl" }, + now: new Date("2024-05-01T00:00:00.000Z"), + }; + const count = await router.expireRunsBatch([nId], expireData); + + expect(count).toBe(1); + expect(legacyCalled).toBe(false); + } + ); +}); + +describe("RoutingRunStore.findTaskRunAttempt residency routing", () => { + const legacyRunId = (suffix: string) => `run_${"c".repeat(25 - suffix.length)}${suffix}`; + const newRunId = (suffix: string) => `run_${"k".repeat(27 - suffix.length)}${suffix}`; + + async function seedAttempt( + prisma: PrismaClient, + opts: { + attemptId: string; + friendlyId: string; + runId: string; + runtimeEnvironmentId: string; + status?: string; + } + ) { + await prisma.$executeRawUnsafe(`SET session_replication_role = replica`); + await prisma.$executeRawUnsafe( + `INSERT INTO "TaskRunAttempt" (id, number, "friendlyId", "taskRunId", "backgroundWorkerId", "backgroundWorkerTaskId", 
"runtimeEnvironmentId", "queueId", status, "createdAt", "updatedAt", "usageDurationMs", "outputType") + VALUES ($1, 1, $2, $3, 'synthetic-worker', 'synthetic-worker-task', $4, 'synthetic-queue', $5::"TaskRunAttemptStatus", NOW(), NOW(), 0, 'application/json')`, + opts.attemptId, + opts.friendlyId, + opts.runId, + opts.runtimeEnvironmentId, + opts.status ?? "COMPLETED" + ); + await prisma.$executeRawUnsafe(`SET session_replication_role = DEFAULT`); + } + + heteroPostgresTest( + "a cuid (LEGACY) run's attempt resolves via findTaskRunAttempt (regression: was hardcoded NEW)", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "t9a_cuid14"); + const runId = legacyRunId("t9a1"); + await legacyStore.createRun( + buildCreateRunInput({ + runId, + friendlyId: "run_t9a_legacy", + taskIdentifier: "t9a-legacy-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + + const attemptId = "attempt_t9a_cuid1"; + await seedAttempt(prisma14, { + attemptId, + friendlyId: "attempt_t9a_c1", + runId, + runtimeEnvironmentId: seed14.environment.id, + status: "COMPLETED", + }); + + const found = await router.findTaskRunAttempt({ + select: { id: true, taskRunId: true }, + where: { taskRunId: runId }, + }); + + expect(found?.id).toBe(attemptId); + expect(found?.taskRunId).toBe(runId); + expect(await prisma17.taskRunAttempt.findUnique({ where: { id: attemptId } })).toBeNull(); + } + ); + + heteroPostgresTest( + "a ksuid (NEW) run's attempt still resolves via findTaskRunAttempt", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const 
newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed17 = await seedEnvironment(prisma17, "t9a_ksuid17"); + const runId = newRunId("t9a2"); + await newStore.createRun( + buildCreateRunInput({ + runId, + friendlyId: "run_t9a_new", + taskIdentifier: "t9a-new-task", + organizationId: seed17.organization.id, + projectId: seed17.project.id, + runtimeEnvironmentId: seed17.environment.id, + }) + ); + + const attemptId = "attempt_t9a_ksuid1"; + await seedAttempt(prisma17, { + attemptId, + friendlyId: "attempt_t9a_k1", + runId, + runtimeEnvironmentId: seed17.environment.id, + status: "COMPLETED", + }); + + const found = await router.findTaskRunAttempt({ + select: { id: true, taskRunId: true }, + where: { taskRunId: runId }, + }); + + expect(found?.id).toBe(attemptId); + expect(found?.taskRunId).toBe(runId); + } + ); + + // No taskRunId in where → fan out NEW-first then LEGACY. + heteroPostgresTest( + "no taskRunId where fans out NEW-first then LEGACY and finds a LEGACY attempt", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "t9a_fanout14"); + const runId = legacyRunId("t9a3"); + await legacyStore.createRun( + buildCreateRunInput({ + runId, + friendlyId: "run_t9a_fo", + taskIdentifier: "t9a-fanout-task", + organizationId: seed14.organization.id, + projectId: seed14.project.id, + runtimeEnvironmentId: seed14.environment.id, + }) + ); + + const attemptId = "attempt_t9a_fo1"; + const uniqueFriendlyId = `attempt_t9a_fo_${Date.now()}`; + await seedAttempt(prisma14, { + attemptId, + friendlyId: uniqueFriendlyId, + runId, + runtimeEnvironmentId: seed14.environment.id, 
+ status: "COMPLETED", + }); + + const found = await router.findTaskRunAttempt({ + select: { id: true, friendlyId: true }, + where: { friendlyId: uniqueFriendlyId }, + }); + + expect(found?.id).toBe(attemptId); + } + ); +}); + +describe("findBatchTaskRunByFriendlyId probe", () => { + function batchData(params: { id: string; friendlyId: string; runtimeEnvironmentId: string }) { + return { + id: params.id, + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + runCount: 1, + runIds: [] as string[], + payload: '{"hello":"world"}', + payloadType: "application/json", + options: {}, + batchVersion: "runengine:v1", + }; + } + + heteroPostgresTest( + "a batch on LEGACY resolves via the NEW-first probe", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "t08_leg14"); + await legacyStore.createBatchTaskRun( + batchData({ + id: "batch_t08_legacy", + friendlyId: "batch_t08_leg", + runtimeEnvironmentId: seed14.environment.id, + }) + ); + + const found = await router.findBatchTaskRunByFriendlyId( + "batch_t08_leg", + seed14.environment.id + ); + expect(found?.id).toBe("batch_t08_legacy"); + expect( + await prisma17.batchTaskRun.findUnique({ where: { id: "batch_t08_legacy" } }) + ).toBeNull(); + } + ); + + heteroPostgresTest("a batch on NEW resolves immediately", async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed17 = await seedEnvironment(prisma17, "t08_new17"); + await newStore.createBatchTaskRun( + 
batchData({ + id: "batch_t08_new", + friendlyId: "batch_t08_new", + runtimeEnvironmentId: seed17.environment.id, + }) + ); + + const found = await router.findBatchTaskRunByFriendlyId("batch_t08_new", seed17.environment.id); + expect(found?.id).toBe("batch_t08_new"); + expect(await prisma14.batchTaskRun.findUnique({ where: { id: "batch_t08_new" } })).toBeNull(); + }); + + heteroPostgresTest( + "env-scoping: wrong environmentId returns null", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed14 = await seedEnvironment(prisma14, "t08_es14"); + await legacyStore.createBatchTaskRun( + batchData({ + id: "batch_t08_scope", + friendlyId: "batch_t08_scope", + runtimeEnvironmentId: seed14.environment.id, + }) + ); + + const found = await router.findBatchTaskRunByFriendlyId( + "batch_t08_scope", + "wrong-env-id-00000000000000000" + ); + expect(found).toBeNull(); + } + ); + + heteroPostgresTest( + "include:{ errors:true } returns seeded BatchTaskRunError through the probe", + async ({ prisma14, prisma17 }) => { + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: newStore, legacy: legacyStore }); + + const seed17 = await seedEnvironment(prisma17, "t08_inc17"); + const batchId = "batch_t08_inc"; + await newStore.createBatchTaskRun( + batchData({ + id: batchId, + friendlyId: "batch_t08_inc", + runtimeEnvironmentId: seed17.environment.id, + }) + ); + await prisma17.batchTaskRunError.create({ + data: { + id: "bterr_t08_1", + batchTaskRunId: batchId, + index: 0, + taskIdentifier: "my-task", + error: "something went wrong", + }, + }); + + const found = 
(await router.findBatchTaskRunByFriendlyId( + "batch_t08_inc", + seed17.environment.id, + { include: { errors: true } } + )) as ({ errors: Array<{ id: string }> } & Record) | null; + + expect(found).not.toBeNull(); + expect(found?.errors).toHaveLength(1); + expect(found?.errors[0]?.id).toBe("bterr_t08_1"); + } + ); +}); + +// Batch residency: the four new accessors must route by batch id so a ksuid +// batch + its items live on NEW with its child runs, and fall back to fan-out where there +// is no classifiable id (idempotency probe; status-only updateMany). +describe("RoutingRunStore batch-residency accessors", () => { + function batchData(params: { + id: string; + friendlyId: string; + runtimeEnvironmentId: string; + idempotencyKey?: string; + }) { + return { + id: params.id, + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + runCount: 1, + runIds: [] as string[], + payload: "{}", + payloadType: "application/json", + options: {}, + batchVersion: "runengine:v1", + ...(params.idempotencyKey ? { idempotencyKey: params.idempotencyKey } : {}), + }; + } + + // The dedicated run-ops schema has scalarized env/project/org FKs, so a TaskRun can be + // created with arbitrary scalar ids — no Organization/Project/RuntimeEnvironment seeding (those + // models don't exist on the dedicated subset). Items' taskRunId FK to TaskRun is KEPT, so the run + // must exist before the item. 
/**
 * Creates one TaskRun row through the given run-ops client.
 *
 * Only scalar ids are supplied for environment / organization / project, so no related rows
 * are created here. Tests call this before inserting a BatchTaskRunItem that points at `runId`.
 *
 * @param prisma client of the database the run should be written to
 * @param envId  value stored as `runtimeEnvironmentId`
 * @param runId  primary key; also used to derive friendlyId / traceId / spanId
 */
async function seedDedicatedRun(prisma: RunOpsPrismaClient, envId: string, runId: string) {
  await prisma.taskRun.create({
    data: {
      // identity, derived from the run id
      id: runId,
      friendlyId: `run_${runId}`,
      traceId: `t_${runId}`,
      spanId: `s_${runId}`,
      // ownership (scalar ids only)
      runtimeEnvironmentId: envId,
      environmentType: "DEVELOPMENT",
      organizationId: "org_dedicated",
      projectId: "proj_dedicated",
      // task + queue
      taskIdentifier: "batch-task",
      queue: "task/batch-task",
      // state
      engine: "V2",
      status: "PENDING",
      isTest: false,
      depth: 0,
      taskEventStore: "taskEvent",
      // payload + tracing context
      payload: "{}",
      payloadType: "application/json",
      context: {},
      traceContext: {},
    },
  });
}

// Scalar environment id shared by every NEW-side batch and run seeded in this suite.
const ENV_NEW = "env_dedicated_new";

// findBatchTaskRunByIdempotencyKey has no classifiable id to route on, so the router probes
// NEW first and then LEGACY; a key stored on either DB must resolve.
heteroRunOpsPostgresTest(
  "findBatchTaskRunByIdempotencyKey probes NEW then LEGACY",
  async ({ prisma14, prisma17 }) => {
    const legacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
    const fresh = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
    const routed = new RoutingRunStore({ new: fresh, legacy });

    const legacySeed = await seedEnvironment(prisma14, "optA_idem14");
    const legacyEnvId = legacySeed.environment.id;

    // NEW side: ksuid batch carrying an idempotency key (dedicated, scalar env id).
    const batchOnNew = batchData({
      id: `${KSUID_27.slice(0, -2)}i1`,
      friendlyId: "batch_idem_new",
      runtimeEnvironmentId: ENV_NEW,
      idempotencyKey: "key-new",
    });
    await fresh.createBatchTaskRun(batchOnNew);

    // LEGACY side: cuid batch carrying an idempotency key (full schema, real env).
    const batchOnLegacy = batchData({
      id: `${CUID_25.slice(0, -2)}i1`,
      friendlyId: "batch_idem_legacy",
      runtimeEnvironmentId: legacyEnvId,
      idempotencyKey: "key-legacy",
    });
    await legacy.createBatchTaskRun(batchOnLegacy);

    const hitOnNew = await routed.findBatchTaskRunByIdempotencyKey(ENV_NEW, "key-new");
    expect(hitOnNew?.friendlyId).toBe("batch_idem_new");

    const hitOnLegacy = await routed.findBatchTaskRunByIdempotencyKey(legacyEnvId, "key-legacy");
    expect(hitOnLegacy?.friendlyId).toBe("batch_idem_legacy");

    // A key stored on neither DB is a miss.
    const miss = await routed.findBatchTaskRunByIdempotencyKey(ENV_NEW, "absent");
    expect(miss).toBeNull();
  }
);

// updateManyBatchTaskRun: a `where.id` routes by id shape (ksuid→NEW, cuid→LEGACY); without an
// id the update fans out to both stores and the counts are summed.
heteroRunOpsPostgresTest(
  "updateManyBatchTaskRun routes by where.id and fans out otherwise",
  async ({ prisma14, prisma17 }) => {
    const legacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
    const fresh = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
    const routed = new RoutingRunStore({ new: fresh, legacy });

    const legacySeed = await seedEnvironment(prisma14, "optA_um14");

    const ksuidBatchId = `${KSUID_27.slice(0, -2)}m1`;
    const cuidBatchId = `${CUID_25.slice(0, -2)}m1`;
    await fresh.createBatchTaskRun(
      batchData({ id: ksuidBatchId, friendlyId: "batch_um_new", runtimeEnvironmentId: ENV_NEW })
    );
    await legacy.createBatchTaskRun(
      batchData({
        id: cuidBatchId,
        friendlyId: "batch_um_legacy",
        runtimeEnvironmentId: legacySeed.environment.id,
      })
    );

    // ksuid id → only the NEW row is touched.
    const newResult = await routed.updateManyBatchTaskRun({
      where: { id: ksuidBatchId },
      data: { status: "COMPLETED" },
    });
    expect(newResult.count).toBe(1);
    const newRow = await prisma17.batchTaskRun.findUnique({ where: { id: ksuidBatchId } });
    expect(newRow?.status).toBe("COMPLETED");

    // cuid id → only the LEGACY row is touched.
    const legacyResult = await routed.updateManyBatchTaskRun({
      where: { id: cuidBatchId },
      data: { status: "COMPLETED" },
    });
    expect(legacyResult.count).toBe(1);
    const legacyRow = await prisma14.batchTaskRun.findUnique({ where: { id: cuidBatchId } });
    expect(legacyRow?.status).toBe("COMPLETED");

    // No id in the where: both stores are updated and the counts add up (both rows are
    // already COMPLETED at this point).
    const bothResult = await routed.updateManyBatchTaskRun({
      where: { status: "COMPLETED" },
      data: { status: "ABORTED" },
    });
    expect(bothResult.count).toBe(2);
  }
);

// countBatchTaskRunItems: routes by batchTaskRunId residency (items co-reside with the batch).
heteroRunOpsPostgresTest(
  "countBatchTaskRunItems routes by batchTaskRunId residency",
  async ({ prisma14, prisma17 }) => {
    const legacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
    const fresh = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
    const routed = new RoutingRunStore({ new: fresh, legacy });

    const ksuidBatchId = `${KSUID_27.slice(0, -2)}c1`;
    await fresh.createBatchTaskRun(
      batchData({ id: ksuidBatchId, friendlyId: "batch_cnt_new", runtimeEnvironmentId: ENV_NEW })
    );

    // Two runs on NEW: one item COMPLETED, one PENDING. The runs are created first because
    // each item references its run.
    const completedRunId = `${KSUID_27.slice(0, -3)}cra`;
    const pendingRunId = `${KSUID_27.slice(0, -3)}crb`;
    await seedDedicatedRun(prisma17, ENV_NEW, completedRunId);
    await seedDedicatedRun(prisma17, ENV_NEW, pendingRunId);
    await prisma17.batchTaskRunItem.create({
      data: { batchTaskRunId: ksuidBatchId, taskRunId: completedRunId, status: "COMPLETED" },
    });
    await prisma17.batchTaskRunItem.create({
      data: { batchTaskRunId: ksuidBatchId, taskRunId: pendingRunId, status: "PENDING" },
    });

    const total = await routed.countBatchTaskRunItems({ batchTaskRunId: ksuidBatchId });
    expect(total).toBe(2);

    const completedOnly = await routed.countBatchTaskRunItems({
      batchTaskRunId: ksuidBatchId,
      status: "COMPLETED",
    });
    expect(completedOnly).toBe(1);
  }
);

// updateManyBatchTaskRunItems: routes by where.batchTaskRunId so items move with their batch.
heteroRunOpsPostgresTest(
  "updateManyBatchTaskRunItems routes by where.batchTaskRunId",
  async ({ prisma14, prisma17 }) => {
    const legacy = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 });
    const fresh = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 });
    const routed = new RoutingRunStore({ new: fresh, legacy });

    // Seed a ksuid batch on NEW with a single PENDING item.
    const ksuidBatchId = `${KSUID_27.slice(0, -2)}u1`;
    await fresh.createBatchTaskRun(
      batchData({ id: ksuidBatchId, friendlyId: "batch_ui_new", runtimeEnvironmentId: ENV_NEW })
    );

    const itemRunId = `${KSUID_27.slice(0, -3)}uix`;
    await seedDedicatedRun(prisma17, ENV_NEW, itemRunId);
    await prisma17.batchTaskRunItem.create({
      data: { batchTaskRunId: ksuidBatchId, taskRunId: itemRunId, status: "PENDING" },
    });

    // The same (batch, run) pair identifies the item for the routed update and for the
    // direct read-back against the NEW database.
    const itemKey = { batchTaskRunId: ksuidBatchId, taskRunId: itemRunId };

    const updated = await routed.updateManyBatchTaskRunItems({
      where: itemKey,
      data: { status: "COMPLETED" },
    });
    expect(updated.count).toBe(1);

    const stored = await prisma17.batchTaskRunItem.findFirst({ where: itemKey });
    expect(stored?.status).toBe("COMPLETED");
  }
);

// Single-DB passthrough: both stores are the same; all four accessors collapse to it.
+ heteroRunOpsPostgresTest( + "single-DB passthrough for the batch-residency accessors", + async ({ prisma17 }) => { + const store = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const router = new RoutingRunStore({ new: store, legacy: store }); + + const batchId = `${KSUID_27.slice(0, -2)}s1`; + await router.createBatchTaskRun( + batchData({ + id: batchId, + friendlyId: "batch_single", + runtimeEnvironmentId: ENV_NEW, + idempotencyKey: "single-key", + }) + ); + + expect((await router.findBatchTaskRunByIdempotencyKey(ENV_NEW, "single-key"))?.id).toBe( + batchId + ); + + const runId = `${KSUID_27.slice(0, -3)}srn`; + await seedDedicatedRun(prisma17, ENV_NEW, runId); + await prisma17.batchTaskRunItem.create({ + data: { batchTaskRunId: batchId, taskRunId: runId, status: "PENDING" }, + }); + expect(await router.countBatchTaskRunItems({ batchTaskRunId: batchId })).toBe(1); + expect( + ( + await router.updateManyBatchTaskRunItems({ + where: { batchTaskRunId: batchId }, + data: { status: "COMPLETED" }, + }) + ).count + ).toBe(1); + expect( + ( + await router.updateManyBatchTaskRun({ + where: { id: batchId }, + data: { status: "COMPLETED" }, + }) + ).count + ).toBe(1); + } + ); +}); diff --git a/internal-packages/run-store/src/runOpsStore.ts b/internal-packages/run-store/src/runOpsStore.ts new file mode 100644 index 00000000000..66bd4a976dd --- /dev/null +++ b/internal-packages/run-store/src/runOpsStore.ts @@ -0,0 +1,1691 @@ +import type { + BatchTaskRun, + BatchTaskRunItemStatus, + Prisma, + PrismaClientOrTransaction, + TaskRun, + TaskRunStatus, +} from "@trigger.dev/database"; +import { ownerEngine, type Residency } from "@trigger.dev/core/v3/isomorphic"; +import type { TaskRunError } from "@trigger.dev/core/v3/schemas"; +import type { + ClearIdempotencyKeyInput, + CompletionSnapshotInput, + CreateBatchTaskRunData, + CreateCancelledRunInput, + CreateExecutionSnapshotInput, + CreateFailedRunInput, + CreateRunInput, + ExpireSnapshotInput, + 
ForWaitpointCompletionContext, + LockRunData, + ReadClient, + RescheduleSnapshotInput, + RewriteDebouncedRunData, + RunStore, + TaskRunWithWaitpoint, + WaitpointColocationOptions, +} from "./types.js"; + +/** + * Run-ops routing substrate for the TaskRun-core method group. Implements {@link RunStore} + * by selecting between a NEW store (the dedicated run-ops DB, where new runs are born) and + * a LEGACY store (the control-plane DB) via the residency classifier (`ownerEngine`: + * ksuid→NEW, cuid→LEGACY). In single-DB both stores are the same, so routing is a no-op + * passthrough. Inert until the injecting seam wires it in under `isSplitEnabled()`; reads no + * flag here. The TaskRun-core methods (create/find/findRuns + updateMetadata/clearIdempotencyKey) + * route by residency; all other methods are mechanical residency-routing delegates. + */ +export class RoutingRunStore implements RunStore { + readonly #new: RunStore; + readonly #legacy: RunStore; + readonly #classify: (id: string) => Residency; + + constructor(options: { new: RunStore; legacy: RunStore; classify?: (id: string) => Residency }) { + this.#new = options.new; + this.#legacy = options.legacy; + this.#classify = options.classify ?? ownerEngine; + } + + // An unclassifiable id is treated as LEGACY (probe the control-plane DB rather than drop a + // real run), matching the read-through layer's policy. + #classifySafe(id: string): Residency { + try { + return this.#classify(id); + } catch { + return "LEGACY"; + } + } + + // A `findRuns` caller bound to the given store (preserves `this`; the overload set isn't + // assignable to a single call signature, so it's cast through the implementation shape). + #findManyOn(store: RunStore): (args: unknown) => Promise>> { + const fn = store.findRuns as (args: unknown) => Promise>>; + return fn.bind(store); + } + + // Route an existing run-ops id by residency. Throws on an unclassifiable id. + #route(id: string): RunStore { + return this.#classify(id) === "NEW" ? 
this.#new : this.#legacy; + } + + // Best-effort route; falls back to NEW (the steady-state home) when the id is absent + // or unclassifiable. + #routeOrNew(id: string | undefined): RunStore { + if (typeof id !== "string") { + return this.#new; + } + try { + return this.#route(id); + } catch { + return this.#new; + } + } + + // WRITE routing is pure id-shape (cuid → LEGACY, ksuid → NEW). A LEGACY-classified id is + // always LEGACY-resident; no marker check exists. Kept async so the many + // `await this.#routeForWrite(...)` call sites need no edits (awaiting a resolved store is + // a no-op). + async #routeForWrite(id: string): Promise { + return this.#route(id); + } + + async #routeOrNewForWrite(id: string | undefined): Promise { + return this.#routeOrNew(id); + } + + // Resolve the store that OWNS the run and open ONE transaction on ITS own client. The + // co-resident multi-write unit (e.g. startAttempt + createExecutionSnapshot) runs against the + // tx-bound store the owner yields, so both writes share one transaction on the run's DB and a + // failure between them rolls BOTH back. This is NOT a cross-DB transaction — the unit is co-resident + // by construction (all writes target the one run on the one owning DB). Unclassifiable / absent id + // falls back to NEW (the steady-state home), mirroring #routeOrNewForWrite. + runInTransaction( + runId: string | undefined, + fn: (store: RunStore, tx: PrismaClientOrTransaction) => Promise + ): Promise { + return this.#routeOrNew(runId).runInTransaction(runId, fn); + } + + // A waitpoint WRITE co-locates with its run by id-shape (cuid → LEGACY, ksuid → NEW, + // unclassifiable → LEGACY), mirroring how `blockRunWithWaitpointEdges` routes the edge by + // run id. `tx` is forwarded only to LEGACY (same physical DB as the control-plane tx); + // for NEW it's dropped so the row lands on NEW's own client. 
#routeWaitpointWrite(
  id: string | undefined,
  tx?: PrismaClientOrTransaction
): { store: RunStore; tx?: PrismaClientOrTransaction } {
  // Only a string id that classifies as NEW goes to NEW; an absent or unclassifiable id
  // (classifySafe maps a classifier throw to LEGACY) stays on LEGACY.
  const isNewShaped = typeof id === "string" && this.#classifySafe(id) === "NEW";
  const store = isNewShaped ? this.#new : this.#legacy;
  // The caller's tx is handed on only when the chosen store IS the legacy store (identity
  // comparison, so it also holds when both slots are the same instance).
  if (store === this.#legacy) {
    return { store, tx };
  }
  return { store, tx: undefined };
}

// Find the store that actually holds a waitpoint id. A cuid waitpoint can be relocated onto
// NEW by drain-on-read while keeping its id, so the id shape only picks where to look FIRST:
// probe that store, then the other one, and fall back to the id-shape home if neither has it.
async #resolveWaitpointStore(id: string | undefined): Promise<RunStore> {
  if (typeof id !== "string") {
    // No id to probe with: LEGACY is the home of anything that is not NEW-classified.
    return this.#legacy;
  }
  const home = this.#classifySafe(id) === "NEW" ? this.#new : this.#legacy;
  const inHome = await home.findWaitpoint({ where: { id } });
  if (inHome) {
    return home;
  }
  const other = home === this.#new ? this.#legacy : this.#new;
  const inOther = await other.findWaitpoint({ where: { id } });
  return inOther ? other : home;
}

// Pull a string `id` out of an arbitrary where-like clause; anything else yields undefined.
static #waitpointId(clause: unknown): string | undefined {
  if (typeof clause !== "object" || clause === null) {
    return undefined;
  }
  const candidate = (clause as { id?: unknown }).id;
  return typeof candidate === "string" ? candidate : undefined;
}

// ---------------------------------------------------------------------------
// TaskRun-core: Create — a run is born on the store named by its MINTED id-kind:
// cuid → LEGACY, ksuid → NEW, unclassifiable → NEW. The mint layer encodes
// inherited residency into the id-kind, so create-by-id-shape is correct;
// a brand-new run has no redirect marker.
//
// The caller's `tx` is intentionally NOT forwarded: it is the control-plane
// client, but a residency-routed create must run on the OWNING store's own
// client or the row lands in the wrong DB. Safe to drop — a create is a single
// nested `taskRun.create` that joins no cross-DB transaction.
+ // --------------------------------------------------------------------------- + + createRun( + params: CreateRunInput, + _tx?: PrismaClientOrTransaction + ): Promise { + return this.#routeOrNew(params.data.id).createRun(params); + } + + createCancelledRun( + params: CreateCancelledRunInput, + _tx?: PrismaClientOrTransaction + ): Promise { + return this.#routeOrNew(params.data.id).createCancelledRun(params); + } + + createFailedRun( + params: CreateFailedRunInput, + _tx?: PrismaClientOrTransaction + ): Promise { + return this.#routeOrNew(params.data.id).createFailedRun(params); + } + + // --------------------------------------------------------------------------- + // TaskRun-core: Read — route existing-id lookups by residency + // --------------------------------------------------------------------------- + + findRun( + where: Prisma.TaskRunWhereInput, + args: { select: S }, + client?: ReadClient + ): Promise | null>; + findRun( + where: Prisma.TaskRunWhereInput, + args: { include: I }, + client?: ReadClient + ): Promise | null>; + findRun(where: Prisma.TaskRunWhereInput, client?: ReadClient): Promise; + findRun( + where: Prisma.TaskRunWhereInput, + argsOrClient?: { select?: unknown; include?: unknown } | ReadClient, + _client?: ReadClient + ): Promise { + // Pass through only the select/include args; the caller's actual client object is never + // forwarded to the routed store (the control-plane writer can't query the NEW DB). But its + // IDENTITY is the read-your-writes signal: a WRITER means the caller just wrote this run and + // needs to beat replica lag, so route to the OWNING store's own primary (writer). A replica / + // nothing keeps the default — the owning store's replica. + const args = selectOrIncludeArgs(argsOrClient); + const onPrimary = readYourWrites(argsOrClient, _client); + const id = idFromWhere(where); + if (id !== undefined) { + // Residency-classifiable (id/friendlyId): route to the owning store. 
+ const store = this.#routeOrNew(id); + const method = onPrimary ? "findRunOnPrimary" : "findRun"; + return (store[method] as (...rest: unknown[]) => Promise)(where, args); + } + // Unclassifiable where (e.g. spanId, idempotencyKey): the run may live on either DB, + // so fan out NEW-first then LEGACY rather than defaulting to NEW — defaulting silently + // misses legacy-resident runs (span detail, idempotency-dedup probe, etc.). + return this.#findRunUnrouted(where, args, onPrimary); + } + + async #findRunUnrouted( + where: Prisma.TaskRunWhereInput, + args: unknown, + onPrimary: boolean + ): Promise { + const method = onPrimary ? "findRunOnPrimary" : "findRun"; + const fromNew = await (this.#new[method] as (...rest: unknown[]) => Promise)( + where, + args + ); + if (fromNew != null) { + return fromNew; + } + return (this.#legacy[method] as (...rest: unknown[]) => Promise)(where, args); + } + + findRuns( + args: { + where: Prisma.TaskRunWhereInput; + select: S; + orderBy?: Prisma.TaskRunOrderByWithRelationInput | Prisma.TaskRunOrderByWithRelationInput[]; + take?: number; + skip?: number; + cursor?: Prisma.TaskRunWhereUniqueInput; + }, + client?: ReadClient + ): Promise[]>; + findRuns( + args: { + where: Prisma.TaskRunWhereInput; + include: I; + orderBy?: Prisma.TaskRunOrderByWithRelationInput | Prisma.TaskRunOrderByWithRelationInput[]; + take?: number; + skip?: number; + cursor?: Prisma.TaskRunWhereUniqueInput; + }, + client?: ReadClient + ): Promise[]>; + findRuns( + args: { + where: Prisma.TaskRunWhereInput; + orderBy?: Prisma.TaskRunOrderByWithRelationInput | Prisma.TaskRunOrderByWithRelationInput[]; + take?: number; + skip?: number; + cursor?: Prisma.TaskRunWhereUniqueInput; + }, + client?: ReadClient + ): Promise; + findRuns( + args: { + where: Prisma.TaskRunWhereInput; + select?: unknown; + include?: unknown; + orderBy?: Prisma.TaskRunOrderByWithRelationInput | Prisma.TaskRunOrderByWithRelationInput[]; + take?: number; + skip?: number; + cursor?: 
Prisma.TaskRunWhereUniqueInput; + }, + _client?: ReadClient + ): Promise { + // SPLIT-mode fan-out across NEW + LEGACY. A `findRuns` `where` can span ids of mixed + // residency, so we resolve each owning store and merge, preserving orderBy/take/skip. + // The caller's client is intentionally NOT forwarded (it is the control-plane client / + // a primary handle); each store reads its own replica, as the prior delegate did. NEW + // wins on id collisions (the copy->fence migration window) so a half-migrated run is + // never double-reported. + return this.#findRunsRouted(args); + } + + async #findRunsRouted(args: FindRunsArgs): Promise { + if (args.cursor) { + // No caller paginates findRuns by Prisma cursor in split mode (the runs list + // paginates in ClickHouse and hydrates a bounded id set). Merging cursor windows + // across two DBs is unsound, so fail loud rather than silently mis-page. + throw new Error( + "RoutingRunStore.findRuns: cursor pagination is unsupported in split mode; pass a bounded id set or take/skip" + ); + } + + const idList = idListFromWhere(args.where); + return idList ? this.#findRunsByIdSet(args, idList) : this.#findRunsOpen(args); + } + + // Bounded id-set (the list hydrate + engine sweeps). Query NEW for the whole set first + // (it holds ksuid runs); probe LEGACY only for the ids NEW missed that could still live + // there (cuid). The two id sets are disjoint by construction, so the merge needs no dedupe. + async #findRunsByIdSet(args: FindRunsArgs, ids: string[]): Promise { + const { args: selArgs, addedFields } = ensureProjected(args); + // The id set already bounds the per-store result, so never push take/skip down — doing + // so would truncate a store's page before the merge knows membership and mis-attribute + // rows. take/skip are applied once, globally, in finalizeRows. 
+ const fan = { ...selArgs, take: undefined, skip: undefined }; + const findNew = this.#findManyOn(this.#new); + const findLegacy = this.#findManyOn(this.#legacy); + + const newRows = await findNew(fan); + const foundIds = new Set(newRows.map((r) => r.id as string)); + + const toLegacy: string[] = []; + for (const id of ids) { + if (foundIds.has(id)) continue; + if (this.#classifySafe(id) === "NEW") continue; // ksuid: cannot live on LEGACY + toLegacy.push(id); + } + + const legacyRows = toLegacy.length > 0 ? await findLegacy(narrowToIds(fan, toLegacy)) : []; + return finalizeRows([...newRows, ...legacyRows], args, addedFields); + } + + // Open predicate (e.g. `{ batchId }`, `{ status, runtimeEnvironmentId }`): no id set to + // partition, so query both stores and dedupe by id (NEW wins). + async #findRunsOpen(args: FindRunsArgs): Promise { + const { args: selArgs, addedFields } = ensureProjected(args); + const fan = widenForMerge(selArgs); + const findNew = this.#findManyOn(this.#new); + const findLegacy = this.#findManyOn(this.#legacy); + const [newRows, legacyRows] = await Promise.all([findNew(fan), findLegacy(fan)]); + const byId = new Map>(); + for (const r of legacyRows) byId.set(r.id as string, r); + for (const r of newRows) byId.set(r.id as string, r); + return finalizeRows([...byId.values()], args, addedFields); + } + + // --------------------------------------------------------------------------- + // TaskRun-core: update-family — route by run id in params + // --------------------------------------------------------------------------- + + async updateMetadata( + runId: string, + data: { + metadata: string | null; + metadataType?: string; + metadataVersion: { increment: number }; + updatedAt: Date; + }, + options: { expectedMetadataVersion?: number }, + tx?: PrismaClientOrTransaction + ): Promise<{ count: number }> { + return (await this.#routeOrNewForWrite(runId)).updateMetadata(runId, data, options); + } + + clearIdempotencyKey( + params: 
ClearIdempotencyKeyInput, + tx?: PrismaClientOrTransaction + ): Promise<{ count: number }> { + // `byId` has a single classifiable run id — route on it. + if ("byId" in params && params.byId) { + const store = this.#route(params.byId.runId); + return store.clearIdempotencyKey(params, store === this.#legacy ? tx : undefined); + } + // `byFriendlyIds` / `byPredicate` can span mixed residency — fan out and sum. + return Promise.all([ + this.#new.clearIdempotencyKey(params), + this.#legacy.clearIdempotencyKey(params), + ]).then(([fromNew, fromLegacy]) => ({ count: fromNew.count + fromLegacy.count })); + } + + // --------------------------------------------------------------------------- + // Mechanical residency-routing delegates so `implements RunStore` is satisfied and the + // router is usable end-to-end. Do NOT add per-method create/fan-out nuance here. + // --------------------------------------------------------------------------- + + async startAttempt( + runId: string, + data: { attemptNumber: number; executedAt?: Date; isWarmStart: boolean }, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).startAttempt(runId, data, args); + } + + async completeAttemptSuccess( + runId: string, + data: { + completedAt: Date; + output?: string; + outputType: string; + usageDurationMs: number; + costInCents: number; + snapshot: CompletionSnapshotInput; + }, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).completeAttemptSuccess(runId, data, args); + } + + async recordRetryOutcome( + runId: string, + data: { machinePreset?: string; usageDurationMs: number; costInCents: number }, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).recordRetryOutcome(runId, data, args); + } + + async requeueRun( + runId: string, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + 
return (await this.#routeForWrite(runId)).requeueRun(runId, args); + } + + async recordBulkActionMembership( + runId: string, + bulkActionId: string, + tx?: PrismaClientOrTransaction + ): Promise { + return (await this.#routeForWrite(runId)).recordBulkActionMembership(runId, bulkActionId); + } + + async cancelRun( + runId: string, + data: { + completedAt?: Date; + error: TaskRunError; + bulkActionId?: string; + usageDurationMs?: number; + costInCents?: number; + }, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).cancelRun(runId, data, args); + } + + async failRunPermanently( + runId: string, + data: { + status: TaskRunStatus; + completedAt: Date; + error: TaskRunError; + usageDurationMs: number; + costInCents: number; + }, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).failRunPermanently(runId, data, args); + } + + async expireRun( + runId: string, + data: { + error: TaskRunError; + completedAt: Date; + expiredAt: Date; + snapshot: ExpireSnapshotInput; + }, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).expireRun(runId, data, args); + } + + async expireRunsBatch( + runIds: string[], + data: { error: TaskRunError; now: Date }, + tx?: PrismaClientOrTransaction + ): Promise { + // Partition by id-shape: ksuid → NEW, everything else → LEGACY. Call each store + // only when its partition is non-empty (avoids an empty IN () clause). Sum counts. + const newIds = runIds.filter((id) => this.#classifySafe(id) === "NEW"); + const legacyIds = runIds.filter((id) => this.#classifySafe(id) !== "NEW"); + const [fromNew, fromLegacy] = await Promise.all([ + newIds.length > 0 ? this.#new.expireRunsBatch(newIds, data) : 0, + legacyIds.length > 0 ? 
this.#legacy.expireRunsBatch(legacyIds, data) : 0, + ]); + return fromNew + fromLegacy; + } + + async lockRunToWorker( + runId: string, + data: LockRunData, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).lockRunToWorker(runId, data); + } + + async parkPendingVersion( + runId: string, + data: { statusReason: string }, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).parkPendingVersion(runId, data, args); + } + + async promotePendingVersionRuns( + runId: string, + tx?: PrismaClientOrTransaction + ): Promise<{ count: number }> { + return (await this.#routeForWrite(runId)).promotePendingVersionRuns(runId); + } + + async suspendForCheckpoint( + runId: string, + args: { include: I }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).suspendForCheckpoint(runId, args); + } + + async resumeFromCheckpoint( + runId: string, + args: { select: S }, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeForWrite(runId)).resumeFromCheckpoint(runId, args); + } + + async rescheduleRun( + runId: string, + data: { delayUntil: Date; queueTimestamp?: Date; snapshot?: RescheduleSnapshotInput }, + tx?: PrismaClientOrTransaction + ): Promise { + return (await this.#routeForWrite(runId)).rescheduleRun(runId, data); + } + + async enqueueDelayedRun( + runId: string, + data: { queuedAt: Date }, + tx?: PrismaClientOrTransaction + ): Promise { + return (await this.#routeForWrite(runId)).enqueueDelayedRun(runId, data); + } + + async rewriteDebouncedRun( + runId: string, + data: RewriteDebouncedRunData, + tx?: PrismaClientOrTransaction + ): Promise { + return (await this.#routeForWrite(runId)).rewriteDebouncedRun(runId, data); + } + + async pushTags( + runId: string, + tags: string[], + where: { runtimeEnvironmentId: string }, + tx?: PrismaClientOrTransaction + ): Promise<{ updatedAt: Date }> { + return (await 
this.#routeForWrite(runId)).pushTags(runId, tags, where); + } + + async pushRealtimeStream( + runId: string, + streamId: string, + tx?: PrismaClientOrTransaction + ): Promise { + return (await this.#routeForWrite(runId)).pushRealtimeStream(runId, streamId); + } + + findRunOrThrow( + where: Prisma.TaskRunWhereInput, + args: { select: S }, + client?: ReadClient + ): Promise>; + findRunOrThrow( + where: Prisma.TaskRunWhereInput, + args: { include: I }, + client?: ReadClient + ): Promise>; + findRunOrThrow(where: Prisma.TaskRunWhereInput, client?: ReadClient): Promise; + findRunOrThrow( + where: Prisma.TaskRunWhereInput, + argsOrClient?: { select?: unknown; include?: unknown } | ReadClient, + _client?: ReadClient + ): Promise { + // The caller's client is not forwarded, but a WRITER signals read-your-writes → the owning + // store's primary (writer); a replica / nothing → its replica (see findRun). + const args = selectOrIncludeArgs(argsOrClient); + const onPrimary = readYourWrites(argsOrClient, _client); + const id = idFromWhere(where); + if (id !== undefined) { + // Residency-classifiable (id/friendlyId): route to the owning store and let it throw on miss. + const store = this.#routeOrNew(id); + const method = onPrimary ? "findRunOrThrowOnPrimary" : "findRunOrThrow"; + return (store[method] as (...rest: unknown[]) => Promise)(where, args); + } + // Unclassifiable where (e.g. spanId): the run may live on either DB, so fan out NEW-first then + // LEGACY rather than defaulting to NEW — defaulting silently misses legacy-resident runs and + // throws a spurious not-found (must mirror findRun's #findRunUnrouted fan-out). + return this.#findRunOrThrowUnrouted(where, args, onPrimary); + } + + async #findRunOrThrowUnrouted( + where: Prisma.TaskRunWhereInput, + args: unknown, + onPrimary: boolean + ): Promise { + const probe = onPrimary ? 
"findRunOnPrimary" : "findRun"; + const fromNew = await (this.#new[probe] as (...rest: unknown[]) => Promise)( + where, + args + ); + if (fromNew != null) { + return fromNew; + } + // LEGACY is the last leg probed, so it owns the canonical not-found throw when both DBs miss. + const throwMethod = onPrimary ? "findRunOrThrowOnPrimary" : "findRunOrThrow"; + return (this.#legacy[throwMethod] as (...rest: unknown[]) => Promise)(where, args); + } + + // Explicit read-your-writes entry points: route by residency to the owning store's PRIMARY + // (writer), never a replica. A classifiable where routes directly; an unclassifiable one fans + // out NEW→LEGACY on each store's primary (same policy as findRun's fan-out). Each store reads + // its OWN writer, so no control-plane client crosses into another DB. + findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { select: S } + ): Promise | null>; + findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { include: I } + ): Promise | null>; + findRunOnPrimary(where: Prisma.TaskRunWhereInput): Promise; + findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args?: { select?: unknown; include?: unknown } + ): Promise { + const id = idFromWhere(where); + if (id !== undefined) { + const store = this.#routeOrNew(id); + return (store.findRunOnPrimary as (...rest: unknown[]) => Promise)(where, args); + } + return this.#findRunUnrouted(where, args, true); + } + + findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { select: S } + ): Promise>; + findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { include: I } + ): Promise>; + findRunOrThrowOnPrimary(where: Prisma.TaskRunWhereInput): Promise; + findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args?: { select?: unknown; include?: unknown } + ): Promise { + const id = idFromWhere(where); + if (id !== undefined) { + const store = this.#routeOrNew(id); + return (store.findRunOrThrowOnPrimary as (...rest: unknown[]) => Promise)( + 
where, + args + ); + } + return this.#findRunOrThrowUnrouted(where, args, true); + } + + // --------------------------------------------------------------------------- + // run-ops persistence (snapshots / waitpoints / implicit joins / dependents / attempts / + // checkpoints). Mechanical residency-routing delegates so `implements RunStore` is satisfied. + // --------------------------------------------------------------------------- + + // Membership row lives on the run's residency — route by taskRunId. + async createBatchTaskRunItem( + data: { batchTaskRunId: string; taskRunId: string; status: BatchTaskRunItemStatus }, + tx?: PrismaClientOrTransaction + ): Promise { + return (await this.#routeForWrite(data.taskRunId)).createBatchTaskRunItem(data); + } + + // Snapshot reads route by OWNING run id (a SnapshotId is a cuid, NOT classifiable). The owning + // store hydrates `completedWaitpoints` from its own client only, so a cross-DB completing token's + // OUTPUT is silently missing from the resume payload — re-resolve them across BOTH DBs. + async findLatestExecutionSnapshot( + runId: string, + client?: ReadClient + ): Promise | null> { + const owningStore = this.#routeOrNew(runId); + const snapshot = await owningStore.findLatestExecutionSnapshot(runId); + if (snapshot) { + await this.#reresolveCompletedWaitpointsCrossDb( + snapshot as Record, + owningStore + ); + } + return snapshot; + } + + // Recover any cross-DB completed waitpoint MISSING from the owning store's hydration. The + // join (CompletedWaitpoint, co-resident with the snapshot) is the source of truth for which tokens + // completed the run; the owning store can only hydrate the ones that live on its own DB. 
When every + // join id is already present we leave the array untouched (byte-identical for single-DB / the + // co-resident steady state — no extra fan-out write); only genuinely-missing ids are resolved + // cross-DB and appended, so a cuid token completing a ksuid run keeps its OUTPUT on the resume. + async #reresolveCompletedWaitpointsCrossDb( + snapshot: Record, + owningStore: RunStore + ): Promise { + const snapshotId = snapshot.id; + if (typeof snapshotId !== "string") { + return; + } + const completed = Array.isArray(snapshot.completedWaitpoints) + ? (snapshot.completedWaitpoints as Record[]) + : []; + const present = new Set(completed.map((w) => w.id as string)); + // The join is co-resident with the snapshot, so read it from the OWNING store (the snapshot's + // own id is a cuid and would mis-route the both-DB `findSnapshotCompletedWaitpointIds`). + const joinIds = await owningStore.findSnapshotCompletedWaitpointIds(snapshotId); + const missing = joinIds.filter((id) => !present.has(id)); + if (missing.length === 0) { + return; // all completed tokens co-resident → owning-store hydration is complete + } + const recovered = (await this.findManyWaitpoints({ + where: { id: { in: missing } }, + })) as Record[]; + snapshot.completedWaitpoints = [...completed, ...recovered]; + } + + // A snapshot is co-resident with its run, so route by the OWNING run id when the `where` carries + // one (the warm-restart `getExecutionSnapshotsSince` shape — both steps key on `runId`), mirroring + // findLatestExecutionSnapshot. Without a runId (a by-snapshot-id-only lookup, snapshot ids are + // cuids and NOT residency-classifiable) the snapshot can live on either DB, so fan out NEW→LEGACY + // rather than hardcode #new — which strands every cuid run's #legacy snapshots. 
+ async findExecutionSnapshot( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise | null> { + const runId = snapshotWhereRunId(args); + if (runId !== undefined) { + return this.#routeOrNew(runId).findExecutionSnapshot(args); + } + const fromNew = await this.#new.findExecutionSnapshot(args); + return fromNew ?? this.#legacy.findExecutionSnapshot(args); + } + + // Snapshot reads route by OWNING run id; merge both DBs for an open/cross-residency where. + async findManyExecutionSnapshots( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]> { + const runId = snapshotWhereRunId(args); + if (runId !== undefined) { + return this.#routeOrNew(runId).findManyExecutionSnapshots(args); + } + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.findManyExecutionSnapshots(args), + this.#legacy.findManyExecutionSnapshots(args), + ]); + return [...fromNew, ...fromLegacy]; + } + + async createExecutionSnapshot( + input: CreateExecutionSnapshotInput, + tx?: PrismaClientOrTransaction + ): Promise> { + return (await this.#routeOrNewForWrite(input.run.id)).createExecutionSnapshot(input); + } + + // A snapshot lives with its run; route by the snapshot id's residency. + findSnapshotCompletedWaitpointIds(snapshotId: string, client?: ReadClient): Promise { + return this.#routeOrNew(snapshotId).findSnapshotCompletedWaitpointIds(snapshotId); + } + + // Keyed by waitpointId, but the WaitpointRunConnection / CompletedWaitpoint join co-locates with the + // RUN/snapshot — which can be on the OTHER DB from a cross-DB token — so fan out to BOTH stores and + // merge. Dedup by value: a token mirrored onto both DBs during drain can carry the same join + // row on each leg. 
+ async findWaitpointConnectedRunIds(waitpointId: string, client?: ReadClient): Promise { + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.findWaitpointConnectedRunIds(waitpointId), + this.#legacy.findWaitpointConnectedRunIds(waitpointId), + ]); + return uniqueStrings([...fromNew, ...fromLegacy]); + } + + async findWaitpointCompletedSnapshotIds( + waitpointId: string, + client?: ReadClient + ): Promise { + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.findWaitpointCompletedSnapshotIds(waitpointId), + this.#legacy.findWaitpointCompletedSnapshotIds(waitpointId), + ]); + return uniqueStrings([...fromNew, ...fromLegacy]); + } + + async blockRunWithWaitpointEdges(params: { + runId: string; + waitpointIds: string[]; + projectId: string; + spanIdToComplete?: string; + batchId?: string; + batchIndex?: number; + tx?: PrismaClientOrTransaction; + }): Promise { + return (await this.#routeOrNewForWrite(params.runId)).blockRunWithWaitpointEdges(params); + } + + // A run's waitpoints can be scattered across both stores (drain in flight), so count on + // each and sum rather than assume one home. + async countPendingWaitpoints(waitpointIds: string[], client?: ReadClient): Promise { + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.countPendingWaitpoints(waitpointIds), + this.#legacy.countPendingWaitpoints(waitpointIds), + ]); + return fromNew + fromLegacy; + } + + // A waitpoint co-locates with the OWNER it points at, in priority order: an explicit + // `coLocateWithRunId` (a DATETIME/MANUAL wait waitpoint co-locating with the run that blocks on + // it — its minted id is always cuid, so id-shape alone always misroutes it to LEGACY), then a + // RUN-completion owner via `completedByTaskRunId`, then a BATCH owner via + // `completedByBatchId` (the control-plane Waitpoint→BatchTaskRun FK requires it to share the + // batch's DB). Else fall back to the waitpoint's own id-shape. 
+ createWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction, + opts?: WaitpointColocationOptions + ): Promise> { + const data = (args as { data?: unknown }).data; + const ownerRunId = scalarStringField(data, "completedByTaskRunId"); + const ownerBatchId = scalarStringField(data, "completedByBatchId"); + const routeId = + opts?.coLocateWithRunId ?? ownerRunId ?? ownerBatchId ?? RoutingRunStore.#waitpointId(data); + const { store, tx: routedTx } = this.#routeWaitpointWrite(routeId, tx); + return store.createWaitpoint(args, routedTx); + } + + upsertWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction, + opts?: WaitpointColocationOptions + ): Promise> { + // `coLocateWithRunId` (the owning run) wins so a DATETIME/MANUAL wait waitpoint lands on its + // run's DB; otherwise key by create.id (always the minted waitpoint id), then where. + const routeId = + opts?.coLocateWithRunId ?? + RoutingRunStore.#waitpointId((args as { create?: unknown }).create) ?? + RoutingRunStore.#waitpointId((args as { where?: unknown }).where); + const { store, tx: routedTx } = this.#routeWaitpointWrite(routeId, tx); + return store.upsertWaitpoint(args, routedTx); + } + + // Probe by id (drain may have relocated it); an idempotency-key lookup with no id routes by + // `coLocateWithRunId` (the owning run's store — a per-run dedup of a co-resident wait), else + // falls back to NEW-then-LEGACY. + async findWaitpoint( + args: Prisma.SelectSubset, + client?: ReadClient, + opts?: WaitpointColocationOptions + ): Promise | null> { + // A waitpoint's blockingTaskRuns / connectedRuns / completedExecutionSnapshots all co-locate with + // the RUN/snapshot, not the waitpoint (the edge + join rows are written on the run's DB). So the + // store that holds the waitpoint hydrates them from its own client only and MISSES a cross-DB + // target (engine.getWaitpoint includes blockingTaskRuns→taskRun). 
Strip those keys from the + // per-leg query and re-resolve them across BOTH DBs here, mirroring findManyTaskRunWaitpoints. + const { scalarArgs, relations } = splitWaitpointRelationProjection( + args as Record + ); + const id = RoutingRunStore.#waitpointId((args as { where?: unknown }).where); + const store = + id !== undefined + ? await this.#resolveWaitpointStore(id) + : opts?.coLocateWithRunId !== undefined + ? this.#routeOrNew(opts.coLocateWithRunId) + : undefined; + const row = + store !== undefined + ? ((await store.findWaitpoint(scalarArgs as typeof args)) as Record | null) + : (((await this.#new.findWaitpoint(scalarArgs as typeof args)) ?? + (await this.#legacy.findWaitpoint(scalarArgs as typeof args))) as Record< + string, + unknown + > | null); + if (row) { + await this.#reresolveWaitpointRelationsCrossDb(row, relations); + } + return row as Prisma.WaitpointGetPayload | null; + } + + // Read-after-write on the owning store's primary. Only the unblock re-read uses this — a bare + // `{ where: { id } }` with no relation projection — so it routes to the owning store by id and + // delegates, skipping the cross-DB relation re-resolution findWaitpoint does. + async findWaitpointOnPrimary( + args: Prisma.SelectSubset + ): Promise | null> { + const id = RoutingRunStore.#waitpointId((args as { where?: unknown }).where); + const store = id !== undefined ? 
await this.#resolveWaitpointStore(id) : this.#new; + return store.findWaitpointOnPrimary(args); + } + + async findManyWaitpoints( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]> { + const { scalarArgs, relations } = splitWaitpointRelationProjection( + args as Record + ); + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.findManyWaitpoints(scalarArgs as typeof args), + this.#legacy.findManyWaitpoints(scalarArgs as typeof args), + ]); + // A token mirrored onto both DBs during drain appears in BOTH legs; dedup by id with NEW-wins + // (the NEW copy is authoritative once a run migrates), matching the router's NEW-wins invariant + // (#findRunsOpen). Without this, edge-waitpoint hydration could read a stale LEGACY status and + // strand the run. Rows whose projection omits `id` can't be deduped and pass through. + const byId = new Map>(); + const passthrough: Prisma.WaitpointGetPayload[] = []; + for (const w of [...fromLegacy, ...fromNew]) { + const id = (w as { id?: unknown }).id; + if (typeof id === "string") byId.set(id, w); + else passthrough.push(w); + } + const rows = [...byId.values(), ...passthrough]; + for (const row of rows) { + await this.#reresolveWaitpointRelationsCrossDb(row as Record, relations); + } + return rows; + } + + // Re-resolve a waitpoint's group-A relations across BOTH DBs and attach them to `row`. Each target + // co-locates with the RUN/snapshot (the edge + join rows live on the run's DB), so the join is read + // from EACH store and the targets resolved via the router's existing both-DB fan-out. A no-op when no + // group-A relation was requested (the byte-identical scalar path). 
+ async #reresolveWaitpointRelationsCrossDb( + row: Record, + relations: Partial> + ): Promise { + const waitpointId = row.id; + if (typeof waitpointId !== "string") { + return; + } + if ("blockingTaskRuns" in relations) { + row.blockingTaskRuns = await this.#reresolveBlockingTaskRunsCrossDb( + waitpointId, + relations.blockingTaskRuns + ); + } + if ("connectedRuns" in relations) { + row.connectedRuns = await this.#reresolveConnectedRunsCrossDb( + waitpointId, + relations.connectedRuns + ); + } + if ("completedExecutionSnapshots" in relations) { + row.completedExecutionSnapshots = await this.#reresolveCompletedExecutionSnapshotsCrossDb( + waitpointId, + relations.completedExecutionSnapshots + ); + } + } + + // blockingTaskRuns are the TaskRunWaitpoint edges keyed by waitpointId — already a both-DB read with + // an optional nested `taskRun` re-resolved cross-DB (findManyTaskRunWaitpoints). The edge co-locates + // with the run, so a single store misses a cross-DB run's edge; the both-DB read recovers it. + async #reresolveBlockingTaskRunsCrossDb( + waitpointId: string, + projection: SubProjection + ): Promise { + const edgeArgs = projectionAsArgs(projection) ?? {}; + return this.findManyTaskRunWaitpoints({ + ...(edgeArgs as Prisma.TaskRunWaitpointFindManyArgs), + where: { waitpointId }, + }); + } + + // connectedRuns: the WaitpointRunConnection join co-locates with the run, so read the connected run + // ids from EACH store, then resolve the TaskRun rows across BOTH DBs (findRun routes by id). 
+ async #reresolveConnectedRunsCrossDb( + waitpointId: string, + projection: SubProjection + ): Promise { + const runIds = await this.findWaitpointConnectedRunIds(waitpointId); + const findRun = (this.findRun as (...rest: unknown[]) => Promise).bind(this); + const args = projectionAsArgs(projection); + const runs: unknown[] = []; + for (const runId of runIds) { + const run = await findRun({ id: runId }, args); + if (run != null) { + runs.push(run); + } + } + return runs; + } + + // completedExecutionSnapshots: the CompletedWaitpoint join co-locates with the snapshot/run, so read + // the snapshot ids from EACH store, then resolve the snapshot rows across BOTH DBs. + async #reresolveCompletedExecutionSnapshotsCrossDb( + waitpointId: string, + projection: SubProjection + ): Promise { + const snapshotIds = await this.findWaitpointCompletedSnapshotIds(waitpointId); + if (snapshotIds.length === 0) { + return []; + } + const findArgs = projectionAsArgs(projection) ?? {}; + return this.findManyExecutionSnapshots({ + ...(findArgs as Prisma.TaskRunExecutionSnapshotFindManyArgs), + where: { id: { in: snapshotIds } }, + }); + } + + async updateWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction, + opts?: WaitpointColocationOptions + ): Promise> { + // An update keyed by waitpoint id resolves to where the row lives; a `coLocateWithRunId` hint + // (the idempotency-key rotation arm, where the row was just co-located with its run) routes by + // the owning run's store. + const id = RoutingRunStore.#waitpointId((args as { where?: unknown }).where); + const store = + id !== undefined + ? await this.#resolveWaitpointStore(id) + : opts?.coLocateWithRunId !== undefined + ? this.#routeOrNew(opts.coLocateWithRunId) + : await this.#resolveWaitpointStore(undefined); + return store.updateWaitpoint(args, store === this.#legacy ? 
tx : undefined); + } + + async updateManyWaitpoints( + args: Prisma.WaitpointUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const id = RoutingRunStore.#waitpointId(args.where); + if (id !== undefined) { + const store = await this.#resolveWaitpointStore(id); + return store.updateManyWaitpoints(args, store === this.#legacy ? tx : undefined); + } + // No single routable id (batch where): apply to both stores and sum. + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.updateManyWaitpoints(args), + this.#legacy.updateManyWaitpoints(args), + ]); + return { count: fromNew.count + fromLegacy.count }; + } + + // Residency guard: selects the owning store by waitpointId. + async forWaitpointCompletion( + waitpointId: string, + context: ForWaitpointCompletionContext + ): Promise { + // Preferred store: explicit legacy-authority pins first, else the waitpoint's id-shape. + const preferred = + context.treeOwnerResidency === "LEGACY" || + context.isCrossTreeIdempotency === true || + context.hasLegacyParent === true + ? this.#legacy + : this.#classifySafe(waitpointId) === "NEW" + ? this.#new + : this.#legacy; + // Resolve to where the waitpoint ACTUALLY lives: a migrated run's waitpoint can be on NEW + // with a LEGACY-classified id (or vice versa), so verify and fall back rather than route + // by id-shape alone and miss it (which leaves the blocked run stuck forever). + if (await preferred.findWaitpoint({ where: { id: waitpointId } })) { + return preferred; + } + const other = preferred === this.#new ? this.#legacy : this.#new; + if (await other.findWaitpoint({ where: { id: waitpointId } })) { + return other; + } + return preferred; + } + + // An edge (TaskRunWaitpoint) co-locates with its RUN, not its waitpoint, so a read keyed by + // `waitpointId` (the completion fan-out) OR `taskRunId` must query BOTH stores and dedup by + // edge `id` — routing to where the waitpoint lives would miss an edge on the run's DB and + // strand that run forever. 
Dedup is a no-op in steady state; it guards the copy→fence window. + // + // The edge's `waitpoint`/`taskRun` relations can also straddle DBs (a cuid MANUAL/DATETIME token + // blocking a ksuid run; a drain-relocated token). A single store hydrates them from its own + // client only → a cross-DB target resolves to null → the run hangs or its resume + // output is silently dropped. So the router strips those relation keys from the per-leg + // query (scalar edges only) and re-resolves them across BOTH stores here. + async findManyTaskRunWaitpoints( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]> { + const { scalarArgs, waitpoint, taskRun } = splitEdgeRelationProjection( + args as Record + ); + + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.findManyTaskRunWaitpoints(scalarArgs as typeof args), + this.#legacy.findManyTaskRunWaitpoints(scalarArgs as typeof args), + ]); + const edges = dedupeEdgesById([...fromNew, ...fromLegacy]) as Record[]; + + if (waitpoint) { + await this.#hydrateEdgeWaitpointsCrossDb(edges, waitpoint); + } + if (taskRun) { + await this.#hydrateEdgeTaskRunsCrossDb(edges, taskRun); + } + return edges as Prisma.TaskRunWaitpointGetPayload[]; + } + + // Resolve each edge's `waitpoint` from its scalar `waitpointId` across BOTH stores (the token can + // live on either DB). A blocking edge whose waitpoint resolves on NEITHER DB is a hard error: the + // run would otherwise hang forever (or be wrongly treated as completed) on a null status. + async #hydrateEdgeWaitpointsCrossDb( + edges: Record[], + projection: SubProjection + ): Promise { + const ids = uniqueStrings(edges.map((e) => e.waitpointId)); + if (ids.length === 0) { + return; + } + const waitpoints = (await this.findManyWaitpoints({ + where: { id: { in: ids } }, + })) as Record[]; + const byId = new Map(waitpoints.map((w) => [w.id as string, w])); + for (const edge of edges) { + const id = edge.waitpointId as string | undefined; + const wp = id ? 
byId.get(id) : undefined; + if (id && !wp) { + throw new Error( + `findManyTaskRunWaitpoints: blocking waitpoint ${id} (edge ${String( + edge.id + )}) not found on either run-ops DB` + ); + } + edge.waitpoint = applyEdgeProjection(wp ?? null, projection); + } + } + + // Resolve each edge's `taskRun` from its scalar `taskRunId` across BOTH stores (findRun routes by + // id and falls back NEW→LEGACY). A missing run is left null (display-only callers tolerate it; the + // blocked-run resume path keys off `waitpoint`). + async #hydrateEdgeTaskRunsCrossDb( + edges: Record[], + projection: SubProjection + ): Promise { + // Bind to `this`: findRun reaches the private #routeOrNew/#findRunUnrouted members, so an unbound + // reference loses `this` and throws on the first private access. + const findRun = (this.findRun as (...rest: unknown[]) => Promise).bind(this); + const args = projectionAsArgs(projection); + for (const edge of edges) { + const id = edge.taskRunId as string | undefined; + const run = id ? await findRun({ id }, args) : null; + edge.taskRun = applyEdgeProjection((run as Record) ?? null, projection); + } + } + + async deleteManyTaskRunWaitpoints( + args: Prisma.TaskRunWaitpointDeleteManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const where = args.where as { waitpointId?: unknown } | undefined; + const waitpointId = typeof where?.waitpointId === "string" ? where.waitpointId : undefined; + if (waitpointId !== undefined) { + const store = await this.#resolveWaitpointStore(waitpointId); + return store.deleteManyTaskRunWaitpoints(args, store === this.#legacy ? tx : undefined); + } + // Keyed by taskRunId (or other): a run's edges may straddle DBs mid-drain, so delete from + // both. Can't span one tx across two DBs, so it's dropped for the both-stores path. 
+ const [fromNew, fromLegacy] = await Promise.all([ + this.#new.deleteManyTaskRunWaitpoints(args), + this.#legacy.deleteManyTaskRunWaitpoints(args), + ]); + return { count: fromNew.count + fromLegacy.count }; + } + + findTaskRunAttempt( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise | null> { + const runId = whereFieldString(args.where?.taskRunId as Prisma.TaskRunWhereInput["id"]); + if (runId !== undefined) { + // Residency-classifiable run id present: route to the owning store. Never forward the + // caller's client (it is the control-plane handle); the owning store reads its OWN DB. + return this.#routeOrNew(runId).findTaskRunAttempt(args); + } + // No classifiable run id (no taskRunId, or complex filter): fan out NEW-first → LEGACY. + return this.#findTaskRunAttemptUnrouted(args); + } + + async #findTaskRunAttemptUnrouted( + args: Prisma.SelectSubset + ): Promise | null> { + const fromNew = await this.#new.findTaskRunAttempt(args); + if (fromNew != null) { + return fromNew; + } + return this.#legacy.findTaskRunAttempt(args); + } + + // Co-locate the checkpoint with its OWNING run so the run-routed snapshot's `checkpointId` FK + // resolves on the same DB. Route by `ownerRunId`; tx forwards only to LEGACY. + async createTaskRunCheckpoint( + args: Prisma.SelectSubset, + ownerRunId?: string, + tx?: PrismaClientOrTransaction + ): Promise> { + const store = this.#routeOrNew(ownerRunId); + return store.createTaskRunCheckpoint(args, ownerRunId, store === this.#legacy ? tx : undefined); + } + + // --------------------------------------------------------------------------- + // BatchTaskRun (run-ops). Route by id-shape: ksuid→NEW, cuid→LEGACY. + // --------------------------------------------------------------------------- + + async createBatchTaskRun( + data: CreateBatchTaskRunData, + tx?: PrismaClientOrTransaction + ): Promise { + // Route by the batch's classifiable internal id: ksuid→NEW, cuid→LEGACY. 
+ // Never forward a control-plane tx to NEW (the create would land in the wrong DB, stranding the + // ksuid batch + its co-resident child runs/items); forward tx only to LEGACY (same physical DB + // as the tx). Mirrors #routeWaitpointWrite / updateBatchTaskRun. + const store = await this.#routeOrNewForWrite(data.id); + return store.createBatchTaskRun(data, store === this.#legacy ? tx : undefined); + } + + updateBatchTaskRun( + args: { + where: Prisma.BatchTaskRunWhereUniqueInput; + data: Prisma.BatchTaskRunUpdateInput; + select: S; + }, + tx?: PrismaClientOrTransaction + ): Promise> { + const id = + typeof args.where.id === "string" ? args.where.id : (args.where.friendlyId ?? undefined); + // Never forward a control-plane tx to NEW (it would update the wrong DB and the row would + // not be found); forward tx only to LEGACY (same physical DB as the tx). Mirrors #routeWaitpointWrite. + const store = this.#routeOrNew(id); + return store.updateBatchTaskRun(args, store === this.#legacy ? tx : undefined); + } + + // Batches can be written to either DB by different create paths (runEngine routes by id; + // batchTriggerV3 writes raw to the control-plane), so probe NEW first then LEGACY rather + // than strict id-routing, which would miss a ksuid-id batch resident on the control-plane. + async findBatchTaskRunById( + id: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null> { + // Never forward the caller's client (it is the control-plane handle): a cross-DB probe with one + // shared client can only reach one DB, so each sub-store must read its OWN DB (5434 vs 5432). + const fromNew = await this.#new.findBatchTaskRunById(id, args); + if (fromNew != null) return fromNew; + return this.#legacy.findBatchTaskRunById(id, args); + } + + // Env-scoped friendlyId probe; no id-routing because cuid-on-NEW window batches exist. 
+ async findBatchTaskRunByFriendlyId( + friendlyId: string, + environmentId: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null> { + // Never forward the caller's client (control-plane handle): each sub-store reads its OWN DB. + const fromNew = await this.#new.findBatchTaskRunByFriendlyId(friendlyId, environmentId, args); + if (fromNew != null) return fromNew; + return this.#legacy.findBatchTaskRunByFriendlyId(friendlyId, environmentId, args); + } + + // --------------------------------------------------------------------------- + // Batch residency — route every batch op by the batch id so a ksuid + // batch + its items co-reside on NEW with its child runs (the TaskRun.batchId and + // BatchTaskRunItem.batchTaskRunId FKs resolve locally). + // --------------------------------------------------------------------------- + + // Idempotency probe — no classifiable id (env+key), so fan out NEW→LEGACY. + async findBatchTaskRunByIdempotencyKey( + environmentId: string, + idempotencyKey: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null> { + // Never forward the caller's client (control-plane handle): each sub-store reads its OWN DB. + const fromNew = await this.#new.findBatchTaskRunByIdempotencyKey( + environmentId, + idempotencyKey, + args + ); + if (fromNew != null) return fromNew; + return this.#legacy.findBatchTaskRunByIdempotencyKey(environmentId, idempotencyKey, args); + } + + // Route by `where.id` when scalar; else (e.g. status filter) fan out to both and sum. + async updateManyBatchTaskRun( + args: Prisma.BatchTaskRunUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const id = RoutingRunStore.#scalarId(args.where); + if (id !== undefined) { + const store = this.#routeOrNew(id); + return store.updateManyBatchTaskRun(args, store === this.#legacy ? 
tx : undefined); + } + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.updateManyBatchTaskRun(args), + this.#legacy.updateManyBatchTaskRun(args), + ]); + return { count: fromNew.count + fromLegacy.count }; + } + + // Items co-reside with their batch — route by `batchTaskRunId`, no fan-out. + countBatchTaskRunItems( + where: { batchTaskRunId: string; status?: BatchTaskRunItemStatus }, + client?: ReadClient + ): Promise { + // Never forward the caller's client (it is the control-plane handle): a ksuid batch routes to + // NEW, so a forwarded control-plane client would count items on the wrong DB (→ 0/wrong count). + // The routed store reads its OWN DB. Mirrors the probe-read sweep. + return this.#routeOrNew(where.batchTaskRunId).countBatchTaskRunItems(where); + } + + // Route by item `id` or `batchTaskRunId` when scalar; else fan out to both and sum. + async updateManyBatchTaskRunItems( + args: Prisma.BatchTaskRunItemUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise { + const id = + RoutingRunStore.#scalarId(args.where) ?? + RoutingRunStore.#scalarField(args.where, "batchTaskRunId"); + if (id !== undefined) { + const store = this.#routeOrNew(id); + return store.updateManyBatchTaskRunItems(args, store === this.#legacy ? tx : undefined); + } + const [fromNew, fromLegacy] = await Promise.all([ + this.#new.updateManyBatchTaskRunItems(args), + this.#legacy.updateManyBatchTaskRunItems(args), + ]); + return { count: fromNew.count + fromLegacy.count }; + } + + // Extract a scalar string `id` from a `{ id }` / `{ id: { equals } }` where; undefined otherwise. 
+ static #scalarId(where: unknown): string | undefined { + return RoutingRunStore.#scalarField(where, "id"); + } + + static #scalarField(where: unknown, field: string): string | undefined { + if (!where || typeof where !== "object") return undefined; + const value = (where as Record)[field]; + if (typeof value === "string") return value; + if (value && typeof value === "object" && "equals" in value) { + const eq = (value as { equals?: unknown }).equals; + return typeof eq === "string" ? eq : undefined; + } + return undefined; + } +} + +// Distinguish a select/include args object from a ReadClient in the overloaded read +// signatures: only an args object carries `select`/`include`. Returns the args (to pass +// through) or undefined (so the routed store uses its own client), never the client. +function selectOrIncludeArgs( + argsOrClient: { select?: unknown; include?: unknown } | unknown +): { select?: unknown; include?: unknown } | undefined { + if ( + argsOrClient && + typeof argsOrClient === "object" && + ("select" in argsOrClient || "include" in argsOrClient) + ) { + return argsOrClient as { select?: unknown; include?: unknown }; + } + return undefined; +} + +// Writers (PrismaClient / RunOpsPrismaClient) expose `$transaction`; a read replica +// (PrismaReplicaClient) does not. That is the identity the routing store keys read-your-writes on. +function isWriterClient(value: unknown): boolean { + return ( + !!value && + typeof value === "object" && + typeof (value as { $transaction?: unknown }).$transaction === "function" + ); +} + +// A read-your-writes call passes a WRITER (the just-written run must be read back before the +// replica has it). Recover the caller's client from the overloaded read args — slot two when it +// isn't a `{ select | include }` object, else slot three — and report whether it is a writer. 
function readYourWrites(
  argsOrClient: { select?: unknown; include?: unknown } | ReadClient | unknown,
  client: ReadClient | undefined
): boolean {
  // When slot two is a `{ select | include }` args object the caller's client (if any) sits in
  // slot three; otherwise slot two IS the caller's client.
  const slotTwoIsArgs = selectOrIncludeArgs(argsOrClient) !== undefined;
  return isWriterClient(slotTwoIsArgs ? client : argsOrClient);
}

// Return `data[field]` when `data` is an object and the value is a plain string (e.g.
// `data.completedByTaskRunId`); undefined for non-objects, missing keys and non-string values.
function scalarStringField(data: unknown, field: string): string | undefined {
  if (typeof data !== "object" || data === null) {
    return undefined;
  }
  const candidate = (data as Record<string, unknown>)[field];
  return typeof candidate === "string" ? candidate : undefined;
}

// Reduce a where-field to a scalar string: accepts the bare-string form and the `{ equals: "…" }`
// filter form. Any other filter shape (in / not / contains / …) yields undefined.
function whereFieldString(field: Prisma.TaskRunWhereInput["id"]): string | undefined {
  if (typeof field === "string") {
    return field;
  }
  if (typeof field === "object" && field !== null && "equals" in field) {
    const { equals } = field;
    return typeof equals === "string" ? equals : undefined;
  }
  return undefined;
}

// Pull a scalar `runId` out of a snapshot find's `args.where` (bare string or `{ equals }`).
// Returns undefined when `args` / `args.where` is not an object or carries no scalar runId.
function snapshotWhereRunId(args: unknown): string | undefined {
  if (!args || typeof args !== "object") {
    return undefined;
  }
  const { where } = args as { where?: unknown };
  if (!where || typeof where !== "object") {
    return undefined;
  }
  return whereFieldString((where as { runId?: Prisma.TaskRunWhereInput["id"] }).runId);
}

// The id a TaskRun `where` should be routed by: the internal `id` when it is scalar, otherwise
// the scalar `friendlyId`. Undefined when neither field reduces to a single string.
function idFromWhere(where: Prisma.TaskRunWhereInput): string | undefined {
  const byInternalId = whereFieldString(where.id);
  if (byInternalId !== undefined) {
    return byInternalId;
  }
  return whereFieldString(where.friendlyId);
}

type FindRunsArgs = {
  where: Prisma.TaskRunWhereInput;
  select?: unknown;
  include?: unknown;
  orderBy?: Prisma.TaskRunOrderByWithRelationInput | Prisma.TaskRunOrderByWithRelationInput[];
  take?: number;
  skip?: number;
  cursor?: Prisma.TaskRunWhereUniqueInput;
};

// The bounded set of internal ids a `where` targets, or undefined for an open predicate. Only
// `where.id` is inspected: a bare string, `{ in: [...] }` or `{ equals }`.
function idListFromWhere(where: Prisma.TaskRunWhereInput): string[] | undefined {
  const { id } = where;
  if (typeof id === "string") {
    return [id];
  }
  if (typeof id !== "object" || id === null) {
    return undefined;
  }
  if ("in" in id && Array.isArray(id.in)) {
    // All-or-nothing: a single non-string entry makes the whole list unusable as an id set.
    const candidates: unknown[] = id.in;
    return candidates.every((x): x is string => typeof x === "string")
      ? [...candidates]
      : undefined;
  }
  return "equals" in id && typeof id.equals === "string" ? [id.equals] : undefined;
}

// Copy `args` with `where.id` replaced by `{ in: ids }`; every other where-key and arg is kept.
function narrowToIds(args: FindRunsArgs, ids: string[]): FindRunsArgs {
  const where: Prisma.TaskRunWhereInput = { ...args.where, id: { in: ids } };
  return { ...args, where };
}

// Collapse rows that share a string `id` to one entry. The LAST occurrence in `rows` replaces
// earlier ones (it keeps the slot of the first occurrence), so a caller that wants a particular
// store's copy to win must place that store's rows last. Rows without a string `id` cannot be
// deduped and are appended unchanged after the deduped rows.
function dedupeEdgesById<R>(rows: R[]): R[] {
  const latestById = new Map<string, R>();
  const withoutId: R[] = [];
  rows.forEach((row) => {
    const { id } = row as { id?: unknown };
    if (typeof id === "string") {
      latestById.set(id, row);
    } else {
      withoutId.push(row);
    }
  });
  return [...latestById.values(), ...withoutId];
}

// A caller sub-select for an edge relation: `{ select?, include? }`, `true` for a bare `key: true`,
// or undefined when not requested.
type SubProjection = { select?: any; include?: any } | true | undefined;

/**
 * Split a TaskRunWaitpoint `findMany` args into the scalar args sent to each leg (the
 * `waitpoint`/`taskRun` relation keys removed, the keying scalars + `id` ensured present) and
 * the requested relation sub-projections the router resolves cross-DB.
 *
 * @param args The caller's `findMany` args; never mutated.
 * @returns `scalarArgs` for the per-store query plus the `waitpoint` / `taskRun`
 *   sub-projections (undefined when the relation was not requested).
 */
function splitEdgeRelationProjection(args: Record<string, unknown>): {
  scalarArgs: Record<string, unknown>;
  waitpoint: SubProjection;
  taskRun: SubProjection;
} {
  const select = args.select as Record<string, unknown> | undefined;
  const include = args.include as Record<string, unknown> | undefined;

  if (select && ("waitpoint" in select || "taskRun" in select)) {
    const { waitpoint, taskRun, ...rest } = select;
    return {
      // Keep `id` (dedupe) and the keying scalars (cross-DB hydration) through a narrowed select.
      scalarArgs: {
        ...args,
        select: { ...rest, id: true, waitpointId: true, taskRunId: true },
      },
      waitpoint: waitpoint as SubProjection,
      taskRun: taskRun as SubProjection,
    };
  }
  if (include && ("waitpoint" in include || "taskRun" in include)) {
    const { waitpoint, taskRun, ...rest } = include;
    // Drop `include` entirely when the edge relations were its only keys.
    const restInclude = Object.keys(rest).length > 0 ? { include: rest } : {};
    const { include: _drop, ...base } = args;
    return {
      scalarArgs: { ...base, ...restInclude },
      waitpoint: waitpoint as SubProjection,
      taskRun: taskRun as SubProjection,
    };
  }
  // No edge relation requested: pass the args through unchanged (byte-identical scalar path).
  return { scalarArgs: args, waitpoint: undefined, taskRun: undefined };
}

// The Waitpoint group-A relation keys whose TARGETS co-locate with the RUN/snapshot, not the
// waitpoint, so a single store hydrates them from its own client and MISSES a cross-DB target.
// The router strips these from the per-leg query and re-resolves them across BOTH DBs.
const WAITPOINT_RELATION_KEYS = [
  "blockingTaskRuns",
  "connectedRuns",
  "completedExecutionSnapshots",
] as const;
type WaitpointRelationKey = (typeof WAITPOINT_RELATION_KEYS)[number];

// Pull the group-A relation keys out of a `select`/`include` object. Returns the remaining
// keys (`rest`) and the removed sub-projections (`relations`); the input is not mutated.
function extractWaitpointRelations(projection: Record<string, unknown>): {
  rest: Record<string, unknown>;
  relations: Partial<Record<WaitpointRelationKey, SubProjection>>;
} {
  const rest: Record<string, unknown> = { ...projection };
  const relations: Partial<Record<WaitpointRelationKey, SubProjection>> = {};
  for (const key of WAITPOINT_RELATION_KEYS) {
    if (key in rest) {
      relations[key] = rest[key] as SubProjection;
      delete rest[key];
    }
  }
  return { rest, relations };
}

/**
 * Split a Waitpoint `findFirst`/`findMany` args into the scalar args sent to each leg (the
 * group-A relation keys removed, `id` kept so the router can re-attach) and the requested
 * relation sub-projections the router resolves cross-DB. Mirrors splitEdgeRelationProjection.
 *
 * @param args The caller's find args; never mutated.
 * @returns `scalarArgs` for the per-store query and the `relations` that were stripped
 *   (empty when none was requested, in which case `scalarArgs` is `args` itself).
 */
function splitWaitpointRelationProjection(args: Record<string, unknown>): {
  scalarArgs: Record<string, unknown>;
  relations: Partial<Record<WaitpointRelationKey, SubProjection>>;
} {
  const select = args.select as Record<string, unknown> | undefined;
  const include = args.include as Record<string, unknown> | undefined;

  if (select && WAITPOINT_RELATION_KEYS.some((k) => k in select)) {
    const { rest, relations } = extractWaitpointRelations(select);
    // Keep `id` so the router can re-attach the re-resolved relations to the row.
    return { scalarArgs: { ...args, select: { ...rest, id: true } }, relations };
  }
  if (include && WAITPOINT_RELATION_KEYS.some((k) => k in include)) {
    const { rest, relations } = extractWaitpointRelations(include);
    // Drop `include` entirely when the group-A relations were its only keys.
    const restInclude = Object.keys(rest).length > 0 ? { include: rest } : {};
    const { include: _drop, ...base } = args;
    return { scalarArgs: { ...base, ...restInclude }, relations };
  }
  // No group-A relation requested: pass through unchanged (byte-identical scalar path).
  return { scalarArgs: args, relations: {} };
}

// Apply an edge relation sub-projection to a hydrated row so only requested fields remain (mirrors
// PostgresRunStore.applyProjection; a `true`/undefined projection returns the full row).
/**
 * Narrow a hydrated relation row to the fields a sub-projection's `select` asks for.
 *
 * @param row The hydrated row, or null when the relation target was not found.
 * @param projection The caller's sub-projection for the relation.
 * @returns `row` itself when it is null or the projection has no `select` (`true`,
 *   undefined, or include-only); otherwise a new object holding only the truthy `select` keys.
 */
function applyEdgeProjection(
  row: Record<string, unknown> | null,
  projection: SubProjection
): Record<string, unknown> | null {
  if (!row || projection === true || projection === undefined || !projection.select) {
    return row;
  }
  const out: Record<string, unknown> = {};
  for (const k of Object.keys(projection.select)) {
    // `key: false` entries are skipped, matching a select that opts a field out.
    if (projection.select[k]) {
      out[k] = row[k];
    }
  }
  return out;
}

// Convert an edge relation sub-projection into `findRun`/`findRuns`-shaped args ({select}/{include}).
// A bare `true` or an absent projection carries no args, so it maps to undefined.
function projectionAsArgs(projection: SubProjection): { select?: any; include?: any } | undefined {
  if (projection === true || projection === undefined) {
    return undefined;
  }
  return projection;
}

// The distinct string members of `values`, in first-seen order; non-strings are dropped.
function uniqueStrings(values: unknown[]): string[] {
  return [...new Set(values.filter((v): v is string => typeof v === "string"))];
}

// Fields the in-memory merge needs in every row: `id` (membership/dedupe) plus each scalar
// `orderBy` field (the merge re-sorts in memory, so the field must be present in the row —
// Prisma would otherwise sort it in the DB without projecting it).
function requiredProjectionFields(args: FindRunsArgs): string[] {
  const fields = new Set<string>(["id"]);
  if (args.orderBy) {
    for (const clause of Array.isArray(args.orderBy) ? args.orderBy : [args.orderBy]) {
      for (const [field, dir] of Object.entries(clause)) {
        // Only plain "asc"/"desc" directions count; object-valued (relation) clauses are skipped.
        if (dir === "asc" || dir === "desc") fields.add(field);
      }
    }
  }
  return [...fields];
}

// Guarantee the required fields are projected, returning the ones we ADDED so finalizeRows
// can strip them back out (the caller didn't ask for them).
/**
 * Make sure every field the in-memory merge needs is part of the projection.
 *
 * Only a `select` projection can omit scalars, so `include`-style or unprojected args pass
 * through untouched.
 *
 * @returns The (possibly widened) args and the fields that were added on the caller's behalf.
 */
function ensureProjected(args: FindRunsArgs): { args: FindRunsArgs; addedFields: string[] } {
  if (args.include || !args.select) return { args, addedFields: [] };
  const select = args.select as Record<string, unknown>;
  const nextSelect = { ...select };
  const added: string[] = [];
  for (const field of requiredProjectionFields(args)) {
    if (!select[field]) {
      nextSelect[field] = true;
      added.push(field);
    }
  }
  return added.length === 0
    ? { args, addedFields: [] }
    : { args: { ...args, select: nextSelect }, addedFields: added };
}

// Each store must return enough rows for the post-merge `orderBy`/`take`/`skip` to be
// re-imposed globally: drop `skip` and widen `take` to `skip + take` per store.
function widenForMerge(args: FindRunsArgs): FindRunsArgs {
  if (args.take == null && !args.skip) return args;
  const { skip, take, ...rest } = args;
  return { ...rest, take: take == null ? undefined : (skip ?? 0) + take };
}

/**
 * Re-impose the caller's `orderBy` / `skip` / `take` on the merged rows and strip the
 * fields that were only projected for the merge.
 *
 * @param rows Merged rows from both stores.
 * @param args The caller's ORIGINAL args (not the widened per-store args).
 * @param addedFields Fields added by `ensureProjected`, removed from each returned row.
 */
function finalizeRows(
  rows: Array<Record<string, unknown>>,
  args: FindRunsArgs,
  addedFields: string[]
): unknown[] {
  let out = args.orderBy ? sortByOrderBy(rows, args.orderBy) : rows;
  const skip = args.skip ?? 0;
  if (skip > 0 || args.take != null) {
    out = out.slice(skip, args.take != null ? skip + args.take : undefined);
  }
  if (addedFields.length === 0) return out;
  return out.map((row) => {
    const copy = { ...row };
    for (const field of addedFields) delete copy[field];
    return copy;
  });
}

// Sort a copy of `rows` by the scalar clauses of `orderBy`, in clause order. The input array
// is left untouched; with no scalar clause the input is returned as-is.
function sortByOrderBy(
  rows: Array<Record<string, unknown>>,
  orderBy: NonNullable<FindRunsArgs["orderBy"]>
): Array<Record<string, unknown>> {
  const clauses = Array.isArray(orderBy) ? orderBy : [orderBy];
  const specs: Array<{ field: string; dir: "asc" | "desc" }> = [];
  for (const clause of clauses) {
    for (const [field, dir] of Object.entries(clause)) {
      // Scalar fields only; relation/_count orderBy carries an object value and can't be
      // re-sorted in memory — left to the per-store order.
      if (dir === "asc" || dir === "desc") specs.push({ field, dir });
    }
  }
  if (specs.length === 0) return rows;
  return [...rows].sort((a, b) => {
    for (const { field, dir } of specs) {
      const cmp = compareValues(a[field], b[field]);
      if (cmp !== 0) return dir === "asc" ? cmp : -cmp;
    }
    return 0;
  });
}

// Scalar comparator matching Postgres byte/C-collation order for the ASCII id/friendlyId
// columns and natural order for Date/number/bigint. Null/undefined compare as GREATER than any
// value, which is Postgres' default (NULLS LAST for asc, NULLS FIRST for desc). The merged order
// must agree with the order each store used to pick its page, or the global slice is wrong.
function compareValues(a: unknown, b: unknown): number {
  if (a == null && b == null) return 0;
  if (a == null) return 1;
  if (b == null) return -1;
  if (a instanceof Date && b instanceof Date) return a.getTime() - b.getTime();
  if (typeof a === "number" && typeof b === "number") return a - b;
  if (typeof a === "bigint" && typeof b === "bigint") return a < b ? -1 : a > b ? 1 : 0;
  const as = String(a);
  const bs = String(b);
  return as < bs ? -1 : as > bs ? 1 : 0;
}

// diff --git a/internal-packages/run-store/src/runOpsStore.waitpoints.test.ts b/internal-packages/run-store/src/runOpsStore.waitpoints.test.ts new file mode 100644 index 00000000000..9017d8c85b0 --- /dev/null +++ b/internal-packages/run-store/src/runOpsStore.waitpoints.test.ts @@ -0,0 +1,622 @@
// RunStore run-ops persistence — waitpoints, against the REAL dedicated split topology.
//
// `heteroRunOpsPostgresTest` gives prisma14 = the full control-plane schema (#legacy) and
// prisma17 = a real `RunOpsPrismaClient` over the @internal/run-ops-database SUBSET schema (#new).
// These were previously on the weaker `heteroPostgresTest` (full schema on BOTH sides), which could
// not catch dedicated-subset behaviour differences — the entire point of the split.
// On the subset
// there are no Organization/Project/RuntimeEnvironment models and the implicit M2M join tables
// (`_WaitpointRunConnections`) are replaced by the explicit FK-free `WaitpointRunConnection` model,
// so the store's blocking/completion paths must behave identically whether backed by the legacy
// implicit M2M or the dedicated explicit join.

import { heteroRunOpsPostgresTest, HETERO_PINNED_ICU_COLLATION } from "@internal/testcontainers";
import type { PrismaClient } from "@trigger.dev/database";
import type { RunOpsPrismaClient } from "@internal/run-ops-database";
import { describe, expect } from "vitest";
import { PostgresRunStore } from "./PostgresRunStore.js";
import { RoutingRunStore } from "./runOpsStore.js";
import type { CreateRunInput, RunStoreSchemaVariant } from "./types.js";

type AnyClient = PrismaClient | RunOpsPrismaClient;

// ownerEngine classifies by internal-id LENGTH after stripping a single leading `_`: 27 chars
// → ksuid → NEW (#new / dedicated subset), 25 chars → cuid → LEGACY (#legacy / full schema).
const KSUID_27 = "k".repeat(27);
const CUID_25 = "c".repeat(25);

// On the dedicated subset there are no Organization/Project/RuntimeEnvironment models (the run-ops
// rows carry FK-free scalar ids), so we mint synthetic owning ids. On legacy we seed the real rows
// the kept FKs require.
async function seedEnvironment(
  prisma: AnyClient,
  schemaVariant: RunStoreSchemaVariant,
  slugSuffix: string
) {
  if (schemaVariant === "dedicated") {
    return {
      organization: { id: `org_${slugSuffix}` },
      project: { id: `proj_${slugSuffix}` },
      environment: { id: `env_${slugSuffix}` },
    };
  }
  const organization = await (prisma as PrismaClient).organization.create({
    data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` },
  });
  const project = await (prisma as PrismaClient).project.create({
    data: {
      name: `Project ${slugSuffix}`,
      slug: `project-${slugSuffix}`,
      externalRef: `proj_${slugSuffix}`,
      organizationId: organization.id,
    },
  });
  const environment = await (prisma as PrismaClient).runtimeEnvironment.create({
    data: {
      type: "DEVELOPMENT",
      slug: "dev",
      projectId: project.id,
      organizationId: organization.id,
      apiKey: `tr_dev_${slugSuffix}`,
      pkApiKey: `pk_dev_${slugSuffix}`,
      shortcode: `short_${slugSuffix}`,
    },
  });
  return { organization, project, environment };
}

// Build a minimal V2 PENDING run + RUN_CREATED snapshot create input for the given ids.
function buildCreateRunInput(params: {
  runId: string;
  friendlyId: string;
  taskIdentifier: string;
  organizationId: string;
  projectId: string;
  runtimeEnvironmentId: string;
  parentTaskRunId?: string;
  rootTaskRunId?: string;
}): CreateRunInput {
  return {
    data: {
      id: params.runId,
      engine: "V2",
      status: "PENDING",
      friendlyId: params.friendlyId,
      runtimeEnvironmentId: params.runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      organizationId: params.organizationId,
      projectId: params.projectId,
      taskIdentifier: params.taskIdentifier,
      payload: '{"hello":"world"}',
      payloadType: "application/json",
      context: { foo: "bar" },
      traceContext: { trace: "ctx" },
      traceId: "trace_1",
      spanId: "span_1",
      runTags: ["alpha", "beta"],
      queue: "task/my-task",
      isTest: false,
      taskEventStore: "taskEvent",
      depth: 0,
      createdAt: new Date("2024-01-01T00:00:00.000Z"),
      ...(params.parentTaskRunId && { parentTaskRunId: params.parentTaskRunId }),
      ...(params.rootTaskRunId && { rootTaskRunId: params.rootTaskRunId }),
    },
    snapshot: {
      engine: "V2",
      executionStatus: "RUN_CREATED",
      description: "Run was created",
      runStatus: "PENDING",
      environmentId: params.runtimeEnvironmentId,
      environmentType: "DEVELOPMENT",
      projectId: params.projectId,
      organizationId: params.organizationId,
    },
  };
}

// Insert a PENDING waitpoint directly through the given client (bypassing the store).
async function seedPendingWaitpoint(
  prisma: AnyClient,
  params: {
    id: string;
    friendlyId: string;
    projectId: string;
    environmentId: string;
    type?: "MANUAL" | "RUN";
    completedByTaskRunId?: string;
  }
) {
  return (prisma as PrismaClient).waitpoint.create({
    data: {
      id: params.id,
      friendlyId: params.friendlyId,
      type: params.type ?? "MANUAL",
      status: "PENDING",
      idempotencyKey: `idem_${params.id}`,
      userProvidedIdempotencyKey: false,
      projectId: params.projectId,
      environmentId: params.environmentId,
      ...(params.completedByTaskRunId && { completedByTaskRunId: params.completedByTaskRunId }),
    },
  });
}

// A PostgresRunStore whose writer and read-only clients are the same `prisma`.
function makeStore(prisma: AnyClient, schemaVariant: RunStoreSchemaVariant) {
  return new PostgresRunStore({
    prisma: prisma as never,
    readOnlyPrisma: prisma as never,
    schemaVariant,
  });
}

// Count the run↔waitpoint connection rows for (runId, waitpointId), reading from whichever physical
// connection table the store writes: the implicit `_WaitpointRunConnections` M2M on #legacy, the
// explicit FK-free `WaitpointRunConnection` model on the dedicated #new subset.
async function countConnection(
  prisma: AnyClient,
  schemaVariant: RunStoreSchemaVariant,
  runId: string,
  waitpointId: string
): Promise<number> {
  const rows =
    schemaVariant === "dedicated"
      ? await (prisma as PrismaClient).$queryRawUnsafe<{ count: bigint }[]>(
          `SELECT COUNT(*)::bigint as count FROM "WaitpointRunConnection" WHERE "taskRunId" = '${runId}' AND "waitpointId" = '${waitpointId}'`
        )
      : await (prisma as PrismaClient).$queryRawUnsafe<{ count: bigint }[]>(
          `SELECT COUNT(*)::bigint as count FROM "_WaitpointRunConnections" WHERE "A" = '${runId}' AND "B" = '${waitpointId}'`
        );
  return Number(rows.at(0)?.count ?? 0);
}

// Strip per-DB / prisma-managed fields so completed waitpoint rows compare field-for-field.
function normalizeWaitpoint(row: Record<string, unknown>) {
  const r = { ...row };
  delete r.id;
  delete r.friendlyId;
  delete r.idempotencyKey;
  delete r.completedAt;
  delete r.createdAt;
  delete r.updatedAt;
  delete r.projectId;
  delete r.environmentId;
  return r;
}

describe("RunStore run-ops persistence — waitpoints", () => {
  // a PENDING waitpoint blocked then completed via the store yields a behaviourally-identical
  // completed row on #legacy (full schema) and #new (dedicated subset).
  heteroRunOpsPostgresTest(
    "waitpoint complete is behaviourally identical across #legacy and #new",
    async ({ prisma14, prisma17 }) => {
      const completedAt = new Date("2024-02-02T00:00:00.000Z");

      const run = async (
        prisma: AnyClient,
        schemaVariant: RunStoreSchemaVariant,
        runId: string,
        suffix: string
      ) => {
        const store = makeStore(prisma, schemaVariant);
        const env = await seedEnvironment(prisma, schemaVariant, suffix);
        await store.createRun(
          buildCreateRunInput({
            runId,
            friendlyId: `run_friendly_wa_${suffix}`,
            taskIdentifier: "my-task",
            organizationId: env.organization.id,
            projectId: env.project.id,
            runtimeEnvironmentId: env.environment.id,
          })
        );
        const w = `wp_${suffix}`;
        await seedPendingWaitpoint(prisma, {
          id: w,
          friendlyId: `waitpoint_${suffix}`,
          projectId: env.project.id,
          environmentId: env.environment.id,
        });

        await store.blockRunWithWaitpointEdges({
          runId,
          waitpointIds: [w],
          projectId: env.project.id,
        });
        await store.updateManyWaitpoints({
          where: { id: w },
          data: {
            status: "COMPLETED",
            output: '{"done":true}',
            outputType: "application/json",
            completedAt,
          },
        });

        return store.findWaitpoint({ where: { id: w } });
      };

      const wp14 = await run(prisma14, "legacy", `run_${CUID_25}`, "wa14");
      const wp17 = await run(prisma17, "dedicated", `run_${KSUID_27}`, "wa17");

      expect(wp14).not.toBeNull();
      expect(wp17).not.toBeNull();
      expect(wp14!.status).toBe("COMPLETED");
      expect(wp17!.status).toBe("COMPLETED");
      expect(wp14!.completedAt?.toISOString()).toBe(completedAt.toISOString());
      expect(wp17!.completedAt?.toISOString()).toBe(completedAt.toISOString());
      expect(normalizeWaitpoint(wp14 as Record<string, unknown>)).toEqual(
        normalizeWaitpoint(wp17 as Record<string, unknown>)
      );
    }
  );

  // the blocking CTE writes exactly one TaskRunWaitpoint + one connection edge (the implicit
  // `_WaitpointRunConnections` on #legacy, the explicit `WaitpointRunConnection` on #new), is
  // idempotent on a re-run (ON CONFLICT DO NOTHING), and countPendingWaitpoints (the separate MVCC
  // statement) flips 1 → 0 across the completion — identically on both stores.
  heteroRunOpsPostgresTest(
    "blocking CTE round-trips idempotently and pending-count reflects completion",
    async ({ prisma14, prisma17 }) => {
      const run = async (
        prisma: AnyClient,
        schemaVariant: RunStoreSchemaVariant,
        runId: string,
        suffix: string
      ) => {
        const store = makeStore(prisma, schemaVariant);
        const env = await seedEnvironment(prisma, schemaVariant, suffix);
        await store.createRun(
          buildCreateRunInput({
            runId,
            friendlyId: `run_friendly_wb_${suffix}`,
            taskIdentifier: "my-task",
            organizationId: env.organization.id,
            projectId: env.project.id,
            runtimeEnvironmentId: env.environment.id,
          })
        );
        const w = `wp_${suffix}`;
        await seedPendingWaitpoint(prisma, {
          id: w,
          friendlyId: `waitpoint_${suffix}`,
          projectId: env.project.id,
          environmentId: env.environment.id,
        });

        const countEdges = async () => {
          const trw = await store.findManyTaskRunWaitpoints({ where: { taskRunId: runId } });
          const conn = await countConnection(prisma, schemaVariant, runId, w);
          return { trw: trw.length, conn };
        };

        // Pass an explicit batchIndex so the `@@unique([taskRunId, waitpointId, batchIndex])`
        // index engages and the CTE's `ON CONFLICT DO NOTHING` genuinely dedupes the
        // TaskRunWaitpoint row. (With a NULL batchIndex, NULLs are distinct in the unique
        // index, so dedup is handled by a SQL-only partial index that the migration does not
        // ship into the test clone — out of scope for this round-trip proof.)
        const block = () =>
          store.blockRunWithWaitpointEdges({
            runId,
            waitpointIds: [w],
            projectId: env.project.id,
            batchIndex: 0,
          });

        await block();
        const afterFirst = await countEdges();
        const pendingBefore = await store.countPendingWaitpoints([w]);

        // Second call: ON CONFLICT DO NOTHING keeps it at exactly one of each.
        await block();
        const afterSecond = await countEdges();

        await store.updateManyWaitpoints({ where: { id: w }, data: { status: "COMPLETED" } });
        const pendingAfter = await store.countPendingWaitpoints([w]);

        return { afterFirst, afterSecond, pendingBefore, pendingAfter };
      };

      for (const variant of [
        {
          prisma: prisma14,
          schemaVariant: "legacy" as const,
          runId: `run_${CUID_25}`,
          suffix: "wb14",
        },
        {
          prisma: prisma17,
          schemaVariant: "dedicated" as const,
          runId: `run_${KSUID_27}`,
          suffix: "wb17",
        },
      ]) {
        const r = await run(variant.prisma, variant.schemaVariant, variant.runId, variant.suffix);
        expect(r.afterFirst).toEqual({ trw: 1, conn: 1 });
        expect(r.afterSecond).toEqual({ trw: 1, conn: 1 });
        expect(r.pendingBefore).toBe(1);
        expect(r.pendingAfter).toBe(0);
      }
    }
  );

  // a small V2 dependency subgraph (parent → child blocked on a RUN-type waitpoint completed by
  // the child) traversed via the store reads produces an identically ordered closure id sequence on
  // #legacy and #new. The load-bearing assertion is ordering parity; the order step is pinned to the
  // shared ICU collation (`und-x-icu`, present on both containers).
  heteroRunOpsPostgresTest(
    "V2 dependency closure ordering is identical across #legacy and #new",
    async ({ prisma14, prisma17 }) => {
      const buildClosure = async (
        prisma: AnyClient,
        schemaVariant: RunStoreSchemaVariant,
        suffix: string
      ) => {
        const store = makeStore(prisma, schemaVariant);
        const env = await seedEnvironment(prisma, schemaVariant, suffix);

        const parentId = "run_parent";
        const childId = "run_child";
        await store.createRun(
          buildCreateRunInput({
            runId: parentId,
            friendlyId: `run_parent_friendly_${suffix}`,
            taskIdentifier: "parent-task",
            organizationId: env.organization.id,
            projectId: env.project.id,
            runtimeEnvironmentId: env.environment.id,
          })
        );
        await store.createRun(
          buildCreateRunInput({
            runId: childId,
            friendlyId: `run_child_friendly_${suffix}`,
            taskIdentifier: "child-task",
            organizationId: env.organization.id,
            projectId: env.project.id,
            runtimeEnvironmentId: env.environment.id,
            parentTaskRunId: parentId,
            rootTaskRunId: parentId,
          })
        );

        // A RUN-type waitpoint completed by the child, blocking the parent. The id is
        // version-independent (each DB clone is isolated) so the closure id sequence is
        // directly comparable across the two stores — the friendlyId carries the per-DB suffix
        // to satisfy its global-unique constraint.
        const w = "wp_run_closure";
        await seedPendingWaitpoint(prisma, {
          id: w,
          friendlyId: `waitpoint_run_${suffix}`,
          projectId: env.project.id,
          environmentId: env.environment.id,
          type: "RUN",
          completedByTaskRunId: childId,
        });
        await store.blockRunWithWaitpointEdges({
          runId: parentId,
          waitpointIds: [w],
          projectId: env.project.id,
        });

        // Traverse: parent → its blocking edges → the blocking waitpoints → the run that
        // completes each. Order the closure with explicit COLLATE on the text id step.
        const edges = await store.findManyTaskRunWaitpoints({ where: { taskRunId: parentId } });
        const orderedWaitpointIds = (
          await (prisma as PrismaClient).$queryRawUnsafe<{ id: string }[]>(
            `SELECT "id" FROM "Waitpoint" WHERE "id" IN (${edges
              .map((e) => `'${e.waitpointId}'`)
              .join(",")}) ORDER BY "id" COLLATE "${HETERO_PINNED_ICU_COLLATION}" ASC`
          )
        ).map((r) => r.id);
        const waitpoints = await store.findManyWaitpoints({
          where: { id: { in: orderedWaitpointIds } },
        });
        const completingRunIds = waitpoints
          .map((wp) => wp.completedByTaskRunId)
          .filter((id): id is string => Boolean(id));
        const completingRuns = await store.findRuns({
          where: { id: { in: completingRunIds } },
          orderBy: { id: "asc" },
        });

        return [parentId, ...orderedWaitpointIds, ...completingRuns.map((r) => r.id)];
      };

      const closure14 = await buildClosure(prisma14, "legacy", "wc14");
      const closure17 = await buildClosure(prisma17, "dedicated", "wc17");

      expect(closure14).toEqual(closure17);
      expect(closure14).toEqual(["run_parent", "wp_run_closure", "run_child"]);
    }
  );

  // single-DB passthrough — both router stores are the same #legacy store over one client. A
  // snapshot create + waitpoint block + complete via the router round-trips on that client and never
  // touches the dedicated #new DB (prisma17, the SUBSET schema).
  heteroRunOpsPostgresTest(
    "single-DB binds one client for run-ops (passthrough)",
    async ({ prisma14, prisma17 }) => {
      const store = makeStore(prisma14, "legacy");
      const router = new RoutingRunStore({ new: store, legacy: store });

      const env = await seedEnvironment(prisma14, "legacy", "wd14");

      // KSUID_27-length id → NEW residency, exercising the route; both slots are the same store so
      // it still lands on prisma14.
      const runId = `run_${KSUID_27}`;
      await router.createRun(
        buildCreateRunInput({
          runId,
          friendlyId: "run_passthrough_wd",
          taskIdentifier: "passthrough-task",
          organizationId: env.organization.id,
          projectId: env.project.id,
          runtimeEnvironmentId: env.environment.id,
        })
      );

      const w = "wp_passthrough_wd";
      await seedPendingWaitpoint(prisma14, {
        id: w,
        friendlyId: "waitpoint_passthrough_wd",
        projectId: env.project.id,
        environmentId: env.environment.id,
      });

      const snapshot = await router.createExecutionSnapshot({
        run: { id: runId, status: "EXECUTING", attemptNumber: 1 },
        snapshot: { executionStatus: "EXECUTING_WITH_WAITPOINTS", description: "passthrough" },
        completedWaitpoints: [{ id: w, index: 0 }],
        environmentId: env.environment.id,
        environmentType: "DEVELOPMENT",
        projectId: env.project.id,
        organizationId: env.organization.id,
      });
      await router.blockRunWithWaitpointEdges({
        runId,
        waitpointIds: [w],
        projectId: env.project.id,
      });
      await router.updateManyWaitpoints({ where: { id: w }, data: { status: "COMPLETED" } });

      const latest = await router.findLatestExecutionSnapshot(runId);
      expect(latest?.id).toBe(snapshot.id);
      const joinIds = await router.findSnapshotCompletedWaitpointIds(snapshot.id);
      expect(joinIds).toEqual([w]);
      expect(await router.countPendingWaitpoints([w])).toBe(0);

      // Everything landed on the one #legacy client; the dedicated #new DB was never touched.
      expect(await prisma14.taskRun.findUnique({ where: { id: runId } })).not.toBeNull();
      expect(await prisma17.taskRun.findUnique({ where: { id: runId } })).toBeNull();
      expect(await prisma17.waitpoint.findUnique({ where: { id: w } })).toBeNull();
    }
  );

  // the silent-hang case, against the REAL split. A NEW (ksuid) run is blocked on
  // a LEGACY (cuid) token, so its block edge lives on #new (co-located with the run) while the token's
  // id-shape says LEGACY. Completing that token must FAN OUT the waitpointId edge read across both DBs
  // and find the edge on #new — routing by the token's id-shape (LEGACY) returns zero edges and the
  // run hangs forever. The token is mirrored onto both DBs (the drain window), so #resolveWaitpointStore
  // would resolve it to LEGACY and miss the NEW edge without the fan-out.
  heteroRunOpsPostgresTest(
    "completing a LEGACY token finds a NEW run's edge across both DBs (no silent hang)",
    async ({ prisma14, prisma17 }) => {
      const newStore = makeStore(prisma17, "dedicated");
      const legacyStore = makeStore(prisma14, "legacy");
      const router = new RoutingRunStore({ new: newStore, legacy: legacyStore });

      // The NEW run + its (synthetic) env live on the dedicated #new subset (prisma17).
      const env17 = await seedEnvironment(prisma17, "dedicated", "we17");
      const runId = `run_${KSUID_27}`; // ksuid → NEW residency
      await router.createRun(
        buildCreateRunInput({
          runId,
          friendlyId: "run_friendly_we",
          taskIdentifier: "my-task",
          organizationId: env17.organization.id,
          projectId: env17.project.id,
          runtimeEnvironmentId: env17.environment.id,
        })
      );

      // A LEGACY (cuid) token, mirrored onto BOTH DBs as during drain. The edge can only be
      // written on #new (the run's DB) because the dedicated block insert sources the edge rows
      // from the waitpointId array directly (FK-free).
      const token = "w".repeat(25); // cuid-length → LEGACY id-shape
      const env14 = await seedEnvironment(prisma14, "legacy", "we14");
      await seedPendingWaitpoint(prisma14, {
        id: token,
        friendlyId: "waitpoint_we_legacy",
        projectId: env14.project.id,
        environmentId: env14.environment.id,
      });
      await seedPendingWaitpoint(prisma17, {
        id: token,
        friendlyId: "waitpoint_we_new",
        projectId: env17.project.id,
        environmentId: env17.environment.id,
      });

      // The edge is written on #new only (co-located with the run).
      await newStore.blockRunWithWaitpointEdges({
        runId,
        waitpointIds: [token],
        projectId: env17.project.id,
      });
      expect(await prisma14.taskRunWaitpoint.count({ where: { waitpointId: token } })).toBe(0);
      expect(await prisma17.taskRunWaitpoint.count({ where: { waitpointId: token } })).toBe(1);

      // The completion fan-out (the read completeWaitpoint uses) must find the NEW-DB edge even
      // though the token classifies LEGACY. Pre-fix this returned [] (LEGACY-only) → silent hang.
      const affected = await router.findManyTaskRunWaitpoints({
        where: { waitpointId: token },
        select: { taskRunId: true },
      });
      expect(affected.map((e) => e.taskRunId)).toEqual([runId]);
    }
  );

  // replay / partial-completion safety, against the REAL split. There is NO cross-DB
  // transaction, so a completion can flip the token on one DB while the edge-clear lands on the other
  // (or a job is retried). The unblock recomputes the blocked set from the surviving edges and the
  // edge delete is keyed by (taskRunId, edge ids) — never a blind decrement — so running the
  // read+delete TWICE must not double-count or strand the run: after the first clear there are zero
  // edges, and the second pass is a no-op.
  heteroRunOpsPostgresTest(
    "replaying the unblock clear is idempotent (no double-decrement, no strand)",
    async ({ prisma14, prisma17 }) => {
      const newStore = makeStore(prisma17, "dedicated");
      const legacyStore = makeStore(prisma14, "legacy");
      const router = new RoutingRunStore({ new: newStore, legacy: legacyStore });

      const env17 = await seedEnvironment(prisma17, "dedicated", "wf17");
      const runId = `run_${KSUID_27}`;
      await router.createRun(
        buildCreateRunInput({
          runId,
          friendlyId: "run_friendly_wf",
          taskIdentifier: "my-task",
          organizationId: env17.organization.id,
          projectId: env17.project.id,
          runtimeEnvironmentId: env17.environment.id,
        })
      );

      const token = "x".repeat(25); // LEGACY id-shape, edge co-located on #new
      await seedPendingWaitpoint(prisma17, {
        id: token,
        friendlyId: "waitpoint_wf_new",
        projectId: env17.project.id,
        environmentId: env17.environment.id,
      });
      await newStore.blockRunWithWaitpointEdges({
        runId,
        waitpointIds: [token],
        projectId: env17.project.id,
      });
      await router.updateManyWaitpoints({ where: { id: token }, data: { status: "COMPLETED" } });

      // Drive the continueRunIfUnblocked read+delete shape (by taskRunId) twice.
      const unblockPass = async () => {
        const edges = await router.findManyTaskRunWaitpoints({
          where: { taskRunId: runId },
          select: { id: true, waitpoint: { select: { status: true } } },
        });
        const stillBlocked = edges.some((e) => e.waitpoint.status !== "COMPLETED");
        if (!stillBlocked && edges.length > 0) {
          await router.deleteManyTaskRunWaitpoints({
            where: { taskRunId: runId, id: { in: edges.map((e) => e.id) } },
          });
        }
        return { edgeCount: edges.length, stillBlocked };
      };

      const first = await unblockPass();
      const second = await unblockPass();

      expect(first).toEqual({ edgeCount: 1, stillBlocked: false }); // found + cleared
      expect(second).toEqual({ edgeCount: 0, stillBlocked: false }); // replay is a no-op
      // Edge gone from both DBs; the run is unblocked exactly once, not double-processed.
      expect(await prisma17.taskRunWaitpoint.count({ where: { taskRunId: runId } })).toBe(0);
      expect(await prisma14.taskRunWaitpoint.count({ where: { taskRunId: runId } })).toBe(0);
    }
  );
});

// diff --git a/internal-packages/run-store/src/types.ts b/internal-packages/run-store/src/types.ts index 98eb3bb9e1a..32e5644114f 100644 --- a/internal-packages/run-store/src/types.ts +++ b/internal-packages/run-store/src/types.ts @@ -1,4 +1,6 @@ import type { + BatchTaskRun, + BatchTaskRunItemStatus, Prisma, PrismaClientOrTransaction, PrismaReplicaClient, @@ -9,6 +11,7 @@ import type { Waitpoint, } from "@trigger.dev/database"; import type { TaskRunError } from "@trigger.dev/core/v3/schemas"; +import type { Residency } from "@trigger.dev/core/v3/isomorphic"; /** * Client accepted by the read methods. Reads route through the replica by @@ -241,7 +244,84 @@ export type ClearIdempotencyKeyInput = export type TaskRunWithWaitpoint = TaskRun & { associatedWaitpoint: Waitpoint | null };
/**
 * Structured input for {@link RunStore.createExecutionSnapshot}.
The store derives the + * `completedWaitpoints.connect` / `completedWaitpointOrder` / `isValid` fields from this + * input — callers pass the high-level shape, not a raw Prisma `data`/`include`. + */ +export type CreateExecutionSnapshotInput = { + run: { id: string; status: TaskRunStatus; attemptNumber?: number | null }; + snapshot: { + executionStatus: TaskRunExecutionStatus; + description: string; + metadata?: Prisma.JsonValue; + }; + previousSnapshotId?: string; + batchId?: string; + environmentId: string; + environmentType: RuntimeEnvironmentType; + projectId: string; + organizationId: string; + checkpointId?: string; + workerId?: string; + runnerId?: string; + completedWaitpoints?: { id: string; index?: number }[]; + error?: string; +}; + +// Create payload for `createBatchTaskRun`: scalar `runtimeEnvironmentId` (the FK is +// dropped for cross-DB residency; env existence is validated app-side at create). +export type CreateBatchTaskRunData = Prisma.BatchTaskRunUncheckedCreateInput; + +/** + * Mirror of the webapp's `UnblockRouteKind`. The engine/run-store cannot import the + * webapp types, so this union is kept IDENTICAL (members + field names) to + * `apps/webapp/app/v3/runOpsMigration/types.ts` so the two cannot drift conceptually. + */ +export type WaitpointUnblockRouteKind = + | "MANUAL" + | "DATETIME" + | "RESUME_TOKEN" + | "IDEMPOTENCY_REUSE" + | "RUN"; + +/** + * Pinning context for {@link RunStore.forWaitpointCompletion}. Mirrors the webapp's + * waitpoint-completion pinning input shape. + */ +export interface ForWaitpointCompletionContext { + routeKind: WaitpointUnblockRouteKind; + treeOwnerResidency?: Residency; + isCrossTreeIdempotency?: boolean; + hasLegacyParent?: boolean; +} + +/** + * Co-location hint for the waitpoint write/lookup methods. 
A DATETIME/MANUAL wait waitpoint's + * minted id is always a cuid, so id-shape routing always sends it to LEGACY; when `coLocateWithRunId` + * is set the router routes by the OWNING RUN's id instead, landing the waitpoint on the run's DB so + * the block edge's local `Waitpoint` join resolves. Single-store implementations ignore it. + */ +export interface WaitpointColocationOptions { + coLocateWithRunId?: string; +} + export interface RunStore { + /** + * Run a co-resident multi-write unit atomically on the store that OWNS `runId`. The callback gets + * the owning `RunStore` plus a `tx` opened on THAT store's OWN client; passing `tx` to the inner + * writes lands them all in ONE transaction on the owning DB (NEW for a ksuid run, LEGACY for a cuid + * run), so a failure between two writes rolls BOTH back. NOT a cross-DB transaction: `tx` is the + * owning store's own client (never the control-plane tx), and every write MUST target the same run / + * its co-resident subgraph. Callers MUST use the supplied `store` + `tx`, not the outer router + * (which would re-route and drop the tx). Single-store impls run `fn(this, tx)` in their own + * `$transaction`. 
+ */ + runInTransaction( + runId: string | undefined, + fn: (store: RunStore, tx: PrismaClientOrTransaction) => Promise + ): Promise; + // Create createRun(params: CreateRunInput, tx?: PrismaClientOrTransaction): Promise; createCancelledRun( @@ -273,12 +353,12 @@ export interface RunStore { args: { select: S }, tx?: PrismaClientOrTransaction ): Promise>; - recordRetryOutcome( + recordRetryOutcome( runId: string, data: { machinePreset?: string; usageDurationMs: number; costInCents: number }, - args: { include: I }, + args: { select: S }, tx?: PrismaClientOrTransaction - ): Promise>; + ): Promise>; requeueRun( runId: string, args: { select: S }, @@ -337,7 +417,7 @@ export interface RunStore { runId: string, data: LockRunData, tx?: PrismaClientOrTransaction - ): Promise>; + ): Promise>; parkPendingVersion( runId: string, data: { statusReason: string }, @@ -429,6 +509,30 @@ export interface RunStore { ): Promise>; findRunOrThrow(where: Prisma.TaskRunWhereInput, client?: ReadClient): Promise; + // Read-after-write on the OWNING store's primary (writer), never the replica — for re-reading a + // run just written in this request, where replica lag would cause a false miss (mirrors + // findWaitpointOnPrimary). The routing store dispatches here per owning store so each reads its + // own writer, never leaking a control-plane client into another DB. 
+ findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { select: S } + ): Promise | null>; + findRunOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { include: I } + ): Promise | null>; + findRunOnPrimary(where: Prisma.TaskRunWhereInput): Promise; + + findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { select: S } + ): Promise>; + findRunOrThrowOnPrimary( + where: Prisma.TaskRunWhereInput, + args: { include: I } + ): Promise>; + findRunOrThrowOnPrimary(where: Prisma.TaskRunWhereInput): Promise; + findRuns( args: { where: Prisma.TaskRunWhereInput; @@ -461,4 +565,179 @@ export interface RunStore { }, client?: ReadClient ): Promise; + + // --- run-ops persistence --- + // Snapshots, waitpoints, implicit M:N joins, dependents, attempts and checkpoints. The + // generic model wrappers are thin generics over the Prisma `*Args` types so include/select + // payload typing survives at the call site; the snapshot DTO builder and the two raw-SQL + // waitpoint methods keep their hand-written shapes. + + // Batch membership + createBatchTaskRunItem( + data: { batchTaskRunId: string; taskRunId: string; status: BatchTaskRunItemStatus }, + tx?: PrismaClientOrTransaction + ): Promise; + + // Snapshot group + findLatestExecutionSnapshot( + runId: string, + client?: ReadClient + ): Promise | null>; + findExecutionSnapshot( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise | null>; + findManyExecutionSnapshots( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]>; + createExecutionSnapshot( + input: CreateExecutionSnapshotInput, + tx?: PrismaClientOrTransaction + ): Promise>; + + // Implicit-join group + findSnapshotCompletedWaitpointIds(snapshotId: string, client?: ReadClient): Promise; + /** Run ids connected to a waitpoint (WaitpointRunConnection / `_WaitpointRunConnections`), this DB only. 
*/ + findWaitpointConnectedRunIds(waitpointId: string, client?: ReadClient): Promise; + /** Snapshot ids that completed a waitpoint (CompletedWaitpoint / `_completedWaitpoints`), this DB only. */ + findWaitpointCompletedSnapshotIds(waitpointId: string, client?: ReadClient): Promise; + blockRunWithWaitpointEdges(params: { + runId: string; + waitpointIds: string[]; + projectId: string; + spanIdToComplete?: string; + batchId?: string; + batchIndex?: number; + tx?: PrismaClientOrTransaction; + }): Promise; + countPendingWaitpoints(waitpointIds: string[], client?: ReadClient): Promise; + + // Waitpoint group + createWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction, + opts?: WaitpointColocationOptions + ): Promise>; + upsertWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction, + opts?: WaitpointColocationOptions + ): Promise>; + findWaitpoint( + args: Prisma.SelectSubset, + client?: ReadClient, + opts?: WaitpointColocationOptions + ): Promise | null>; + // Read-after-write on the owning store's primary (never the replica) — for re-reading a + // waitpoint just written on the unblock path, where replica lag would cause a false miss. + findWaitpointOnPrimary( + args: Prisma.SelectSubset + ): Promise | null>; + findManyWaitpoints( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]>; + updateWaitpoint( + args: Prisma.SelectSubset, + tx?: PrismaClientOrTransaction, + opts?: WaitpointColocationOptions + ): Promise>; + updateManyWaitpoints( + args: Prisma.WaitpointUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise; + + /** + * Select the run-ops store that OWNS a waitpoint completion, by waitpointId + * residency. completeWaitpoint arrives with only (waitpointId, output) — no run + * id — so selection is by the waitpoint's own residency, with the documented + * pins to legacy. Returns the store HANDLE to apply the completion on. + * Single-store implementations return `this`. 
Throws UnclassifiableRunId on an + * ambiguous id in split mode (the engine rethrows it as UnclassifiableWaitpointId). + */ + forWaitpointCompletion( + waitpointId: string, + context: ForWaitpointCompletionContext + ): Promise; + + // TaskRunWaitpoint group + findManyTaskRunWaitpoints( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise[]>; + deleteManyTaskRunWaitpoints( + args: Prisma.TaskRunWaitpointDeleteManyArgs, + tx?: PrismaClientOrTransaction + ): Promise; + + // Attempt-model group (TaskRunAttempt, V1-residual) + findTaskRunAttempt( + args: Prisma.SelectSubset, + client?: ReadClient + ): Promise | null>; + + // Checkpoint family. `ownerRunId` is the run whose snapshot references this checkpoint via the + // kept `TaskRunExecutionSnapshot.checkpointId` FK — the routing store co-locates the checkpoint + // with that run so the snapshot insert can satisfy the FK on the same DB. The checkpoint + // row itself carries no runId scalar, so the owning run id must be threaded explicitly. + createTaskRunCheckpoint( + args: Prisma.SelectSubset, + ownerRunId?: string, + tx?: PrismaClientOrTransaction + ): Promise>; + + // --- BatchTaskRun (run-ops) --- + // Batch row is born on the run-ops store at create. `findBatchTaskRunById` + // reads the primary by default (worker reads the just-written row; replica lag). 
+ createBatchTaskRun( + data: CreateBatchTaskRunData, + tx?: PrismaClientOrTransaction + ): Promise; + updateBatchTaskRun( + args: { + where: Prisma.BatchTaskRunWhereUniqueInput; + data: Prisma.BatchTaskRunUpdateInput; + select: S; + }, + tx?: PrismaClientOrTransaction + ): Promise>; + findBatchTaskRunById( + id: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null>; + findBatchTaskRunByFriendlyId( + friendlyId: string, + environmentId: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null>; + + // --- BatchTaskRun (run-ops) — batch residency additions --- + // The idempotency probe is keyed by (environmentId, idempotencyKey) — no classifiable + // batch id — so the router fans out NEW→LEGACY (mirrors `findBatchTaskRunByFriendlyId`). + findBatchTaskRunByIdempotencyKey( + environmentId: string, + idempotencyKey: string, + args?: { include?: T }, + client?: ReadClient + ): Promise | null>; + // updateMany of batch rows: route by `where.id` when scalar, else fan-out + sum counts. + updateManyBatchTaskRun( + args: Prisma.BatchTaskRunUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise; + // Count batch items by `batchTaskRunId` (items co-reside with the batch). + countBatchTaskRunItems( + where: { batchTaskRunId: string; status?: BatchTaskRunItemStatus }, + client?: ReadClient + ): Promise; + // updateMany of batch items: route by `where.id`/`where.batchTaskRunId`, else fan-out + sum. + updateManyBatchTaskRunItems( + args: Prisma.BatchTaskRunItemUpdateManyArgs, + tx?: PrismaClientOrTransaction + ): Promise; } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index dedee0097d9..ed8e9c6fdc8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1397,6 +1397,9 @@ importers: internal-packages/run-store: dependencies: + '@internal/run-ops-database': + specifier: workspace:* + version: link:../run-ops-database '@trigger.dev/core': specifier: workspace:* version: link:../../packages/core