diff --git a/.server-changes/run-ops-split-webapp-foundation.md b/.server-changes/run-ops-split-webapp-foundation.md new file mode 100644 index 00000000000..d1d8111b68e --- /dev/null +++ b/.server-changes/run-ops-split-webapp-foundation.md @@ -0,0 +1,5 @@ +--- +area: webapp +type: feature +--- +Add the run-ops database split webapp foundation — DB topology/flag wiring, split-mode gating, distinct-DB and native-realtime boot interlocks, and a cache-first control-plane resolver with cache invalidation on env/org writes — all inert until the split is enabled. diff --git a/apps/webapp/CLAUDE.md b/apps/webapp/CLAUDE.md index a4de6ab57b7..68efaffd41e 100644 --- a/apps/webapp/CLAUDE.md +++ b/apps/webapp/CLAUDE.md @@ -75,8 +75,8 @@ const signal = getRequestAbortSignal(); Access via `env` export from `app/env.server.ts`. **Never use `process.env` directly.** For testable code, **never import env.server.ts** in test files. Pass configuration as options instead: -- `realtimeClient.server.ts` (testable service, takes config as constructor arg) -- `realtimeClientGlobal.server.ts` (creates singleton with env config) +- `realtime/nativeRealtimeClient.server.ts` (testable service, takes config as constructor arg) +- `realtime/nativeRealtimeClientInstance.server.ts` (creates singleton with env config) ## Run Engine 2.0 diff --git a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts index 7de475bda8e..7f137c61d0c 100644 --- a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts +++ b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts @@ -2,6 +2,7 @@ import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import { type Duration } from "~/services/rateLimiter.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { 
API_RATE_LIMIT_INTENT } from "./ApiRateLimitSection"; import { handleRateLimitAction, @@ -31,6 +32,8 @@ export const apiRateLimitDomain: RateLimitDomain = { where: { id: orgId }, data: { apiRateLimiterConfig: next as any }, }); + // apiRateLimiterConfig is embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(orgId); logger.info("admin.backOffice.apiRateLimit", { adminUserId, orgId, diff --git a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts index 3891e4fc40c..4614c5b2893 100644 --- a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts +++ b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts @@ -2,6 +2,7 @@ import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import { type Duration } from "~/services/rateLimiter.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { BATCH_RATE_LIMIT_INTENT } from "./BatchRateLimitSection"; import { handleRateLimitAction, @@ -31,6 +32,8 @@ export const batchRateLimitDomain: RateLimitDomain = { where: { id: orgId }, data: { batchRateLimitConfig: next as any }, }); + // batchRateLimitConfig is embedded in every env of the org; drop all its cached env rows. 
+ controlPlaneResolver.invalidateOrganization(orgId); logger.info("admin.backOffice.batchRateLimit", { adminUserId, orgId, diff --git a/apps/webapp/app/db.server.ts b/apps/webapp/app/db.server.ts index 09c30b92568..8fc033dbf03 100644 --- a/apps/webapp/app/db.server.ts +++ b/apps/webapp/app/db.server.ts @@ -7,6 +7,7 @@ import { type PrismaTransactionClient, type PrismaTransactionOptions, } from "@trigger.dev/database"; +import { RunOpsPrismaClient } from "@internal/run-ops-database"; import invariant from "tiny-invariant"; import { z } from "zod"; import { env } from "./env.server"; @@ -18,6 +19,11 @@ import { logTransactionInfrastructureError, } from "./utils/prismaErrors"; import { singleton } from "./utils/singleton"; +import { + isSplitEnabled, + assertSplitRealtimeInterlock, +} from "./v3/runOpsMigration/splitMode.server"; +import { computeRunOpsSplitReadEnabled } from "./v3/runOpsMigration/runOpsSplitReadGate"; import { DATASOURCE_CONTEXT_KEY, startActiveSpan } from "./v3/tracer.server"; import type { Span } from "@opentelemetry/api"; import { context, trace } from "@opentelemetry/api"; @@ -130,6 +136,34 @@ function tagDatasource(datasource: "writer" | "replica", }) as unknown as T; } +// Same extension as tagDatasource but typed for RunOpsPrismaClient (different +// generated package — does not extend @trigger.dev/database.PrismaClient). 
+function tagDatasourceRunOps( + datasource: "writer" | "replica", + client: RunOpsPrismaClient +): RunOpsPrismaClient { + return client.$extends({ + name: "datasource-tagger", + query: { + $allOperations: ({ query, args }) => { + trace.getActiveSpan()?.setAttribute("db.datasource", datasource); + return context.with( + context.active().setValue(DATASOURCE_CONTEXT_KEY, datasource), + async () => await query(args) + ); + }, + }, + }) as unknown as RunOpsPrismaClient; +} + +// Same wrapper as captureInfrastructureErrors, bridged via double cast because +// that helper is constrained to T extends @trigger.dev/database.PrismaClient. +function captureInfraErrorsRunOps(client: RunOpsPrismaClient): RunOpsPrismaClient { + return captureInfrastructureErrors( + client as unknown as PrismaClient + ) as unknown as RunOpsPrismaClient; +} + export const prisma = singleton("prisma", () => captureInfrastructureErrors(tagDatasource("writer", getClient())) ); @@ -139,11 +173,156 @@ export const $replica: PrismaReplicaClient = singleton("replica", () => { return replica ? captureInfrastructureErrors(tagDatasource("replica", replica)) : prisma; }); +// Writer/replica pair typed as PrismaClient / PrismaReplicaClient. +export type RunOpsClients = { writer: PrismaClient; replica: PrismaReplicaClient }; +// Writer/replica pair typed as RunOpsPrismaClient. +export type NewRunOpsClients = { writer: RunOpsPrismaClient; replica: RunOpsPrismaClient }; +// The three client pairs returned by selectRunOpsTopology: new run-ops, legacy run-ops, control plane. +export type RunOpsTopology = { + newRunOps: NewRunOpsClients; + legacyRunOps: RunOpsClients; + controlPlane: RunOpsClients; +}; +// Plain-data input to selectRunOpsTopology: the split decision plus the candidate URLs. +export type SelectRunOpsTopologyConfig = { + splitEnabled: boolean; + legacyUrl?: string; + newUrl?: string; + newReplicaUrl?: string; +}; +// The existing control-plane clients plus factories for the new run-ops writer/replica; each factory receives (url, clientType). +export type RunOpsClientBuilders = { + controlPlane: RunOpsClients; + buildNewWriter: (url: string, clientType: string) => RunOpsPrismaClient; + buildNewReplica: (url: string, clientType: string) => RunOpsPrismaClient; +}; + +// Pure run-ops client selector. No env, no isSplitEnabled() — those +// belong in the env-bound singleton (see runOpsTopology below).
The builder +// callbacks are the only side-effecting boundary, so split-OFF (the default) +// calls NEITHER and opens no second connection. +export function selectRunOpsTopology( + config: SelectRunOpsTopologyConfig, + builders: RunOpsClientBuilders +): RunOpsTopology { + const { controlPlane } = builders; + + // Single-DB fallback: reuse the control-plane clients by reference; the casts only retype them. + const cpFallback: NewRunOpsClients = { + writer: controlPlane.writer as unknown as RunOpsPrismaClient, + replica: controlPlane.replica as unknown as RunOpsPrismaClient, + }; + + if (!config.splitEnabled) { + return { newRunOps: cpFallback, legacyRunOps: controlPlane, controlPlane }; + } + + // Defensive: splitEnabled without both URLs still falls back to the control-plane clients. + if (!config.legacyUrl || !config.newUrl) { + return { newRunOps: cpFallback, legacyRunOps: controlPlane, controlPlane }; + } + + // Legacy run-ops always reuses the control-plane clients; legacyUrl is only checked above and is never connected to here. + const legacyRunOps = controlPlane; + + const newWriter = builders.buildNewWriter(config.newUrl, "run-ops-new-writer"); + // Without a dedicated replica URL, the new writer doubles as the replica. + const newReplica: RunOpsPrismaClient = config.newReplicaUrl + ? builders.buildNewReplica(config.newReplicaUrl, "run-ops-new-reader") + : newWriter; + + return { + newRunOps: { writer: newWriter, replica: newReplica }, + legacyRunOps, + controlPlane, + }; +} + +// The env-bound run-ops topology singleton. The split decision uses +// a cheap synchronous env predicate (governs whether a second pool is opened); +// the async distinct-DB sentinel is enforced separately at boot via +// assertRunOpsSplitSentinel(). Because the builder callbacks only run when +// splitEnabled is true, single-DB reuses prisma/$replica by reference and opens +// nothing new. The builders apply the SAME wrapper pair the control-plane +// singletons use (captureInfrastructureErrors(tagDatasource(role, raw))). +const runOpsTopology: RunOpsTopology = singleton("runOpsTopology", () => { + const newUrl = env.TASK_RUN_DATABASE_URL; + // Gate on the opt-in flag too: the distinct-DB sentinel only runs when the flag is on.
+ const splitEnabled = env.RUN_OPS_SPLIT_ENABLED && !!newUrl && !!env.TASK_RUN_LEGACY_DATABASE_URL; + + return selectRunOpsTopology( + { + splitEnabled, + legacyUrl: env.TASK_RUN_LEGACY_DATABASE_URL, + newUrl, + newReplicaUrl: env.TASK_RUN_DATABASE_READ_REPLICA_URL, + }, + { + controlPlane: { writer: prisma, replica: $replica }, + buildNewWriter: (url, clientType) => + captureInfraErrorsRunOps( + tagDatasourceRunOps("writer", buildRunOpsWriterClient({ url, clientType })) + ), + buildNewReplica: (url, clientType) => + captureInfraErrorsRunOps( + tagDatasourceRunOps("replica", buildRunOpsReplicaClient({ url, clientType })) + ), + } + ); +}); + +// Typed as RunOpsPrismaClient for the run-store boundary. +export const runOpsNewPrismaClient: RunOpsPrismaClient = runOpsTopology.newRunOps.writer; +export const runOpsNewReplicaClient: RunOpsPrismaClient = runOpsTopology.newRunOps.replica; +// Legacy-typed aliases kept for the remaining consumers that still expect PrismaClient / +// PrismaReplicaClient (idempotency residency, read-through, handlers, cascade cleanup). +export const runOpsNewPrisma: PrismaClient = runOpsTopology.newRunOps + .writer as unknown as PrismaClient; +export const runOpsNewReplica: PrismaReplicaClient = runOpsTopology.newRunOps + .replica as unknown as PrismaReplicaClient; +export const runOpsLegacyPrisma: PrismaClient = runOpsTopology.legacyRunOps.writer; +export const runOpsLegacyReplica: PrismaReplicaClient = runOpsTopology.legacyRunOps.replica; + +export const runOpsSplitReadEnabled: boolean = computeRunOpsSplitReadEnabled({ + newReplica: runOpsNewReplicaClient, + controlPlaneWriter: prisma, + controlPlaneReplica: $replica, + hasNewUrl: !!env.TASK_RUN_DATABASE_URL, + hasLegacyUrl: !!env.TASK_RUN_LEGACY_DATABASE_URL, +}); + +// Boot-time interlock: if the flag is on but the distinct-DB sentinel does not +// confirm two physically-distinct run-ops DBs, refuse to enable split (data-loss +// interlock).
Async, so it cannot live in the synchronous singleton factory — +// call it from the eager-boot path before any run-ops routing is wired. +export async function assertRunOpsSplitSentinel(): Promise<void> { + if (!env.RUN_OPS_SPLIT_ENABLED) return; + // Realtime interlock (synchronous): Electric replicates only from the control-plane + // DB, so split-on without the native realtime backend leaves NEW-resident runs + // invisible and hangs every subscription. Fail fast before the async DB probe. + assertSplitRealtimeInterlock({ + splitEnabled: env.RUN_OPS_SPLIT_ENABLED, + nativeRealtimeEnabled: env.REALTIME_BACKEND_NATIVE_ENABLED === "1", + }); + const ok = await isSplitEnabled(); + if (!ok) { + throw new Error( + "RUN_OPS_SPLIT_ENABLED is on but the distinct-DB sentinel did not confirm two physically-distinct run-ops DBs; refusing to enable split (data-loss interlock)." + ); + } +} + function getClient() { - const { DATABASE_URL } = process.env; - invariant(typeof DATABASE_URL === "string", "DATABASE_URL env var not set"); + // Control-plane datasource repoint: prefer the dedicated control-plane DSN, falling back to + // DATABASE_URL so self-host / single-DB installs boot byte-identical when CONTROL_PLANE_DATABASE_URL is unset. + const url = env.CONTROL_PLANE_DATABASE_URL ?? env.DATABASE_URL; + invariant(typeof url === "string", "neither CONTROL_PLANE_DATABASE_URL nor DATABASE_URL is set"); + + return buildWriterClient({ url, clientType: "writer" }); +} - const databaseUrl = extendQueryParams(DATABASE_URL, { +// Generalized writer builder shared by the control-plane client and the run-ops +// clients. Returns a RAW, untagged, un-wrapped PrismaClient — the +// caller applies tagDatasource + captureInfrastructureErrors.
+export function buildWriterClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): PrismaClient { + const databaseUrl = extendQueryParams(url, { connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), @@ -215,7 +394,7 @@ function getClient() { if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { client.$on("info", (log) => { logger.info("PrismaClient info", { - clientType: "writer", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -226,7 +405,7 @@ function getClient() { client.$on("warn", (log) => { logger.warn("PrismaClient warn", { - clientType: "writer", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -237,7 +416,7 @@ function getClient() { client.$on("error", (log) => { logger.error("PrismaClient error", { - clientType: "writer", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -268,12 +447,29 @@ function getClient() { } function getReplicaClient() { - if (!env.DATABASE_READ_REPLICA_URL) { + // Control-plane replica repoint: prefer the dedicated control-plane replica, falling back to + // DATABASE_READ_REPLICA_URL. `||` rather than `??`: the override is an unrefined z.string(), so an empty value must also fall back. Early-return undefined only when BOTH are unset or empty, so $replica keeps + // falling back to prisma exactly as today when no replica is configured. + const url = env.CONTROL_PLANE_DATABASE_READ_REPLICA_URL || env.DATABASE_READ_REPLICA_URL; + if (!url) { console.log(`🔌 No database replica, using the regular client`); return; } - const replicaUrl = extendQueryParams(env.DATABASE_READ_REPLICA_URL, { + return buildReplicaClient({ url, clientType: "reader" }); +} + +// Generalized replica builder shared by the control-plane replica and the run-ops +// replicas. Returns a RAW, untagged, un-wrapped PrismaClient — the +// caller applies tagDatasource + captureInfrastructureErrors.
+export function buildReplicaClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): PrismaClient { + const replicaUrl = extendQueryParams(url, { connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), @@ -345,7 +541,7 @@ function getReplicaClient() { if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { replicaClient.$on("info", (log) => { logger.info("PrismaClient info", { - clientType: "reader", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -356,7 +552,7 @@ function getReplicaClient() { replicaClient.$on("warn", (log) => { logger.warn("PrismaClient warn", { - clientType: "reader", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -367,7 +563,7 @@ function getReplicaClient() { replicaClient.$on("error", (log) => { logger.error("PrismaClient error", { - clientType: "reader", + clientType, event: { timestamp: log.timestamp, message: log.message, @@ -396,6 +592,108 @@ function getReplicaClient() { return replicaClient; } +function buildRunOpsWriterClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): RunOpsPrismaClient { + const databaseUrl = extendQueryParams(url, { + connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), + pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), + connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), + application_name: env.SERVICE_NAME, + }); + + console.log(`🔌 setting up run-ops prisma client to ${redactUrlSecrets(databaseUrl)}`); + + const client = new RunOpsPrismaClient({ + datasources: { db: { url: databaseUrl.href } }, + log: [ + { emit: "event", level: "error" }, + { emit: "event", level: "info" }, + { emit: "event", level: "warn" }, + ...((process.env.VERBOSE_PRISMA_LOGS === "1" || + process.env.VERY_SLOW_QUERY_THRESHOLD_MS !== undefined + ?
[{ emit: "event", level: "query" }] + : []) as { emit: "event"; level: "query" }[]), + ], + }); + + if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { + client.$on("info", (log) => logger.info("RunOpsPrismaClient info", { clientType, event: log })); + client.$on("warn", (log) => logger.warn("RunOpsPrismaClient warn", { clientType, event: log })); + client.$on("error", (log) => + logger.error("RunOpsPrismaClient error", { clientType, event: log, ignoreError: true }) + ); + } + + client.$on("query", (log) => queryPerformanceMonitor.onQuery("writer", log)); + + const connectPromise = client.$connect(); + if (env.NODE_ENV === "test") { + connectPromise.catch((error) => { + logger.warn("Failed to eagerly connect run-ops prisma client (writer)", { error }); + }); + } + + console.log(`🔌 run-ops prisma client connected`); + + return client; +} + +function buildRunOpsReplicaClient({ + url, + clientType, +}: { + url: string; + clientType: string; +}): RunOpsPrismaClient { + const replicaUrl = extendQueryParams(url, { + connection_limit: env.DATABASE_CONNECTION_LIMIT.toString(), + pool_timeout: env.DATABASE_POOL_TIMEOUT.toString(), + connection_timeout: env.DATABASE_CONNECTION_TIMEOUT.toString(), + application_name: env.SERVICE_NAME, + }); + + console.log(`🔌 setting up run-ops read replica connection to ${redactUrlSecrets(replicaUrl)}`); + + const client = new RunOpsPrismaClient({ + datasources: { db: { url: replicaUrl.href } }, + log: [ + { emit: "event", level: "error" }, + { emit: "event", level: "info" }, + { emit: "event", level: "warn" }, + ...((process.env.VERBOSE_PRISMA_LOGS === "1" || + process.env.VERY_SLOW_QUERY_THRESHOLD_MS !== undefined + ?
[{ emit: "event", level: "query" }] + : []) as { emit: "event"; level: "query" }[]), + ], + }); + + if (process.env.PRISMA_LOG_TO_STDOUT !== "1") { + client.$on("info", (log) => logger.info("RunOpsPrismaClient info", { clientType, event: log })); + client.$on("warn", (log) => logger.warn("RunOpsPrismaClient warn", { clientType, event: log })); + client.$on("error", (log) => + logger.error("RunOpsPrismaClient error", { clientType, event: log, ignoreError: true }) + ); + } + + client.$on("query", (log) => queryPerformanceMonitor.onQuery("replica", log)); + + const connectPromise = client.$connect(); + if (env.NODE_ENV === "test") { + connectPromise.catch((error) => { + logger.warn("Failed to eagerly connect run-ops prisma client (replica)", { error }); + }); + } + + console.log(`🔌 run-ops read replica connected`); + + return client; +} + function extendQueryParams(hrefOrUrl: string | URL, queryParams: Record<string, string>) { const url = new URL(hrefOrUrl); const query = url.searchParams; diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index ab1941ef0bb..091f2f28ccf 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -15,7 +15,7 @@ import { bootstrap } from "./bootstrap"; import { LocaleContextProvider } from "./components/primitives/LocaleProvider"; import type { OperatingSystemPlatform } from "./components/primitives/OperatingSystemProvider"; import { OperatingSystemContextProvider } from "./components/primitives/OperatingSystemProvider"; -import { Prisma } from "./db.server"; +import { assertRunOpsSplitSentinel, Prisma } from "./db.server"; import { env } from "./env.server"; import { eventLoopMonitor } from "./eventLoopMonitor.server"; import { logger } from "./services/logger.server"; @@ -271,6 +271,17 @@ process.on("uncaughtException", (error, origin) => { process.exit(1); }); +// Boot-time run-ops split interlock.
Async, so it runs as a +// fire-and-forget at startup; a flag-on-but-sentinel-fails misconfig is logged and then exits +// the process once the check settles. NOTE(review): the promise is not awaited, so the singletons registered below run before it resolves — confirm no run-ops routing can be exercised in that window. +singleton("AssertRunOpsSplitSentinel", () => { + assertRunOpsSplitSentinel().catch((error) => { + logger.error("Run-ops split sentinel assertion failed; refusing to start", { error }); + process.exit(1); + }); + return true; +}); + singleton("RunEngineEventBusHandlers", registerRunEngineEventBusHandlers); singleton("SetupBatchQueueCallbacks", setupBatchQueueCallbacks); // Attach the realtime run-changed publish delegations to the engine event bus. diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 367e9a3362d..906d3aa225b 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -97,8 +97,8 @@ const EnvironmentSchema = z DATABASE_CONNECTION_LIMIT: z.coerce.number().int().default(10), DATABASE_POOL_TIMEOUT: z.coerce.number().int().default(60), DATABASE_CONNECTION_TIMEOUT: z.coerce.number().int().default(20), - // Dashboard-agent conversation store. Cloud points this at the dedicated - // PlanetScale database; when unset it falls back to DATABASE_URL (OSS), where + // Dashboard-agent conversation store. Cloud points this at a dedicated + // database; when unset it falls back to DATABASE_URL (OSS), where // the tables live in the isolated `trigger_dashboard_agent` schema. DASHBOARD_AGENT_DATABASE_URL: z.string().optional(), // The secret key (tr_*) for the runtime environment the dashboard-agent task @@ -128,6 +128,55 @@ const EnvironmentSchema = z "DIRECT_URL is invalid, for details please check the additional output above this message." ), DATABASE_READ_REPLICA_URL: z.string().optional(), + // --- Run-ops DB split — Cloud-only scaling concern; OFF by default. --- + // Explicit positive opt-in. Split behavior is unreachable unless this is true + // AND the distinct-DB sentinel confirms the two URLs are physically distinct DBs.
+ RUN_OPS_SPLIT_ENABLED: BoolEnv.default(false), + // Datasource URL for the dedicated run-ops Prisma schema (migrations/generation). + // The webapp runtime pool is driven by TASK_RUN_DATABASE_URL, not this var. + RUN_OPS_DATABASE_URL: z + .string() + .refine(isValidDatabaseUrl, "RUN_OPS_DATABASE_URL is invalid") + .optional(), + // The NEW dedicated run-ops DB writer. Optional so single-DB installs never set it. + TASK_RUN_DATABASE_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_DATABASE_URL is invalid") + .optional(), + // The NEW run-ops DB unpooled/direct endpoint (Prisma migrate/introspection; + // connection poolers break advisory locks). Consumed by the migrations. + TASK_RUN_DATABASE_DIRECT_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_DATABASE_DIRECT_URL is invalid") + .optional(), + // The LEGACY run-ops DB (the control-plane DB during the transition). When unset, legacy + // run-ops reuses the existing DATABASE_URL (legacy run-ops == control-plane DB initially). + TASK_RUN_LEGACY_DATABASE_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_LEGACY_DATABASE_URL is invalid") + .optional(), + // The NEW dedicated run-ops DB read replica. Optional; self-host never sets it. + // Refined (unlike the unrefined control-plane DATABASE_READ_REPLICA_URL) so a malformed run-ops + // replica URL fails boot loudly rather than silently degrading — do not align it down to the CP shape. + TASK_RUN_DATABASE_READ_REPLICA_URL: z + .string() + .refine(isValidDatabaseUrl, "TASK_RUN_DATABASE_READ_REPLICA_URL is invalid") + .optional(), + // --- Control-plane datasource repoint. Additive-only. --- + // Optional control-plane DB. Unset (self-host/single-DB) -> getClient()/getReplicaClient() fall back to + // DATABASE_URL/DATABASE_READ_REPLICA_URL, so boot is byte-identical. When set, these point at the + // dedicated control-plane DSN; moving off the shared DB is an ops config change, not a code edit. 
+ CONTROL_PLANE_DATABASE_URL: z + .string() + .refine( + (v) => v === undefined || isValidDatabaseUrl(v), + "CONTROL_PLANE_DATABASE_URL is invalid" + ) + .optional(), + CONTROL_PLANE_DATABASE_READ_REPLICA_URL: z.string().optional(), + // Control-plane cache relax knobs. Unset -> defaults (DEFAULT_CP_CACHE_TTL_MS / _MAX_ENTRIES). + CONTROL_PLANE_CACHE_TTL_MS: z.coerce.number().int().optional(), + CONTROL_PLANE_CACHE_MAX_ENTRIES: z.coerce.number().int().optional(), SESSION_SECRET: z.string(), MAGIC_LINK_SECRET: z.string(), ENCRYPTION_KEY: z @@ -1673,6 +1722,29 @@ const EnvironmentSchema = z RUN_REPLICATION_DISABLE_PAYLOAD_INSERT: z.string().default("0"), RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING: z.string().default("0"), + // --- Run-ops DB split — second replication source (the NEW dedicated run-ops DB). --- + // Cloud-only; only consulted when isSplitEnabled() is true. Self-host never sets these. + // The NEW source's connection URL is TASK_RUN_DATABASE_URL; these add + // the NEW source's replication slot/publication and an explicit per-source enable so it can be + // brought up independently of the legacy source during the transition. + RUN_REPLICATION_NEW_SLOT_NAME: z.string().default("task_runs_to_clickhouse_v2"), + RUN_REPLICATION_NEW_PUBLICATION_NAME: z + .string() + .default("task_runs_to_clickhouse_v2_publication"), + RUN_REPLICATION_NEW_ENABLED: z.string().default("0"), + // Origin generations packed into _version via composeTaskRunVersion. + // Legacy DB = 0, new dedicated run-ops DB = 1. Exposed as env so the mapping is auditable + // per-deploy, but DEFAULTS encode the canonical legacy=0 / new=1 contract. + RUN_REPLICATION_LEGACY_ORIGIN_GENERATION: z.coerce.number().int().default(0), + RUN_REPLICATION_NEW_ORIGIN_GENERATION: z.coerce.number().int().default(1), + + // Run-ops KSUID mint cutover — per-env, canary-first, OFF by default. 
+ // Even when on, an env mints KSUID only if its per-org runOpsMintKsuid flag is + // "ksuid" AND isSplitEnabled() is true. Cache mirrors REALTIME_BACKEND_FLAG_CACHE_*. + RUN_OPS_MINT_KSUID_ENABLED: BoolEnv.default(false), + RUN_OPS_MINT_FLAG_CACHE_TTL_MS: z.coerce.number().int().default(30_000), + RUN_OPS_MINT_FLAG_CACHE_MAX_ENTRIES: z.coerce.number().int().default(10_000), + // Session replication (Postgres → ClickHouse sessions_v1). Shares Redis // with the runs replicator for leader locking but has its own slot and // publication so the two consume independently. diff --git a/apps/webapp/app/models/api-key.server.ts b/apps/webapp/app/models/api-key.server.ts index b5f2bd0f7d9..19947417229 100644 --- a/apps/webapp/app/models/api-key.server.ts +++ b/apps/webapp/app/models/api-key.server.ts @@ -2,6 +2,7 @@ import type { RuntimeEnvironment } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { customAlphabet } from "nanoid"; import { RuntimeEnvironmentType } from "~/database-types"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; const apiKeyId = customAlphabet( "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", @@ -87,6 +88,9 @@ export async function regenerateApiKey({ userId, environmentId }: RegenerateAPIK }); }); + // The env's apiKey changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environmentId); + return updatedEnviroment; } diff --git a/apps/webapp/app/models/runtimeEnvironment.server.ts b/apps/webapp/app/models/runtimeEnvironment.server.ts index efcfdc524fa..5e6974cb0f1 100644 --- a/apps/webapp/app/models/runtimeEnvironment.server.ts +++ b/apps/webapp/app/models/runtimeEnvironment.server.ts @@ -2,6 +2,7 @@ import type { AuthenticatedEnvironment } from "@internal/run-engine"; import type { Prisma, PrismaClientOrTransaction, RuntimeEnvironment } from "@trigger.dev/database"; import { $replica, prisma } from "~/db.server"; import { runStore } from "~/v3/runStore.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { logger } from "~/services/logger.server"; import { getUsername } from "~/utils/username"; import { isDefaultDevBranch, sanitizeBranchName } from "@trigger.dev/core/v3/utils/gitBranch"; @@ -271,24 +272,32 @@ export async function findEnvironmentFromRun( runId: string, tx?: PrismaClientOrTransaction ): Promise { - // The include (no select) already pulls every taskRun scalar, so runTags/batchId - // ride along for free — no extra query for the realtime publish to send a full record. + // Run-ops scalars (runTags/batchId/runtimeEnvironmentId) from the run store; the env half is + // resolved via the control-plane resolver so the run-ops DB can split without a cross-DB join. const taskRun = await runStore.findRun( { id: runId, }, { - include: { - runtimeEnvironment: { include: authIncludeBase }, + select: { + runTags: true, + batchId: true, + runtimeEnvironmentId: true, }, }, tx ?? 
$replica ); - if (!taskRun?.runtimeEnvironment) { + if (!taskRun) { + return null; + } + const environment = await controlPlaneResolver.resolveAuthenticatedEnv( + taskRun.runtimeEnvironmentId + ); + if (!environment) { return null; } return { - environment: toAuthenticated(taskRun.runtimeEnvironment), + environment, runTags: taskRun.runTags, batchId: taskRun.batchId, }; diff --git a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts index fa197fc1694..44c7c0243c6 100644 --- a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts +++ b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.burst-factor.ts @@ -2,6 +2,7 @@ import { type ActionFunctionArgs, json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server"; const ParamsSchema = z.object({ @@ -26,5 +27,7 @@ export async function action({ request, params }: ActionFunctionArgs) { await updateEnvConcurrencyLimits(environment); + controlPlaneResolver.invalidateEnvironment(environmentId); + return json({ success: true }); } diff --git a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts index ae8deb32dfa..908e8f449a0 100644 --- a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts +++ b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts @@ -3,6 +3,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from 
"~/v3/runOpsMigration/controlPlaneResolver.server"; import { engine } from "~/v3/runEngine.server"; import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server"; @@ -45,6 +46,10 @@ export async function action({ request, params }: ActionFunctionArgs) { await updateEnvConcurrencyLimits(environment); + // Org max-concurrency changed too, which is embedded in every env of the org; invalidating + // the org drops the env/authEnv rows for all of them (including this env). + controlPlaneResolver.invalidateOrganization(environment.organizationId); + return json({ success: true }); } diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts index 97c5e74583f..c99637a0d10 100644 --- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.concurrency.ts @@ -3,6 +3,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server"; const ParamsSchema = z.object({ @@ -83,5 +84,8 @@ export async function action({ request, params }: ActionFunctionArgs) { await updateEnvConcurrencyLimits({ ...modifiedEnvironment, organization }); } + // Org + every affected env's concurrency changed; one org invalidation covers them all. 
+ controlPlaneResolver.invalidateOrganization(organizationId); + return json({ success: true }); } diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts index 779847d250f..e5fd7f7963b 100644 --- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts @@ -3,6 +3,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { validatePartialFeatureFlags } from "~/v3/featureFlags"; const ParamsSchema = z.object({ @@ -101,6 +102,9 @@ export async function action({ request, params }: ActionFunctionArgs) { }, }); + // Org feature flags are embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(organizationId); + const updatedFlagsResult = updatedOrganization.featureFlags ? 
validatePartialFeatureFlags(updatedOrganization.featureFlags as Record) : { success: false as const }; diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts index cb888b5b094..872900c9e2d 100644 --- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.runs.enable.ts @@ -3,6 +3,7 @@ import { EnvironmentPauseSource } from "@trigger.dev/database"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { PauseEnvironmentService } from "~/v3/services/pauseEnvironment.server"; const ParamsSchema = z.object({ @@ -43,6 +44,10 @@ export async function action({ request, params }: ActionFunctionArgs) { return json({ error: "Organization not found" }, { status: 404 }); } + // `runsEnabled` is embedded in every env of the org; drop all its cached env rows. The + // per-env pause writes below invalidate their own envs via PauseEnvironmentService. 
+ controlPlaneResolver.invalidateOrganization(organizationId); + const environments = await prisma.runtimeEnvironment.findMany({ where: { organizationId, diff --git a/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts b/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts index ea0dd757c25..3c62d9c7a5f 100644 --- a/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts +++ b/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts @@ -4,6 +4,7 @@ import { Prisma } from "@trigger.dev/database"; import { z } from "zod"; import { prisma } from "~/db.server"; import { requireUser } from "~/services/session.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { flags as getGlobalFlags } from "~/v3/featureFlags.server"; import { FEATURE_FLAG, @@ -132,5 +133,8 @@ export async function action({ request, params }: ActionFunctionArgs) { throw e; } + // Org feature flags are embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(organizationId); + return json({ success: true }); } diff --git a/apps/webapp/app/services/archiveBranch.server.ts b/apps/webapp/app/services/archiveBranch.server.ts index e6dc3d3325a..3372ac87229 100644 --- a/apps/webapp/app/services/archiveBranch.server.ts +++ b/apps/webapp/app/services/archiveBranch.server.ts @@ -2,6 +2,7 @@ import { type PrismaClient } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { logger } from "./logger.server"; import { nanoid } from "nanoid"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; export class ArchiveBranchService { #prismaClient: PrismaClient; @@ -88,6 +89,9 @@ export class ArchiveBranchService { data: { archivedAt: new Date(), slug, shortcode }, }); + // archivedAt/slug/shortcode changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environmentId); + return { success: true as const, branch: updatedBranch, diff --git a/apps/webapp/app/services/deleteOrganization.server.ts b/apps/webapp/app/services/deleteOrganization.server.ts index 6c490b276df..9f8eb1cd37a 100644 --- a/apps/webapp/app/services/deleteOrganization.server.ts +++ b/apps/webapp/app/services/deleteOrganization.server.ts @@ -4,6 +4,7 @@ import { prisma } from "~/db.server"; import { featuresForRequest } from "~/features.server"; import { DeleteProjectService } from "./deleteProject.server"; import { getCurrentPlan } from "./platform.v3.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; export class DeleteOrganizationService { #prismaClient: PrismaClient; @@ -82,5 +83,8 @@ export class DeleteOrganizationService { deletedAt: new Date(), }, }); + + // runsEnabled + the org's projects (project.deletedAt) changed; drop all cached env rows. + controlPlaneResolver.invalidateOrganization(organization.id); } } diff --git a/apps/webapp/app/services/deleteProject.server.ts b/apps/webapp/app/services/deleteProject.server.ts index f6bc23d56a6..bbce896a57f 100644 --- a/apps/webapp/app/services/deleteProject.server.ts +++ b/apps/webapp/app/services/deleteProject.server.ts @@ -2,6 +2,7 @@ import type { PrismaClient } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { marqs } from "~/v3/marqs/index.server"; import { engine } from "~/v3/runEngine.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; type Options = ({ projectId: string } | { projectSlug: string }) & { userId: string; @@ -60,6 +61,11 @@ export class DeleteProjectService { deletedAt: new Date(), }, }); + + // project.deletedAt (which gates env resolution) changed; drop every cached env of this project. 
+ for (const environment of project.environments) { + controlPlaneResolver.invalidateEnvironment(environment.id); + } } async #getProjectId(options: Options) { diff --git a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts index 3cb30ba6a27..98316b84fe8 100644 --- a/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts +++ b/apps/webapp/app/services/realtime/streamBasinProvisioner.server.ts @@ -11,6 +11,7 @@ import type { PrismaClientOrTransaction } from "~/db.server"; import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { parseDuration } from "./duration.server"; export function isPerOrgBasinsEnabled(): boolean { @@ -76,6 +77,9 @@ export async function provisionBasinForOrg( data: { streamBasinName: basin }, }); + // streamBasinName is embedded in every env of the org; drop all its cached env rows. + controlPlaneResolver.invalidateOrganization(org.id); + logger.info("[streamBasinProvisioner] provisioned basin for org", { orgId: org.id, basin, @@ -158,6 +162,9 @@ export async function deprovisionBasinForOrg( data: { streamBasinName: null }, }); + // streamBasinName is embedded in every env of the org; drop all its cached env rows. 
+ controlPlaneResolver.invalidateOrganization(org.id); + logger.info("[streamBasinProvisioner] deprovisioned basin for org", { orgId, previousBasin: org.streamBasinName, diff --git a/apps/webapp/app/v3/engineVersion.server.ts b/apps/webapp/app/v3/engineVersion.server.ts index 0d0c6ecfdbf..32eca6fb882 100644 --- a/apps/webapp/app/v3/engineVersion.server.ts +++ b/apps/webapp/app/v3/engineVersion.server.ts @@ -5,6 +5,11 @@ import { getCurrentWorkerDeploymentEngineVersion, } from "./models/workerDeployment.server"; +// Co-locate the per-env run-ops residency/mint decision next to the +// engine-version decision. determineEngineVersion is intentionally left untouched so its +// read-only callers (presenters, admin routes, pauseQueue) never pay the mint flag read. +export { resolveRunIdMintKind, type RunIdMintKind } from "./runOpsMigration/runOpsMintKind.server"; + type Environment = { id: string; type: RuntimeEnvironmentType; diff --git a/apps/webapp/app/v3/eventRepository/index.server.ts b/apps/webapp/app/v3/eventRepository/index.server.ts index 614424a1993..f0687d2a7ce 100644 --- a/apps/webapp/app/v3/eventRepository/index.server.ts +++ b/apps/webapp/app/v3/eventRepository/index.server.ts @@ -3,6 +3,7 @@ import { eventRepository } from "./eventRepository.server"; import { type IEventRepository, type TraceEventOptions } from "./eventRepository.types"; import { prisma } from "~/db.server"; import { runStore } from "../runStore.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { logger } from "~/services/logger.server"; import { FEATURE_FLAG } from "../featureFlags"; import { flag } from "../featureFlags.server"; @@ -261,7 +262,7 @@ async function recordRunEvent( } async function findRunForEventCreation(runId: string) { - return runStore.findRun( + const foundRun = await runStore.findRun( { id: runId, }, @@ -271,21 +272,29 @@ async function findRunForEventCreation(runId: string) { taskIdentifier: true, traceContext: 
true, taskEventStore: true, - runtimeEnvironment: { - select: { - id: true, - type: true, - organizationId: true, - projectId: true, - project: { - select: { - externalRef: true, - }, - }, - }, - }, + runtimeEnvironmentId: true, }, }, prisma ); + + if (!foundRun) { + return null; + } + + const environment = await controlPlaneResolver.resolveAuthenticatedEnv( + foundRun.runtimeEnvironmentId + ); + + if (!environment) { + // Run exists but its environment could not be resolved (e.g. a lagging replica + // under split); distinguish this from a genuinely missing run. + logger.warn("Run found but environment unresolved for event creation", { + runId, + runtimeEnvironmentId: foundRun.runtimeEnvironmentId, + }); + return null; + } + + return { ...foundRun, runtimeEnvironment: environment }; } diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 46434bebf30..4617179eda1 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -17,6 +17,8 @@ export const FEATURE_FLAG = { computeMigrationFreePercentage: "computeMigrationFreePercentage", computeMigrationPaidPercentage: "computeMigrationPaidPercentage", computeMigrationRequireTemplate: "computeMigrationRequireTemplate", + devBranchesEnabled: "devBranchesEnabled", + runOpsMintKsuid: "runOpsMintKsuid", } as const; export const FeatureFlagCatalog = { @@ -47,6 +49,11 @@ export const FeatureFlagCatalog = { // When on, migrated orgs build their compute template in required mode at deploy // (fails the deploy on error) instead of shadow. Strict boolean (see above). [FEATURE_FLAG.computeMigrationRequireTemplate]: z.boolean(), + // Per-org access to development branches. Off unless enabled for the org. + [FEATURE_FLAG.devBranchesEnabled]: z.coerce.boolean(), + // Per-org KSUID mint cutover. Defaults to "cuid"; only honored when + // RUN_OPS_MINT_KSUID_ENABLED is on AND isSplitEnabled() is true. 
+ [FEATURE_FLAG.runOpsMintKsuid]: z.enum(["cuid", "ksuid"]), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts index 3f9cd603b07..4d9e263d6be 100644 --- a/apps/webapp/app/v3/runEngine.server.ts +++ b/apps/webapp/app/v3/runEngine.server.ts @@ -8,6 +8,9 @@ import { defaultMachine, getCurrentPlan } from "~/services/platform.v3.server"; import { singleton } from "~/utils/singleton"; import { allMachines } from "./machinePresets.server"; import { runEnginePendingVersionLookup } from "./runEnginePendingVersionLookup.server"; +import { pickRunOpsStoreForCompletion } from "./runOpsMigration/crossSeamGuard.server"; +import { runEngineControlPlaneResolver } from "./runOpsMigration/runEngineControlPlaneResolver.server"; +import { runStore } from "./runStore.server"; import { meter, tracer } from "./tracer.server"; export const engine = singleton("RunEngine", createRunEngine); @@ -18,6 +21,12 @@ function createRunEngine() { const engine = new RunEngine({ prisma, readOnlyPrisma: $replica, + crossSeamGuard: pickRunOpsStoreForCompletion, + // Inject the shared run-store singleton so the engine and the webapp presenters/ + // services route through ONE store. When split is off this is the same passthrough + // PostgresRunStore the engine would have defaulted to, so behavior is unchanged. 
+ store: runStore, + controlPlaneResolver: runEngineControlPlaneResolver, logLevel: env.RUN_ENGINE_WORKER_LOG_LEVEL, treatProductionExecutionStallsAsOOM: env.RUN_ENGINE_TREAT_PRODUCTION_EXECUTION_STALLS_AS_OOM === "1", diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 6e99898cdfe..d6c4dd5fe3f 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -3,8 +3,14 @@ import { SpanKind } from "@internal/tracing"; import { tryCatch } from "@trigger.dev/core/utils"; import { createJsonErrorObject, sanitizeError, TaskRunErrorCodes } from "@trigger.dev/core/v3"; import { RunId } from "@trigger.dev/core/v3/isomorphic"; -import type { BatchTaskRunStatus, Prisma } from "@trigger.dev/database"; -import { $replica, prisma } from "~/db.server"; +import { + $replica, + prisma, + runOpsLegacyPrisma, + runOpsNewPrisma, + runOpsNewReplica, + runOpsLegacyReplica, +} from "~/db.server"; import { env } from "~/env.server"; import { findEnvironmentById, findEnvironmentFromRun } from "~/models/runtimeEnvironment.server"; import { TriggerFailedTaskService } from "~/runEngine/services/triggerFailedTask.server"; @@ -22,39 +28,52 @@ import { getEventRepositoryForStore, recordRunDebugLog } from "./eventRepository import { roomFromFriendlyRunId, socketIo } from "./handleSocketIo.server"; import { engine } from "./runEngine.server"; import { runStore } from "./runStore.server"; +import { isSplitEnabled } from "~/v3/runOpsMigration/splitMode.server"; import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server"; +import { + handleBatchCompletion, + QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE, + readRunForEvent, + readRunForEventOrThrow, + type EventReadDeps, +} from "./runEngineHandlersShared.server"; export function registerRunEngineEventBusHandlers() { + // Resolve the split-mode gate ONCE at registration scope (never per-event). 
+ const splitEnabledPromise = isSplitEnabled(); + const eventReadDeps = async (): Promise => ({ + store: runStore, + newReplica: runOpsNewReplica, + legacyReplica: runOpsLegacyReplica, + splitEnabled: await splitEnabledPromise, + }); engine.eventBus.on("runSucceeded", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( - { - id: run.id, - }, + readRunForEventOrThrow( + run.id, + environment.id, { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read so the - // per-env channel carries the membership keys (no separate query). No-op when - // the native backend is disabled. - runTags: true, - batchId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read so the + // per-env channel carries the membership keys (no separate query). No-op when + // the native backend is disabled. 
+ runTags: true, + batchId: true, }, - $replica + await eventReadDeps() ) ); @@ -113,33 +132,30 @@ export function registerRunEngineEventBusHandlers() { const exception = createExceptionPropertiesFromError(sanitizedError); const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( + readRunForEventOrThrow( + run.id, + environment.id, { - id: run.id, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). - runTags: true, - batchId: true, - }, - }, - $replica + await eventReadDeps() ) ); @@ -185,33 +201,33 @@ export function registerRunEngineEventBusHandlers() { const exception = createExceptionPropertiesFromError(sanitizedError); const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( - { - id: run.id, - }, + readRunForEventOrThrow( + run.id, + // runAttemptFailed carries no environment param; the env is derived from + // the read row afterwards. environmentId is informational for read-through + // (residency is keyed on runId), so an empty value is safe here. 
+ "", { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). - runTags: true, - batchId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - $replica + await eventReadDeps() ) ); @@ -273,29 +289,28 @@ export function registerRunEngineEventBusHandlers() { return; } + const deps = await eventReadDeps(); + const [cachedRunError, cachedRun] = await tryCatch( - runStore.findRunOrThrow( + readRunForEventOrThrow( + cachedRunId ?? 
"", + "", { - id: cachedRunId, - }, - { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, }, - $replica + deps ) ); @@ -308,29 +323,26 @@ export function registerRunEngineEventBusHandlers() { } const [blockedRunError, blockedRun] = await tryCatch( - runStore.findRun( - { - id: blockedRunId, - }, + readRunForEvent( + blockedRunId, + "", { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, }, - $replica + deps ) ); @@ -387,33 +399,30 @@ export function registerRunEngineEventBusHandlers() { } const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( - { - id: run.id, - }, + readRunForEventOrThrow( + run.id, + environment.id, { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // 
Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). - runTags: true, - batchId: true, - }, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - $replica + await eventReadDeps() ) ); @@ -456,33 +465,30 @@ export function registerRunEngineEventBusHandlers() { engine.eventBus.on("runCancelled", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( - runStore.findRunOrThrow( + readRunForEventOrThrow( + run.id, + environment.id, { - id: run.id, + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the native backend is disabled). + runTags: true, + batchId: true, }, - { - select: { - id: true, - friendlyId: true, - traceId: true, - spanId: true, - parentSpanId: true, - createdAt: true, - completedAt: true, - taskIdentifier: true, - projectId: true, - runtimeEnvironmentId: true, - environmentType: true, - isTest: true, - organizationId: true, - taskEventStore: true, - // Piggyback the realtime run-changed publish on this existing read (no-op when - // the native backend is disabled). 
- runTags: true, - batchId: true, - }, - }, - $replica + await eventReadDeps() ) ); @@ -771,15 +777,6 @@ export function registerRunEngineEventBusHandlers() { }); } -/** - * errorCode returned by the batch process-item callback when the trigger was - * rejected because the environment's queue is at its maximum size. The - * BatchQueue (via `skipRetries`) short-circuits retries for this code, and the - * batch completion callback collapses per-item errors into a single aggregate - * `BatchTaskRunError` row instead of writing one per item. - */ -const QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE = "QUEUE_SIZE_LIMIT_EXCEEDED"; - /** * Set up the BatchQueue processing callbacks. * These handle creating runs from batch items and completing batches. @@ -790,6 +787,9 @@ const QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE = "QUEUE_SIZE_LIMIT_EXCEEDED"; * - The run engine will download from R2 when the task executes */ export function setupBatchQueueCallbacks() { + // Resolve the split-mode gate ONCE at registration scope (never per-callback). + const splitEnabledPromise = isSplitEnabled(); + // Item processing callback - creates a run for each batch item engine.setBatchProcessItemCallback( async ({ batchId, friendlyId, itemIndex, item, meta, attempt, isFinalAttempt }) => { @@ -1035,104 +1035,17 @@ export function setupBatchQueueCallbacks() { } ); - // Batch completion callback - updates Postgres with results + // Batch completion callback - updates Postgres with results. The source callback + // is a thin wrapper that resolves the split-mode gate and supplies the run-ops + // handles; the body lives in handleBatchCompletion for testability. 
engine.setBatchCompletionCallback(async (result: CompleteBatchResult) => { - const { batchId, runIds, successfulRunCount, failedRunCount, failures } = result; - - // Determine final status - let status: BatchTaskRunStatus; - if (failedRunCount > 0 && successfulRunCount === 0) { - status = "ABORTED"; - } else if (failedRunCount > 0) { - status = "PARTIAL_FAILED"; - } else { - status = "PENDING"; // All runs created, waiting for completion - } - - try { - // Use a transaction to ensure atomicity of batch update and error record creation - // skipDuplicates handles idempotency when callback is retried (relies on unique constraint) - await prisma.$transaction(async (tx) => { - // Update BatchTaskRun - await tx.batchTaskRun.update({ - where: { id: batchId }, - data: { - status, - runIds, - successfulRunCount, - failedRunCount, - completedAt: status === "ABORTED" ? new Date() : undefined, - processingCompletedAt: new Date(), - }, - }); - - // Create error records if there were failures. - // - // Fast-path for queue-size-limit overload: when every failure is the - // same QUEUE_SIZE_LIMIT_EXCEEDED error, collapse them into a single - // aggregate row instead of writing one per item. This keeps the DB - // write volume bounded to O(batches) instead of O(items) when a noisy - // tenant fills their queue and all of their batches start bouncing. - if (failures.length > 0) { - const allQueueSizeLimit = failures.every( - (f) => f.errorCode === QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE - ); - - if (allQueueSizeLimit) { - const sample = failures[0]!; - await tx.batchTaskRunError.createMany({ - data: [ - { - batchTaskRunId: batchId, - // Use the first item's index as a stable anchor for the - // (batchTaskRunId, index) unique constraint so callback - // retries remain idempotent. 
- index: sample.index, - taskIdentifier: sample.taskIdentifier, - payload: sample.payload, - options: sample.options as Prisma.InputJsonValue | undefined, - error: `${sample.error} (${failures.length} items in this batch failed with the same error)`, - errorCode: sample.errorCode, - }, - ], - skipDuplicates: true, - }); - } else { - await tx.batchTaskRunError.createMany({ - data: failures.map((failure) => ({ - batchTaskRunId: batchId, - index: failure.index, - taskIdentifier: failure.taskIdentifier, - payload: failure.payload, - options: failure.options as Prisma.InputJsonValue | undefined, - error: failure.error, - errorCode: failure.errorCode, - })), - skipDuplicates: true, - }); - } - } - }); - - // Try to complete the batch (handles waitpoint completion if all runs are done) - if (status !== "ABORTED") { - await engine.tryCompleteBatch({ batchId }); - } - - logger.info("Batch completion handled", { - batchId, - status, - successfulRunCount, - failedRunCount, - }); - } catch (error) { - logger.error("Failed to handle batch completion", { - batchId, - error: error instanceof Error ? 
error.message : String(error), - }); - // Re-throw to preserve Redis data for retry (BatchQueue expects errors to propagate) - throw error; - } + await handleBatchCompletion(result, { + splitEnabled: await splitEnabledPromise, + newReplica: runOpsNewReplica, + newWriter: runOpsNewPrisma, + legacyWriter: runOpsLegacyPrisma, + tryCompleteBatch: (batchId) => engine.tryCompleteBatch({ batchId }), + }); }); logger.info("BatchQueue callbacks configured"); diff --git a/apps/webapp/app/v3/runEngineHandlersShared.server.ts b/apps/webapp/app/v3/runEngineHandlersShared.server.ts new file mode 100644 index 00000000000..338bbd2f9fc --- /dev/null +++ b/apps/webapp/app/v3/runEngineHandlersShared.server.ts @@ -0,0 +1,223 @@ +/** + * Pure, store-routing helpers extracted from runEngineHandlers.server.ts so they + * are testable without constructing the engine (importing that module pulls in the + * whole webapp service graph). The handlers wire the production defaults; tests + * inject per-container stores/replicas, so these helpers never import db.server. + */ +import type { CompleteBatchResult } from "@internal/run-engine"; +import type { RunStore } from "@internal/run-store"; +import type { BatchTaskRunStatus, Prisma } from "@trigger.dev/database"; +import type { PrismaClient, PrismaReplicaClient } from "~/db.server"; +import { logger } from "~/services/logger.server"; +import { readThroughRun } from "~/v3/runOpsMigration/readThrough.server"; + +export type EventReadDeps = { + store: RunStore; + newReplica: PrismaReplicaClient; + legacyReplica: PrismaReplicaClient; + splitEnabled: boolean; + // Pure boundary forwarded to read-through; production leaves it undefined + // so the read-through layer uses its own wired default. Tests inject a fake. + isPastRetention?: (runId: string) => boolean; +}; + +/** + * Resolve a TaskRun for an event-bus enrichment read through the run-ops + * read-through layer. 
The store stays the read mechanism (the + * closures call `store.findRun(...)`); read-through only chooses which replica. + * Returns null when not-found / past-retention. Passthrough in single-DB. + */ +export async function readRunForEvent( + runId: string, + environmentId: string, + select: S, + deps: EventReadDeps +): Promise | null> { + const result = await readThroughRun>({ + runId, + environmentId, + readNew: (client) => deps.store.findRun({ id: runId }, { select }, client), + readLegacy: (replica) => deps.store.findRun({ id: runId }, { select }, replica), + deps: { + newClient: deps.newReplica, + legacyReplica: deps.legacyReplica, + splitEnabled: deps.splitEnabled, + isPastRetention: deps.isPastRetention, + }, + }); + + return result.source === "not-found" || result.source === "past-retention" ? null : result.value; +} + +/** + * Reproduces the `findRunOrThrow` not-found-as-error semantics the 6 throwing + * read sites rely on (a missing run throws, which their `tryCatch` turns into + * the existing error-log + early-return — never a silent no-op). + */ +export async function readRunForEventOrThrow( + runId: string, + environmentId: string, + select: S, + deps: EventReadDeps +): Promise> { + const run = await readRunForEvent(runId, environmentId, select, deps); + if (!run) { + throw new Error("Task run not found"); + } + return run; +} + +/** + * Resolve which run-ops writer physically owns the `BatchTaskRun` row for + * `batchId` by probing where the row lives, so the batch-completion txn commits + * on a single run-ops DB. Length classification is INVALID here: a batch id may + * be a ksuid (cut-over orgs) or a cuid (and cuid-shaped ids can be backfilled + * onto NEW), so id-shape does not reliably indicate the row's actual residency. + * The existence probe is the correct signal. 
+ */ +export async function resolveBatchRunOpsWriter( + batchId: string, + deps: { + newReplica: PrismaReplicaClient; + newWriter: PrismaClient; + legacyWriter: PrismaClient; + } +): Promise { + const onNew = await deps.newReplica.batchTaskRun.findFirst({ + where: { id: batchId }, + select: { id: true }, + }); + return onNew ? deps.newWriter : deps.legacyWriter; +} + +/** + * errorCode returned by the batch process-item callback when the trigger was + * rejected because the environment's queue is at its maximum size. The + * BatchQueue (via `skipRetries`) short-circuits retries for this code, and the + * batch completion callback collapses per-item errors into a single aggregate + * `BatchTaskRunError` row instead of writing one per item. + */ +export const QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE = "QUEUE_SIZE_LIMIT_EXCEEDED"; + +export type BatchCompletionDeps = { + splitEnabled: boolean; + newReplica: PrismaReplicaClient; + newWriter: PrismaClient; + legacyWriter: PrismaClient; + tryCompleteBatch: (batchId: string) => Promise; +}; + +/** + * Routes the batch-completion transaction (BatchTaskRun update + BatchTaskRunError + * createMany — both run-ops tables) onto the run-ops writer that physically owns + * the BatchTaskRun row for `batchId`, so the whole txn commits on a single DB. The + * transaction body is unchanged from before the split; only the client changes. + */ +export async function handleBatchCompletion( + result: CompleteBatchResult, + deps: BatchCompletionDeps +) { + const { batchId, runIds, successfulRunCount, failedRunCount, failures } = result; + + // Determine final status + let status: BatchTaskRunStatus; + if (failedRunCount > 0 && successfulRunCount === 0) { + status = "ABORTED"; + } else if (failedRunCount > 0) { + status = "PARTIAL_FAILED"; + } else { + status = "PENDING"; // All runs created, waiting for completion + } + + // Always probe residency — never special-case on splitEnabled (see commit msg). 
+ const runOpsWriter = await resolveBatchRunOpsWriter(batchId, { + newReplica: deps.newReplica, + newWriter: deps.newWriter, + legacyWriter: deps.legacyWriter, + }); + + try { + // Use a transaction to ensure atomicity of batch update and error record creation + // skipDuplicates handles idempotency when callback is retried (relies on unique constraint) + await runOpsWriter.$transaction(async (tx) => { + // Update BatchTaskRun + await tx.batchTaskRun.update({ + where: { id: batchId }, + data: { + status, + runIds, + successfulRunCount, + failedRunCount, + completedAt: status === "ABORTED" ? new Date() : undefined, + processingCompletedAt: new Date(), + }, + }); + + // Create error records if there were failures. + // + // Fast-path for queue-size-limit overload: when every failure is the + // same QUEUE_SIZE_LIMIT_EXCEEDED error, collapse them into a single + // aggregate row instead of writing one per item. This keeps the DB + // write volume bounded to O(batches) instead of O(items) when a noisy + // tenant fills their queue and all of their batches start bouncing. + if (failures.length > 0) { + const allQueueSizeLimit = failures.every( + (f) => f.errorCode === QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE + ); + + if (allQueueSizeLimit) { + const sample = failures[0]!; + await tx.batchTaskRunError.createMany({ + data: [ + { + batchTaskRunId: batchId, + // Use the first item's index as a stable anchor for the + // (batchTaskRunId, index) unique constraint so callback + // retries remain idempotent. 
+ index: sample.index, + taskIdentifier: sample.taskIdentifier, + payload: sample.payload, + options: sample.options as Prisma.InputJsonValue | undefined, + error: `${sample.error} (${failures.length} items in this batch failed with the same error)`, + errorCode: sample.errorCode, + }, + ], + skipDuplicates: true, + }); + } else { + await tx.batchTaskRunError.createMany({ + data: failures.map((failure) => ({ + batchTaskRunId: batchId, + index: failure.index, + taskIdentifier: failure.taskIdentifier, + payload: failure.payload, + options: failure.options as Prisma.InputJsonValue | undefined, + error: failure.error, + errorCode: failure.errorCode, + })), + skipDuplicates: true, + }); + } + } + }); + + // Try to complete the batch (handles waitpoint completion if all runs are done) + if (status !== "ABORTED") { + await deps.tryCompleteBatch(batchId); + } + + logger.info("Batch completion handled", { + batchId, + status, + successfulRunCount, + failedRunCount, + }); + } catch (error) { + logger.error("Failed to handle batch completion", { + batchId, + error: error instanceof Error ? 
error.message : String(error), + }); + // Re-throw to preserve Redis data for retry (BatchQueue expects errors to propagate) + throw error; + } +} diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts new file mode 100644 index 00000000000..9c90e3067e1 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.test.ts @@ -0,0 +1,237 @@ +import { describe, expect, it } from "vitest"; +import { + ControlPlaneCache, + type ResolvedAuthenticatedEnv, + type ResolvedEnv, + type ResolvedRunLockedWorker, + type ResolvedWorkerVersion, +} from "./controlPlaneCache.server"; + +// Minimal, structurally-irrelevant stand-ins: the cache stores and returns opaque values by +// reference, so these only need to be distinguishable objects — the slot types are exercised for +// key routing, not field shape. +const anEnv = { id: "env_1", organizationId: "org_1" } as unknown as ResolvedEnv; +const aVersion = { worker: { id: "bw_1" } } as unknown as ResolvedWorkerVersion; +const anAuthEnv = { + id: "env_1", + slug: "prod", + organizationId: "org_1", +} as unknown as ResolvedAuthenticatedEnv; +const aLockedWorker = { lockedBy: null, lockedToVersion: null } as ResolvedRunLockedWorker; + +describe("ControlPlaneCache", () => { + it("round-trips a value through every slot", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_1", anEnv); + cache.setWorkerVersion("env_1:current", aVersion); + cache.setEnvExists("env_1", true); + cache.setAuthEnv("env_1", anAuthEnv); + cache.setLockedWorker("bw_1:v_1", aLockedWorker); + + expect(cache.getEnv("env_1")).toBe(anEnv); + expect(cache.getWorkerVersion("env_1:current")).toBe(aVersion); + expect(cache.getEnvExists("env_1")).toBe(true); + expect(cache.getAuthEnv("env_1")).toBe(anAuthEnv); + expect(cache.getLockedWorker("bw_1:v_1")).toBe(aLockedWorker); + }); + + it("returns undefined 
for a key that was never set, in every slot", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + expect(cache.getEnv("missing")).toBeUndefined(); + expect(cache.getWorkerVersion("missing")).toBeUndefined(); + expect(cache.getEnvExists("missing")).toBeUndefined(); + expect(cache.getAuthEnv("missing")).toBeUndefined(); + expect(cache.getLockedWorker("missing")).toBeUndefined(); + }); + + it("distinguishes a cached null (confirmed absence) from an unset miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + expect(cache.getEnv("env_2")).toBeUndefined(); + cache.setEnv("env_2", null); + expect(cache.getEnv("env_2")).toBeNull(); + + expect(cache.getAuthEnv("env_2")).toBeUndefined(); + cache.setAuthEnv("env_2", null); + expect(cache.getAuthEnv("env_2")).toBeNull(); + + expect(cache.getWorkerVersion("env_2:current")).toBeUndefined(); + cache.setWorkerVersion("env_2:current", null); + expect(cache.getWorkerVersion("env_2:current")).toBeNull(); + + expect(cache.getLockedWorker("_:_")).toBeUndefined(); + cache.setLockedWorker("_:_", null); + expect(cache.getLockedWorker("_:_")).toBeNull(); + }); + + it("caches a false env-existence result distinctly from an unset miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + expect(cache.getEnvExists("env_3")).toBeUndefined(); + cache.setEnvExists("env_3", false); + expect(cache.getEnvExists("env_3")).toBe(false); + }); + + it("invalidateEnv forces the next getEnv to miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_4", anEnv); + expect(cache.getEnv("env_4")).toBe(anEnv); + + cache.invalidateEnv("env_4"); + expect(cache.getEnv("env_4")).toBeUndefined(); + }); + + it("makes a re-setEnv after invalidation readable again", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const replacement = { id: "env_5b" } as unknown as 
ResolvedEnv; + + cache.setEnv("env_5", anEnv); + cache.invalidateEnv("env_5"); + expect(cache.getEnv("env_5")).toBeUndefined(); + + cache.setEnv("env_5", replacement); + expect(cache.getEnv("env_5")).toBe(replacement); + }); + + it("invalidateEnv is scoped to its own id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const other = { id: "env_keep" } as unknown as ResolvedEnv; + + cache.setEnv("env_drop", anEnv); + cache.setEnv("env_keep", other); + cache.invalidateEnv("env_drop"); + + expect(cache.getEnv("env_drop")).toBeUndefined(); + expect(cache.getEnv("env_keep")).toBe(other); + }); + + it("does not collide keys across slots for the same id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("x", anEnv); + cache.setEnvExists("x", true); + cache.setAuthEnv("x", anAuthEnv); + + expect(cache.getEnv("x")).toBe(anEnv); + expect(cache.getEnvExists("x")).toBe(true); + expect(cache.getAuthEnv("x")).toBe(anAuthEnv); + + // Invalidating the env slot leaves the sibling slots for the same id intact. 
+ cache.invalidateEnv("x"); + expect(cache.getEnv("x")).toBeUndefined(); + expect(cache.getEnvExists("x")).toBe(true); + expect(cache.getAuthEnv("x")).toBe(anAuthEnv); + }); + + it("evicts the oldest entry once maxEntries is exceeded", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 2 }); + + cache.setEnv("first", { id: "first" } as unknown as ResolvedEnv); + cache.setEnv("second", { id: "second" } as unknown as ResolvedEnv); + cache.setEnv("third", { id: "third" } as unknown as ResolvedEnv); + + expect(cache.getEnv("first")).toBeUndefined(); + expect(cache.getEnv("second")).toMatchObject({ id: "second" }); + expect(cache.getEnv("third")).toMatchObject({ id: "third" }); + }); + + it("treats a zero-TTL entry as immediately expired", () => { + const cache = new ControlPlaneCache({ ttlMs: 0, maxEntries: 100 }); + + cache.setEnv("env_ttl", anEnv); + expect(cache.getEnv("env_ttl")).toBeUndefined(); + }); + + it("invalidateEnvironment forces the next env/authEnv/envExists read to miss", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_6", anEnv); + cache.setAuthEnv("env_6", anAuthEnv); + cache.setEnvExists("env_6", true); + expect(cache.getEnv("env_6")).toBe(anEnv); + expect(cache.getAuthEnv("env_6")).toBe(anAuthEnv); + expect(cache.getEnvExists("env_6")).toBe(true); + + cache.invalidateEnvironment("env_6"); + + expect(cache.getEnv("env_6")).toBeUndefined(); + expect(cache.getAuthEnv("env_6")).toBeUndefined(); + expect(cache.getEnvExists("env_6")).toBeUndefined(); + }); + + it("invalidateEnvironment is scoped to its own id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const keepEnv = { id: "env_keep", organizationId: "org_1" } as unknown as ResolvedEnv; + + cache.setEnv("env_drop", anEnv); + cache.setEnv("env_keep", keepEnv); + cache.invalidateEnvironment("env_drop"); + + expect(cache.getEnv("env_drop")).toBeUndefined(); + 
expect(cache.getEnv("env_keep")).toBe(keepEnv); + }); + + it("invalidateOrganization drops env/authEnv rows for that org across every env id", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const envA = { id: "env_a", organizationId: "org_1" } as unknown as ResolvedEnv; + const envB = { id: "env_b", organizationId: "org_1" } as unknown as ResolvedEnv; + const authA = { + id: "env_a", + slug: "a", + organizationId: "org_1", + } as unknown as ResolvedAuthenticatedEnv; + + cache.setEnv("env_a", envA); + cache.setEnv("env_b", envB); + cache.setAuthEnv("env_a", authA); + expect(cache.getEnv("env_a")).toBe(envA); + expect(cache.getEnv("env_b")).toBe(envB); + expect(cache.getAuthEnv("env_a")).toBe(authA); + + cache.invalidateOrganization("org_1"); + + // Every env/authEnv row for org_1 misses — no reverse org->env index required. + expect(cache.getEnv("env_a")).toBeUndefined(); + expect(cache.getEnv("env_b")).toBeUndefined(); + expect(cache.getAuthEnv("env_a")).toBeUndefined(); + }); + + it("invalidateOrganization does not affect a different org's cached envs", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const otherOrgEnv = { id: "env_other", organizationId: "org_2" } as unknown as ResolvedEnv; + + cache.setEnv("env_1", anEnv); // org_1 + cache.setEnv("env_other", otherOrgEnv); // org_2 + + cache.invalidateOrganization("org_1"); + + expect(cache.getEnv("env_1")).toBeUndefined(); + expect(cache.getEnv("env_other")).toBe(otherOrgEnv); + }); + + it("re-setting an env after an org invalidation makes it readable again", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_1", anEnv); + cache.invalidateOrganization("org_1"); + expect(cache.getEnv("env_1")).toBeUndefined(); + + // A write after the bump stamps the new org epoch, so it reads back. 
+ cache.setEnv("env_1", anEnv); + expect(cache.getEnv("env_1")).toBe(anEnv); + }); + + it("a cached null env survives an org invalidation (a confirmed absence carries no org)", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + + cache.setEnv("env_absent", null); + expect(cache.getEnv("env_absent")).toBeNull(); + + cache.invalidateOrganization("org_1"); + + expect(cache.getEnv("env_absent")).toBeNull(); + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts new file mode 100644 index 00000000000..01fd205030a --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneCache.server.ts @@ -0,0 +1,244 @@ +import type { + BackgroundWorker, + BackgroundWorkerTask, + Prisma, + RuntimeEnvironmentType, + TaskQueue, + WorkerDeployment, +} from "@trigger.dev/database"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; +import type { AuthenticatedEnvironment } from "@trigger.dev/core/v3/auth/environment"; + +/** + * Cache policy + invalidation for the cross-DB control-plane resolver. + * + * One-way dependency: this module is imported by `controlPlaneResolver.server.ts`; + * it must NEVER import the resolver. The shared `Resolved*` return types live here + * so both files reference an identical definition (the resolver re-exports them for + * consumers). + * + * Invalidation note: the underlying `BoundedTtlCache` exposes no public `delete`, so + * explicit invalidation is implemented with a per-key epoch map. A write stamps the + * stored value with the key's current epoch; a read returns the value only if its + * stamped epoch still matches the current epoch, otherwise it is treated as a miss. + * `invalidate*` bumps the key's epoch, forcing the next read to miss. (If a future + * rebase gives `BoundedTtlCache` a public `delete`, prefer it and drop the epoch map.) 
+ * + * Two invalidation scopes: `invalidateEnvironment(id)` bumps every env-keyed slot for one + * env; `invalidateOrganization(orgId)` bumps a per-org epoch that env/authEnv values are + * also stamped with at write time (no reverse org->env index needed), so all of that org's + * cached env/authEnv rows miss on the next read. + */ + +export const DEFAULT_CP_CACHE_TTL_MS = 30_000; +export const DEFAULT_CP_CACHE_MAX_ENTRIES = 10_000; + +export type ResolvedEnv = { + id: string; + type: RuntimeEnvironmentType; + projectId: string; + organizationId: string; + archivedAt: Date | null; + // The parent env's type, or null when this env has no parent. Alerts compute + // `parentEnvironmentType ?? type` (byte-identical to `parentEnvironment?.type ?? type`). + parentEnvironmentType: RuntimeEnvironmentType | null; + // Concurrency + nested ids the run-engine ControlPlaneResolver adapter maps to + // `ResolvedEngineEnv` (a MinimalAuthenticatedEnvironment superset). Existing app consumers + // ignore these additive fields. + maximumConcurrencyLimit: number; + concurrencyLimitBurstFactor: Prisma.Decimal; +}; + +/** Mirrors `WorkerDeploymentWithWorkerTasks` in `dequeueSystem.ts` exactly. */ +export type ResolvedWorkerVersion = { + worker: BackgroundWorker; + tasks: BackgroundWorkerTask[]; + queues: TaskQueue[]; + deployment: WorkerDeployment | null; +}; + +// The canonical authenticated-environment shape (slug/type/project/organization/orgMember/…) +// PLUS the `git` JSON column the run-engine runAttemptSystem reads. `AuthenticatedEnvironment` +// does not carry `git`, so the intersection adds it; this matches the run-engine +// `ResolvedAuthenticatedEnv` so the engine adapter can delegate to this cached slot. 
+export type ResolvedAuthenticatedEnv = AuthenticatedEnvironment & { git: Prisma.JsonValue | null }; + +/** + * The slim `lockedBy` (BackgroundWorkerTask) + `lockedToVersion` (BackgroundWorker, with its + * WorkerDeployment) shape — the UNION of every field webapp run sites read off these two + * cross-DB worker relations. Each field is optional because a run may be locked to a version + * but not a task (or neither); resolvers return only what exists. + */ +export type ResolvedRunLockedWorker = { + lockedBy: { + id: string; + filePath: string; + exportName: string | null; + slug: string; + machineConfig: Prisma.JsonValue | null; + worker: { + id: string; + version: string; + sdkVersion: string; + cliVersion: string; + supportsLazyAttempts: boolean; + deployment: { + friendlyId: string; + shortCode: string; + version: string; + runtime: string | null; + runtimeVersion: string | null; + git: Prisma.JsonValue | null; + } | null; + }; + } | null; + lockedToVersion: { + version: string; + sdkVersion: string; + runtime: string | null; + runtimeVersion: string | null; + supportsLazyAttempts: boolean; + } | null; +}; + +// `orgEpoch` is stamped only on slots that embed org config (env/authEnv); undefined slots +// are exempt from the org-epoch check. +type Stamped = { value: V; epoch: number; orgEpoch?: number }; + +export class ControlPlaneCache { + readonly #env: BoundedTtlCache>; + readonly #version: BoundedTtlCache>; + readonly #envExists: BoundedTtlCache>; + readonly #authEnv: BoundedTtlCache>; + readonly #lockedWorker: BoundedTtlCache>; + + // Explicit invalidation: bumping a key's (or org's) epoch forces the next read to miss. + readonly #epochs = new Map(); + readonly #orgEpochs = new Map(); + + constructor(opts?: { ttlMs?: number; maxEntries?: number }) { + const ttl = opts?.ttlMs ?? DEFAULT_CP_CACHE_TTL_MS; + const max = opts?.maxEntries ?? 
DEFAULT_CP_CACHE_MAX_ENTRIES; + this.#env = new BoundedTtlCache(ttl, max); + this.#version = new BoundedTtlCache(ttl, max); + this.#envExists = new BoundedTtlCache(ttl, max); + this.#authEnv = new BoundedTtlCache(ttl, max); + this.#lockedWorker = new BoundedTtlCache(ttl, max); + } + + #epoch(key: string): number { + return this.#epochs.get(key) ?? 0; + } + + #orgEpoch(orgId: string): number { + return this.#orgEpochs.get(orgId) ?? 0; + } + + #read(cache: BoundedTtlCache>, key: string, orgId?: string): V | undefined { + const entry = cache.get(key); + if (entry === undefined || entry.epoch !== this.#epoch(key)) { + return undefined; + } + if (orgId !== undefined && entry.orgEpoch !== this.#orgEpoch(orgId)) { + return undefined; + } + return entry.value; + } + + #write(cache: BoundedTtlCache>, key: string, value: V, orgId?: string): void { + cache.set(key, { + value, + epoch: this.#epoch(key), + orgEpoch: orgId !== undefined ? this.#orgEpoch(orgId) : undefined, + }); + } + + #bump(key: string): void { + this.#epochs.set(key, this.#epoch(key) + 1); + } + + getEnv(id: string): (ResolvedEnv | null) | undefined { + const entry = this.#env.get(`env:${id}`); + if (entry === undefined || entry.epoch !== this.#epoch(`env:${id}`)) { + return undefined; + } + // A cached null (or an entry written without an org) carries no org, so it can never be + // stale against an org write. + if ( + entry.value !== null && + entry.value.organizationId && + entry.orgEpoch !== this.#orgEpoch(entry.value.organizationId) + ) { + return undefined; + } + return entry.value; + } + setEnv(id: string, value: ResolvedEnv | null): void { + this.#write(this.#env, `env:${id}`, value, value?.organizationId); + } + invalidateEnv(id: string): void { + this.#bump(`env:${id}`); + } + + // worker version: key = `${environmentId}:${backgroundWorkerId ??
"current"}`, plus a trailing `:${type ?? "any"}` segment when built by the resolver's workerVersionKey() (the key is opaque to the cache) + getWorkerVersion(key: string): (ResolvedWorkerVersion | null) | undefined { + return this.#read(this.#version, `version:${key}`); + } + setWorkerVersion(key: string, value: ResolvedWorkerVersion | null): void { + this.#write(this.#version, `version:${key}`, value); + } + + // env existence (boolean; for the dropped-FK replacement check) + getEnvExists(id: string): boolean | undefined { + return this.#read(this.#envExists, `envExists:${id}`); + } + setEnvExists(id: string, exists: boolean): void { + this.#write(this.#envExists, `envExists:${id}`, exists); + } + + // full authenticated environment (toAuthenticated shape) + getAuthEnv(id: string): (ResolvedAuthenticatedEnv | null) | undefined { + const entry = this.#authEnv.get(`authEnv:${id}`); + if (entry === undefined || entry.epoch !== this.#epoch(`authEnv:${id}`)) { + return undefined; + } + if ( + entry.value !== null && + entry.value.organizationId && + entry.orgEpoch !== this.#orgEpoch(entry.value.organizationId) + ) { + return undefined; + } + return entry.value; + } + setAuthEnv(id: string, value: ResolvedAuthenticatedEnv | null): void { + this.#write(this.#authEnv, `authEnv:${id}`, value, value?.organizationId); + } + + /** + * Invalidate the env, authEnv and envExists slots for a single environment (the worker-version + * and locked-worker slots are not touched). Call this from a control-plane write that mutates + * one env's config (pause/resume, archive, concurrency/burst-factor). + */ + invalidateEnvironment(id: string): void { + this.#bump(`env:${id}`); + this.#bump(`authEnv:${id}`); + this.#bump(`envExists:${id}`); + } + + /** + * Invalidate every cached env/authEnv row belonging to an organization. Call this from a + * control-plane write that mutates org-level config (feature flags, org concurrency, runs + * enable/disable, rate limits) — it affects the org object embedded in each of the org's envs.
+ */ + invalidateOrganization(orgId: string): void { + this.#orgEpochs.set(orgId, this.#orgEpoch(orgId) + 1); + } + + // run-locked worker (lockedBy + lockedToVersion); key = `${lockedById ?? "_"}:${lockedToVersionId ?? "_"}` + getLockedWorker(key: string): (ResolvedRunLockedWorker | null) | undefined { + return this.#read(this.#lockedWorker, `lockedWorker:${key}`); + } + setLockedWorker(key: string, value: ResolvedRunLockedWorker | null): void { + this.#write(this.#lockedWorker, `lockedWorker:${key}`, value); + } +} diff --git a/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts new file mode 100644 index 00000000000..ce83c632abf --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/controlPlaneResolver.server.ts @@ -0,0 +1,463 @@ +import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic"; +import type { + PrismaClient, + PrismaReplicaClient, + RuntimeEnvironmentType, +} from "@trigger.dev/database"; +import { prisma, $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { + ControlPlaneCache, + DEFAULT_CP_CACHE_MAX_ENTRIES, + DEFAULT_CP_CACHE_TTL_MS, + type ResolvedAuthenticatedEnv, + type ResolvedEnv, + type ResolvedWorkerVersion, + type ResolvedRunLockedWorker, +} from "./controlPlaneCache.server"; +import { authIncludeWithParent, toAuthenticated } from "~/models/runtimeEnvironment.server"; + +/** + * App-level control-plane resolution + cache layer. Replaces the run-ops -> control-plane + * Prisma joins (env/project/org, the pinned/current worker version + its tasks/queues, the + * TaskQueue, the TaskSchedule friendlyId mapping) with cached lookups against the + * control-plane client, so the split (cross-DB) hot path avoids a cross-WAN round-trip per + * resolution. + * + * Split ON (cloud): cache-first reads against the control-plane replica; `null` is cached as + * a confirmed absence. 
Split OFF (self-host/local/CI): plain Prisma join against the single + * control-plane client on every call, NO cache — byte-identical to today's inline join. + * + * The split gate is a SYNCHRONOUS `splitEnabled: () => boolean` injected at construction; the + * resolver never awaits the async `isSplitEnabled()` (that gate is reserved for the boot + * sentinel). Tests inject testcontainer clients + a sync predicate; only the module-level + * singleton at the bottom reads from `db.server.ts` / `env.server.ts`. + * + * Scope boundary: this unit owns ONLY control-plane resolution (env, authenticated env, + * worker version, run-locked worker, env existence). The run-ops batchId friendlyId->id + * resolution belongs to the run-ops read path (the unit owning `runsRepository.server.ts`); + * do not duplicate it here. + */ + +// Type-only re-exports: all four names are imported with the `type` modifier above. +export type { ResolvedEnv, ResolvedWorkerVersion }; +export type { ResolvedAuthenticatedEnv, ResolvedRunLockedWorker }; + +/** Thrown by `assertEnvExists` when a referenced control-plane env does not exist. */ +export class ControlPlaneReferenceError extends Error { + constructor(message: string) { + super(message); + this.name = "ControlPlaneReferenceError"; + } +} + +export type ControlPlaneResolverOptions = { + controlPlanePrimary: PrismaClient; + controlPlaneReplica: PrismaReplicaClient; + cache: ControlPlaneCache; + splitEnabled: () => boolean; +}; + +type CpClient = PrismaClient | PrismaReplicaClient; + +function workerVersionKey( + environmentId: string, + backgroundWorkerId: string | undefined, + type: RuntimeEnvironmentType | undefined +): string { + return `${environmentId}:${backgroundWorkerId ?? "current"}:${type ?? "any"}`; +} + +function lockedWorkerKey(lockedById?: string | null, lockedToVersionId?: string | null): string { + return `${lockedById ?? "_"}:${lockedToVersionId ??
"_"}`; +} + +export class ControlPlaneResolver { + private readonly controlPlanePrimary: PrismaClient; + private readonly controlPlaneReplica: PrismaReplicaClient; + private readonly cache: ControlPlaneCache; + private readonly splitEnabled: () => boolean; + + constructor(opts: ControlPlaneResolverOptions) { + this.controlPlanePrimary = opts.controlPlanePrimary; + this.controlPlaneReplica = opts.controlPlaneReplica; + this.cache = opts.cache; + this.splitEnabled = opts.splitEnabled; + } + + async resolveEnv(environmentId: string): Promise { + if (!this.splitEnabled()) { + return this.#queryEnv(this.controlPlanePrimary, environmentId); + } + + const cached = this.cache.getEnv(environmentId); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryEnv(this.controlPlaneReplica, environmentId); + this.cache.setEnv(environmentId, resolved); + return resolved; + } + + async #queryEnv(client: CpClient, environmentId: string): Promise { + const env = await client.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + select: { + id: true, + type: true, + projectId: true, + archivedAt: true, + maximumConcurrencyLimit: true, + concurrencyLimitBurstFactor: true, + project: { select: { organizationId: true } }, + parentEnvironment: { select: { type: true } }, + }, + }); + + if (!env) { + return null; + } + + return { + id: env.id, + type: env.type, + projectId: env.projectId, + organizationId: env.project.organizationId, + archivedAt: env.archivedAt, + parentEnvironmentType: env.parentEnvironment?.type ?? 
null, + maximumConcurrencyLimit: env.maximumConcurrencyLimit, + concurrencyLimitBurstFactor: env.concurrencyLimitBurstFactor, + }; + } + + async resolveAuthenticatedEnv(environmentId: string): Promise { + if (!this.splitEnabled()) { + return this.#queryAuthenticatedEnv(this.controlPlanePrimary, environmentId); + } + + const cached = this.cache.getAuthEnv(environmentId); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryAuthenticatedEnv(this.controlPlaneReplica, environmentId); + this.cache.setAuthEnv(environmentId, resolved); + return resolved; + } + + async #queryAuthenticatedEnv( + client: CpClient, + environmentId: string + ): Promise { + const env = await client.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + include: authIncludeWithParent, + }); + + if (!env) { + return null; + } + + // `authIncludeWithParent` returns all RuntimeEnvironment scalars on the row (including + // `git`), so we map the auth shape via toAuthenticated() and add `git` from the same row. + return { ...toAuthenticated(env), git: env.git }; + } + + async resolveRunLockedWorker(args: { + lockedById?: string | null; + lockedToVersionId?: string | null; + }): Promise { + const { lockedById, lockedToVersionId } = args; + + if (!this.splitEnabled()) { + return this.#queryRunLockedWorker(this.controlPlanePrimary, lockedById, lockedToVersionId); + } + + const key = lockedWorkerKey(lockedById, lockedToVersionId); + const cached = this.cache.getLockedWorker(key); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryRunLockedWorker( + this.controlPlaneReplica, + lockedById, + lockedToVersionId + ); + this.cache.setLockedWorker(key, resolved); + return resolved; + } + + async #queryRunLockedWorker( + client: CpClient, + lockedById?: string | null, + lockedToVersionId?: string | null + ): Promise { + const lockedByRow = lockedById + ? 
await client.backgroundWorkerTask.findFirst({ + where: { id: lockedById }, + select: { + id: true, + filePath: true, + exportName: true, + slug: true, + machineConfig: true, + worker: { + select: { + id: true, + version: true, + sdkVersion: true, + cliVersion: true, + supportsLazyAttempts: true, + deployment: { + select: { + friendlyId: true, + shortCode: true, + version: true, + runtime: true, + runtimeVersion: true, + git: true, + }, + }, + }, + }, + }, + }) + : null; + + const lockedToVersionRow = lockedToVersionId + ? await client.backgroundWorker.findFirst({ + where: { id: lockedToVersionId }, + select: { + version: true, + sdkVersion: true, + runtime: true, + runtimeVersion: true, + supportsLazyAttempts: true, + }, + }) + : null; + + return { + lockedBy: lockedByRow, + lockedToVersion: lockedToVersionRow, + }; + } + + async resolveWorkerVersion(args: { + environmentId: string; + backgroundWorkerId?: string; + /** + * When provided, the full run-engine dequeue dispatch is used (DEV resolves the most-recent + * worker; deployed resolves the promoted MANAGED deployment with the latest-v2 fallback). + * When omitted, the original app behavior applies (worker-by-id, else current promotion). 
+ */ + type?: RuntimeEnvironmentType; + }): Promise { + const { environmentId, backgroundWorkerId, type } = args; + + if (!this.splitEnabled()) { + return this.#queryWorkerVersion( + this.controlPlanePrimary, + environmentId, + backgroundWorkerId, + type + ); + } + + const key = workerVersionKey(environmentId, backgroundWorkerId, type); + const cached = this.cache.getWorkerVersion(key); + if (cached !== undefined) { + return cached; + } + + const resolved = await this.#queryWorkerVersion( + this.controlPlaneReplica, + environmentId, + backgroundWorkerId, + type + ); + this.cache.setWorkerVersion(key, resolved); + return resolved; + } + + async #queryWorkerVersion( + client: CpClient, + environmentId: string, + backgroundWorkerId?: string, + type?: RuntimeEnvironmentType + ): Promise { + // Full run-engine dequeue dispatch (mirrors dequeueSystem's four helpers) when the env type is + // known. DEVELOPMENT envs resolve by most-recent worker; deployed envs resolve the promoted + // MANAGED deployment. + if (type === "DEVELOPMENT") { + return backgroundWorkerId + ? this.#queryWorkerById(client, backgroundWorkerId) + : this.#queryMostRecentWorker(client, environmentId); + } + + if (backgroundWorkerId) { + const worker = await client.backgroundWorker.findFirst({ + where: { id: backgroundWorkerId }, + include: { deployment: true, tasks: true, queues: true }, + }); + + if (!worker) { + return null; + } + + return { + worker, + tasks: worker.tasks, + queues: worker.queues, + deployment: worker.deployment, + }; + } + + // Deployed env, no workerId: resolve the currently-promoted deployment's worker. When `type` + // is known (engine dispatch) apply the MANAGED guard + latest-v2 fallback that the run-engine + // path requires; without `type` keep the original app behavior (return the promoted worker). 
+ const promotion = await client.workerDeploymentPromotion.findFirst({ + where: { environmentId, label: CURRENT_DEPLOYMENT_LABEL }, + include: { + deployment: { + include: { worker: { include: { tasks: true, queues: true } } }, + }, + }, + }); + + if (!promotion?.deployment.worker) { + return null; + } + + if (type === undefined || promotion.deployment.type === "MANAGED") { + return { + worker: promotion.deployment.worker, + tasks: promotion.deployment.worker.tasks, + queues: promotion.deployment.worker.queues, + deployment: promotion.deployment, + }; + } + + // Engine dispatch only: the promoted deployment is not run-engine v2; fall back to the latest + // MANAGED deployment. + const latestV2Deployment = await client.workerDeployment.findFirst({ + where: { environmentId, type: "MANAGED" }, + orderBy: { id: "desc" }, + include: { worker: { include: { tasks: true, queues: true } } }, + }); + + if (!latestV2Deployment?.worker) { + return null; + } + + return { + worker: latestV2Deployment.worker, + tasks: latestV2Deployment.worker.tasks, + queues: latestV2Deployment.worker.queues, + deployment: latestV2Deployment, + }; + } + + async #queryWorkerById( + client: CpClient, + workerId: string + ): Promise { + const worker = await client.backgroundWorker.findFirst({ + where: { id: workerId }, + include: { deployment: true, tasks: true, queues: true }, + orderBy: { id: "desc" }, + }); + + if (!worker) { + return null; + } + + return { worker, tasks: worker.tasks, queues: worker.queues, deployment: worker.deployment }; + } + + async #queryMostRecentWorker( + client: CpClient, + environmentId: string + ): Promise { + const worker = await client.backgroundWorker.findFirst({ + where: { runtimeEnvironmentId: environmentId }, + include: { tasks: true, queues: true }, + orderBy: { id: "desc" }, + }); + + if (!worker) { + return null; + } + + return { worker, tasks: worker.tasks, queues: worker.queues, deployment: null }; + } + + async assertEnvExists(environmentId: string): 
Promise { + if (!this.splitEnabled()) { + // Split OFF = single DB, so run and env are co-located and there is no FK/check + // to replace (matches main). Skip the hot-path read entirely. + return; + } + + const cached = this.cache.getEnvExists(environmentId); + if (cached !== undefined) { + if (!cached) { + throw new ControlPlaneReferenceError( + `Referenced environment does not exist: ${environmentId}` + ); + } + return; + } + + const exists = await this.#queryEnvExists(this.controlPlaneReplica, environmentId); + this.cache.setEnvExists(environmentId, exists); + if (!exists) { + throw new ControlPlaneReferenceError( + `Referenced environment does not exist: ${environmentId}` + ); + } + } + + async #queryEnvExists(client: CpClient, environmentId: string): Promise { + const env = await client.runtimeEnvironment.findFirst({ + where: { id: environmentId }, + select: { id: true }, + }); + return env !== null; + } + + /** + * Drop cached control-plane rows for one environment after a control-plane write to that + * env's config. A no-op when split is OFF (nothing is cached), so it is always safe to call. + */ + invalidateEnvironment(environmentId: string): void { + this.cache.invalidateEnvironment(environmentId); + } + + /** + * Drop cached env/authEnv rows for every environment of an organization after a + * control-plane write to that org's config. Safe under split OFF (no cache). + */ + invalidateOrganization(organizationId: string): void { + this.cache.invalidateOrganization(organizationId); + } +} + +// Module-level singleton: wires the real control-plane clients + env split predicate. +// The control-plane writer/replica are the unchanged `prisma` / `$replica` exports. The +// split decision is a boot constant derived once from the env predicate (same one the +// run-ops topology factory uses); the async isSplitEnabled() distinct-DB sentinel is enforced +// at boot elsewhere and is never awaited on a resolver hot path. 
+const SPLIT_ENABLED = + env.RUN_OPS_SPLIT_ENABLED && !!env.TASK_RUN_DATABASE_URL && !!env.TASK_RUN_LEGACY_DATABASE_URL; + +export const controlPlaneResolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma, + controlPlaneReplica: $replica, + // Relax the cache via config. Unset env knobs -> built-in defaults (byte-identical). + cache: new ControlPlaneCache({ + ttlMs: env.CONTROL_PLANE_CACHE_TTL_MS ?? DEFAULT_CP_CACHE_TTL_MS, + maxEntries: env.CONTROL_PLANE_CACHE_MAX_ENTRIES ?? DEFAULT_CP_CACHE_MAX_ENTRIES, + }), + splitEnabled: () => SPLIT_ENABLED, +}); diff --git a/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts b/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts new file mode 100644 index 00000000000..791a101e9e3 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/crossSeamGuard.server.ts @@ -0,0 +1,94 @@ +import { ownerEngine } from "@trigger.dev/core/v3/isomorphic"; +import { isSplitEnabled } from "./splitMode.server"; +import type { + CrossSeamGuardDecision, + CrossSeamGuardInput, + RunOpsResidency, + StoreTarget, + UnblockRouteKind, +} from "./types"; + +const KNOWN_ROUTE_KINDS: ReadonlySet = new Set([ + "MANUAL", + "DATETIME", + "RESUME_TOKEN", + "IDEMPOTENCY_REUSE", + "RUN", +]); + +// There is NO default store: an unrecognised route is a loud failure. +function assertKnownRouteKind(routeKind: UnblockRouteKind): void { + if (!KNOWN_ROUTE_KINDS.has(routeKind)) { + throw new Error(`Unknown unblock routeKind: ${JSON.stringify(routeKind)}`); + } +} + +function storeForResidency(residency: RunOpsResidency): StoreTarget { + return residency === "NEW" ? "new" : "legacy"; +} + +/** + * Pin precedence (deterministic, documented order): + * 1. non-tree-owned (treeOwnerResidency === "LEGACY") + * 2. cross-tree-idempotency (isCrossTreeIdempotency === true) + * 3. 
legacy-parent-descendant (hasLegacyParent === true) + * Any hit overrides the store to "legacy"; the waitpoint's own residency is + * preserved on the decision so callers/metrics can see "NEW pinned to legacy". + */ +function applyPinningRules( + input: CrossSeamGuardInput +): CrossSeamGuardDecision["pinnedReason"] | undefined { + if (input.treeOwnerResidency === "LEGACY") return "non-tree-owned"; + if (input.isCrossTreeIdempotency === true) return "cross-tree-idempotency"; + if (input.hasLegacyParent === true) return "legacy-parent-descendant"; + return undefined; +} + +/** + * Pure store-selection core. No env import, no I/O — driven exhaustively by the + * downstream proof harness via the optional `classify` seam. + */ +export function selectStoreForWaitpoint( + input: CrossSeamGuardInput, + deps?: { classify?: (id: string) => RunOpsResidency } +): CrossSeamGuardDecision { + assertKnownRouteKind(input.routeKind); + + const classify = deps?.classify ?? ownerEngine; + + // Loud on ambiguity: classify throws UnclassifiableRunId with the real id; never catch-and-default. + const residency: RunOpsResidency = classify(input.waitpointId); + + const pinnedReason = applyPinningRules(input); + const store: StoreTarget = pinnedReason ? "legacy" : storeForResidency(residency); + + return { + store, + residency, + routeKind: input.routeKind, + ...(pinnedReason ? { pinnedReason } : {}), + }; +} + +/** + * Pure flag-aware core. In single-DB mode "legacy" IS the single store, so we + * return it WITHOUT ever consulting the classifier (off in single-DB). When + * split is on, delegate to the pure selection core. 
+ */ +export function computeStoreForCompletion( + input: CrossSeamGuardInput, + opts: { splitEnabled: boolean; classify?: (id: string) => RunOpsResidency } +): CrossSeamGuardDecision { + if (opts.splitEnabled === false) { + return { store: "legacy", residency: "LEGACY", routeKind: input.routeKind }; + } + return selectStoreForWaitpoint(input, { classify: opts.classify }); +} + +/** Thin server entry the waitpoint-completion consumers call. */ +export async function pickRunOpsStoreForCompletion( + input: CrossSeamGuardInput +): Promise { + const splitEnabled = await isSplitEnabled(); + return computeStoreForCompletion(input, { splitEnabled }); +} diff --git a/apps/webapp/app/v3/runOpsMigration/distinctDbSentinel.server.ts b/apps/webapp/app/v3/runOpsMigration/distinctDbSentinel.server.ts new file mode 100644 index 00000000000..2c92178f82d --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/distinctDbSentinel.server.ts @@ -0,0 +1,55 @@ +import { PrismaClient } from "@trigger.dev/database"; + +type DatabaseFingerprint = { systemIdentifier: string; databaseName: string }; + +async function readDatabaseFingerprint(url: string): Promise { + const client = new PrismaClient({ datasources: { db: { url } } }); + try { + const rows = await client.$queryRawUnsafe< + Array<{ system_identifier: string; database_name: string }> + >( + "SELECT system_identifier::text AS system_identifier, current_database() AS database_name FROM pg_control_system()" + ); + const row = rows[0]; + if (!row) { + throw new Error("distinct-db sentinel: pg_control_system() returned no rows"); + } + return { systemIdentifier: row.system_identifier, databaseName: row.database_name }; + } finally { + await client.$disconnect(); + } +} + +export async function probeDistinctDatabases( + legacyUrl: string, + newUrl: string, + opts?: { logger?: { warn: (msg: string, meta?: Record) => void } } +): Promise<{ distinct: true } | { distinct: false; reason: string }> { + try { + const [legacy, next] = await 
Promise.all([ + readDatabaseFingerprint(legacyUrl), + readDatabaseFingerprint(newUrl), + ]); + const sameCluster = legacy.systemIdentifier === next.systemIdentifier; + const sameDb = sameCluster && legacy.databaseName === next.databaseName; + // Same-cluster-different-database policy: two databases inside the SAME cluster + // (same system identifier, different current_database()) are reported distinct: true. + // That is acceptable — they are genuinely separate Postgres databases (separate catalogs + // and tables, although they share the cluster's WAL), and the Cloud topology always uses separate + // clusters anyway. A stricter "must be a different cluster" policy would gate on + // sameCluster alone; that is flagged as an open question, not decided here. + if (sameDb) { + const reason = + "run-ops legacy and new URLs resolve to the SAME physical database " + + `(systemIdentifier=${legacy.systemIdentifier}, database=${legacy.databaseName}); ` + + "refusing to enable split — pooler/replica likely."; + opts?.logger?.warn(reason); + return { distinct: false, reason }; + } + return { distinct: true }; + } catch (error) { + const reason = `distinct-db sentinel probe failed; failing closed (single-DB). 
${String(error)}`; + opts?.logger?.warn(reason, { error }); + return { distinct: false, reason }; + } +} diff --git a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts new file mode 100644 index 00000000000..b552b4de736 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.test.ts @@ -0,0 +1,107 @@ +import { describe, expect, it, vi } from "vitest"; +import { batchIdForMintKind, resolveBatchMintKind } from "./mintBatchFriendlyId.server"; +import { classifyKind } from "@trigger.dev/core/v3/isomorphic"; + +describe("batchIdForMintKind (pure)", () => { + it("ksuid -> 27-char classifiable NEW batch id (no 21-char ids)", () => { + const r = batchIdForMintKind("ksuid"); + expect(r.friendlyId.startsWith("batch_")).toBe(true); + expect(r.id.length).toBe(27); + expect(classifyKind(r.id)).toBe("ksuid"); + expect(classifyKind(r.friendlyId)).toBe("ksuid"); + }); + + it("cuid -> 25-char classifiable LEGACY batch id", () => { + const r = batchIdForMintKind("cuid"); + expect(r.id.length).toBe(25); + expect(classifyKind(r.id)).toBe("cuid"); + expect(classifyKind(r.friendlyId)).toBe("cuid"); + }); + + it("never mints a 21-char id", () => { + for (const kind of ["cuid", "ksuid"] as const) { + expect([25, 27]).toContain(batchIdForMintKind(kind).id.length); + } + }); +}); + +describe("resolveBatchMintKind", () => { + const environment = { organizationId: "org_1", id: "env_1", orgFeatureFlags: {} }; + + it("ROOT batch (no parent) resolves per-org kind via resolveRunIdMintKind", async () => { + const resolveRunIdMintKind = vi.fn().mockResolvedValue("ksuid"); + const kind = await resolveBatchMintKind({ + environment, + deps: { resolveRunIdMintKind }, + }); + expect(kind).toBe("ksuid"); + expect(resolveRunIdMintKind).toHaveBeenCalledWith({ + organizationId: "org_1", + id: "env_1", + orgFeatureFlags: {}, + }); + }); + + it("ROOT batch on a non-cut-over org -> cuid", async 
() => { + const resolveRunIdMintKind = vi.fn().mockResolvedValue("cuid"); + const kind = await resolveBatchMintKind({ + environment, + deps: { resolveRunIdMintKind }, + }); + expect(kind).toBe("cuid"); + }); + + it("CHILD batch inherits a ksuid (NEW) parent by id-shape", async () => { + const parentRunFriendlyId = `run_${"a".repeat(27)}`; + const resolveRunIdMintKind = vi.fn(); + + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { resolveRunIdMintKind }, + }); + + expect(kind).toBe("ksuid"); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); + }); + + it("CHILD batch inherits a cuid (LEGACY) parent by id-shape", async () => { + const parentRunFriendlyId = `run_${"a".repeat(25)}`; + const resolveRunIdMintKind = vi.fn(); + + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { resolveRunIdMintKind }, + }); + + expect(kind).toBe("cuid"); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); + }); + + // mint-on-FLIP invariant: a child follows its parent's store even after the org flag + // flips the other way. The flag resolver must NEVER be consulted for a child. 
+ it("FLIP cuid->ksuid: a cuid (LEGACY) parent still mints a cuid child though the flag now says ksuid", async () => { + const parentRunFriendlyId = `run_${"a".repeat(25)}`; + const resolveRunIdMintKind = vi.fn().mockResolvedValue("ksuid"); // flag flipped to ksuid + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { resolveRunIdMintKind }, + }); + expect(kind).toBe("cuid"); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); + }); + + it("FLIP ksuid->cuid: a ksuid (NEW) parent still mints a ksuid child though the flag now says cuid", async () => { + const parentRunFriendlyId = `run_${"a".repeat(27)}`; + const resolveRunIdMintKind = vi.fn().mockResolvedValue("cuid"); // flag flipped back to cuid + const kind = await resolveBatchMintKind({ + environment, + parentRunFriendlyId, + deps: { resolveRunIdMintKind }, + }); + expect(kind).toBe("ksuid"); + expect(resolveRunIdMintKind).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts new file mode 100644 index 00000000000..0503fc5b2c8 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/mintBatchFriendlyId.server.ts @@ -0,0 +1,45 @@ +import { BatchId, generateKsuidId } from "@trigger.dev/core/v3/isomorphic"; +import { + resolveRunIdMintKind as defaultResolveRunIdMintKind, + type RunIdMintKind, +} from "~/v3/engineVersion.server"; +import { resolveInheritedMintKind } from "~/v3/runOpsMigration/resolveInheritedMintKind.server"; + +type ResolveDeps = { + resolveRunIdMintKind: typeof defaultResolveRunIdMintKind; +}; + +const defaultDeps: ResolveDeps = { + resolveRunIdMintKind: defaultResolveRunIdMintKind, +}; + +export function batchIdForMintKind(kind: RunIdMintKind): { id: string; friendlyId: string } { + if (kind === "ksuid") { + const id = generateKsuidId(); + return { id, friendlyId: BatchId.toFriendlyId(id) }; + } + return BatchId.generate(); +} + 
+export async function resolveBatchMintKind(args: { + environment: { organizationId: string; id: string; orgFeatureFlags?: unknown }; + parentRunFriendlyId?: string; + deps?: Partial; +}): Promise { + const deps = { ...defaultDeps, ...args.deps }; + return args.parentRunFriendlyId + ? resolveInheritedMintKind(args.parentRunFriendlyId) + : deps.resolveRunIdMintKind({ + organizationId: args.environment.organizationId, + id: args.environment.id, + orgFeatureFlags: args.environment.orgFeatureFlags, + }); +} + +export async function mintBatchFriendlyId(args: { + environment: { organizationId: string; id: string; orgFeatureFlags?: unknown }; + parentRunFriendlyId?: string; + deps?: Partial; +}): Promise<{ id: string; friendlyId: string }> { + return batchIdForMintKind(await resolveBatchMintKind(args)); +} diff --git a/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts new file mode 100644 index 00000000000..1fe52189c83 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/readThrough.server.test.ts @@ -0,0 +1,153 @@ +// Real legacy-replica + new-DB proof for the read-through layer. +// We NEVER mock the DB: the reads run as real `$queryRaw` against the two containers, +// crossing the actual legacy↔new boundary the split relies on. The only injected +// fakes are the pure boundaries — `isPastRetention`, `splitEnabled` — plus throwing +// spies used to assert a store was NEVER touched. +import { heteroPostgresTest } from "@internal/testcontainers"; +import { describe, expect, vi } from "vitest"; +import type { PrismaReplicaClient } from "~/db.server"; +import { readThroughRun, type ReadThroughResult } from "./readThrough.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +// 25-char cuid body → LEGACY residency. 27-char body → NEW residency. 
+const LEGACY_RUN_ID = "run_" + "a".repeat(25); +const NEW_RUN_ID = "run_" + "b".repeat(27); + +// Lightweight real read: a trivial `$queryRaw` that genuinely hits the given container. +// `hit` controls whether the read "finds" the run, so we exercise routing without +// seeding a full TaskRun (many required FKs) — the routing DoD is store-order, not shape. +async function realRead( + client: PrismaReplicaClient, + hit: boolean +): Promise<{ marker: number } | null> { + const rows = await client.$queryRaw<{ marker: number }[]>`SELECT 1 AS marker`; + return hit ? (rows[0] ?? null) : null; +} + +// A presenter-shaped mapping: both "not-found" and "past-retention" collapse to the +// same 404-ish surface, so an old run after termination yields the normal response. +function toHttpish(result: ReadThroughResult): { status: number; value?: T } { + switch (result.source) { + case "new": + case "legacy-replica": + return { status: 200, value: result.value }; + case "not-found": + case "past-retention": + return { status: 404 }; + } +} + +describe("readThroughRun (legacy replica + new DB)", () => { + heteroPostgresTest( + "old in-retention run is served from the legacy REPLICA, never a primary", + async ({ prisma14, prisma17 }) => { + // legacy hit, new miss. The layer has NO legacy-writer handle at all — the + // read resolving through `legacyReplica` (prisma14) IS the structural guarantee + // that the primary is never touched. 
+ const result = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, false), + readLegacy: (c) => realRead(c, true), + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + }, + }); + + expect(result.source).toBe("legacy-replica"); + expect(toHttpish(result).status).toBe(200); + } + ); + + heteroPostgresTest( + "post-termination past-retention returns the normal not-found surface", + async ({ prisma14, prisma17 }) => { + const pastRetentionResult = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, false), + readLegacy: (c) => realRead(c, false), // legacy gone / retention elapsed + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isPastRetention: () => true, + }, + }); + + expect(pastRetentionResult.source).toBe("past-retention"); + + // A run that is simply absent (not past retention) yields not-found. + const notFoundResult = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, false), + readLegacy: (c) => realRead(c, false), + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + isPastRetention: () => false, + }, + }); + + expect(notFoundResult.source).toBe("not-found"); + // Both collapse to the same 404-ish surface. 
+ expect(toHttpish(pastRetentionResult).status).toBe(toHttpish(notFoundResult).status); + expect(toHttpish(pastRetentionResult).status).toBe(404); + } + ); + + heteroPostgresTest( + "single-DB passthrough — only readNew runs, legacy never touched", + async ({ prisma14, prisma17 }) => { + const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { + throw new Error("readLegacy must never run in single-DB mode"); + }); + const newRead = vi.fn((c: PrismaReplicaClient) => realRead(c, true)); + + const result = await readThroughRun({ + runId: LEGACY_RUN_ID, + environmentId: "env_1", + readNew: newRead, + readLegacy: throwingLegacy, + deps: { + splitEnabled: false, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + }, + }); + + expect(result.source).toBe("new"); + expect(newRead).toHaveBeenCalledTimes(1); + expect(throwingLegacy).not.toHaveBeenCalled(); + } + ); + + heteroPostgresTest( + "new-residency fast-path — legacy replica is never touched", + async ({ prisma14, prisma17 }) => { + const throwingLegacy = vi.fn(async (): Promise<{ marker: number } | null> => { + throw new Error("readLegacy must never run for a NEW-residency id"); + }); + + const result = await readThroughRun({ + runId: NEW_RUN_ID, + environmentId: "env_1", + readNew: (c) => realRead(c, true), + readLegacy: throwingLegacy, + deps: { + splitEnabled: true, + newClient: prisma17 as unknown as PrismaReplicaClient, + legacyReplica: prisma14 as unknown as PrismaReplicaClient, + }, + }); + + expect(result.source).toBe("new"); + expect(throwingLegacy).not.toHaveBeenCalled(); + } + ); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts b/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts new file mode 100644 index 00000000000..8b83b70d78f --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/readThrough.server.ts @@ -0,0 +1,106 @@ +/** + * Read-through reads the LEGACY RUN-OPS READ REPLICA ONLY 
— never the legacy primary + * (which carries the read load we are shedding). Disabled entirely when isSplitEnabled() + * is false (single-DB passthrough). + * + * During the retention window, old run-ops rows are served off the legacy read replica. + * Residency is decided purely by id-shape: a ksuid (NEW) id reads new only; a cuid + * (LEGACY) id probes new first, then the legacy replica. An unclassifiable id is treated + * as LEGACY. After termination, past-retention runs return the normal not-found response. + * Patterned on `mollifier/resolveRunForMutation.server.ts` (`?? default` DI), but with + * the legacy-primary/writer fallback deliberately removed: this layer has NO legacy-writer + * handle at all (structural guarantee). + */ +import type { PrismaReplicaClient } from "~/db.server"; +import { + runOpsLegacyReplica as defaultLegacyReplica, + runOpsNewReplica as defaultNewClient, +} from "~/db.server"; +import { logger as defaultLogger } from "~/services/logger.server"; +import { ownerEngine, UnclassifiableRunId } from "@trigger.dev/core/v3/isomorphic"; +import { isSplitEnabled } from "./splitMode.server"; + +export type ReadThroughSource = "new" | "legacy-replica"; + +export type ReadThroughResult = + | { source: ReadThroughSource; value: T } + | { source: "not-found" } + | { source: "past-retention" }; + +export type ReadThroughDeps = { + newClient?: PrismaReplicaClient; + legacyReplica?: PrismaReplicaClient; + /** Resolved boot constant; never `await`ed per-request when supplied. */ + splitEnabled?: boolean; + isPastRetention?: (runId: string) => boolean; + logger?: { warn: (m: string, meta?: unknown) => void }; + /** Saturation-signal emit hook: called on each legacy-replica hit. 
*/ + onLegacyReplicaRead?: (runId: string) => void; +}; + +type ReadThroughRunInput = { + runId: string; + environmentId: string; + readNew: (client: PrismaReplicaClient) => Promise; + readLegacy: (replica: PrismaReplicaClient) => Promise; + deps?: ReadThroughDeps; +}; + +export async function readThroughRun( + input: ReadThroughRunInput +): Promise> { + const { runId, deps } = input; + const newClient = deps?.newClient ?? defaultNewClient; + const legacyReplica = deps?.legacyReplica ?? defaultLegacyReplica; + const logger = deps?.logger ?? defaultLogger; + + const splitEnabled = deps?.splitEnabled ?? (await isSplitEnabled()); + + // Passthrough: single plain read against the one collapsed store. No legacy read, + // no second connection. + if (!splitEnabled) { + const v = await input.readNew(newClient); + return v != null ? { source: "new", value: v } : { source: "not-found" }; + } + + // Split is on. Classify residency; an unclassifiable id is treated as LEGACY + // (conservative — probe rather than drop a real run). + let residency: "LEGACY" | "NEW"; + try { + residency = ownerEngine(runId); + } catch (e) { + if (e instanceof UnclassifiableRunId) { + logger.warn("readThroughRun: UnclassifiableRunId, treating as LEGACY", { + runId, + valueLength: e.valueLength, + }); + residency = "LEGACY"; + } else { + throw e; + } + } + + // A ksuid id can only live on the new DB — skip the legacy replica entirely. + if (residency === "NEW") { + const v = await input.readNew(newClient); + return v != null ? { source: "new", value: v } : { source: "not-found" }; + } + + // LEGACY (or unclassifiable→LEGACY) fan-out: new first. + const v = await input.readNew(newClient); + if (v != null) { + return { source: "new", value: v }; + } + + // Legacy READ REPLICA only — never a legacy writer/primary (no such handle exists). 
+ const lv = await input.readLegacy(legacyReplica); + if (lv != null) { + deps?.onLegacyReplicaRead?.(runId); + return { source: "legacy-replica", value: lv }; + } + + if (deps?.isPastRetention?.(runId)) { + return { source: "past-retention" }; + } + return { source: "not-found" }; +} diff --git a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts new file mode 100644 index 00000000000..74baee1bb6e --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.test.ts @@ -0,0 +1,15 @@ +import { describe, expect, it } from "vitest"; +import { resolveInheritedMintKind } from "./resolveInheritedMintKind.server"; + +const NEW_PARENT = `run_${"a".repeat(27)}`; // ksuid id-shape -> NEW +const LEGACY_PARENT = `run_${"b".repeat(25)}`; // cuid id-shape -> LEGACY + +describe("resolveInheritedMintKind (pure id-shape, shared across all mint paths)", () => { + it("inherits a ksuid (NEW) parent by id-shape -> ksuid", () => { + expect(resolveInheritedMintKind(NEW_PARENT)).toBe("ksuid"); + }); + + it("inherits a cuid (LEGACY) parent by id-shape -> cuid", () => { + expect(resolveInheritedMintKind(LEGACY_PARENT)).toBe("cuid"); + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts new file mode 100644 index 00000000000..e43c3a8e33c --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/resolveInheritedMintKind.server.ts @@ -0,0 +1,10 @@ +import { ownerEngine } from "@trigger.dev/core/v3/isomorphic"; +import type { RunIdMintKind } from "./runOpsMintKind.server"; + +// Mint a child in the SAME physical store as its anchor (parent run / owning batch), +// regardless of the org's current mint flag — keeps a subgraph co-resident across a +// flip. 
With no migration/drain, residency is a pure id-shape check (zero hot-path +// I/O): a ksuid (NEW) parent mints ksuid children, a cuid (LEGACY) parent mints cuid. +export function resolveInheritedMintKind(parentRunFriendlyId: string): RunIdMintKind { + return ownerEngine(parentRunFriendlyId) === "NEW" ? "ksuid" : "cuid"; +} diff --git a/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts new file mode 100644 index 00000000000..014e446464f --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runEngineControlPlaneResolver.server.ts @@ -0,0 +1,81 @@ +import type { + ControlPlaneResolver as EngineControlPlaneResolver, + ResolvedAuthenticatedEnv, + ResolvedEngineEnv, + ResolvedWorkerVersion, +} from "@internal/run-engine"; +import type { RuntimeEnvironmentType } from "@trigger.dev/database"; +import type { ControlPlaneResolver as AppControlPlaneResolver } from "./controlPlaneResolver.server"; +import { controlPlaneResolver } from "./controlPlaneResolver.server"; + +/** + * Adapter that presents the webapp's cross-DB cached ControlPlaneResolver as the + * run-engine `ControlPlaneResolver` seam. Injected in `runEngine.server.ts`, it replaces the + * default `PassthroughControlPlaneResolver` so the engine's dequeue/waitpoint/checkpoint/delayTTL + * reads resolve the control-plane half cache-first instead of via an in-DB join. + * + * `resolveEnv` maps the app `ResolvedEnv` (widened to carry the concurrency + nested ids the engine + * needs) onto `ResolvedEngineEnv`. `resolveWorkerVersion` forwards the env `type` so the app + * resolver runs the full run-engine dequeue dispatch (DEV most-recent / MANAGED promotion). 
+ */ +export class RunEngineControlPlaneResolver implements EngineControlPlaneResolver { + readonly #resolver: AppControlPlaneResolver; + + constructor(resolver: AppControlPlaneResolver) { + this.#resolver = resolver; + } + + async resolveEnv(environmentId: string): Promise { + const env = await this.#resolver.resolveEnv(environmentId); + + if (!env) { + return null; + } + + return { + id: env.id, + type: env.type, + archivedAt: env.archivedAt, + maximumConcurrencyLimit: env.maximumConcurrencyLimit, + concurrencyLimitBurstFactor: env.concurrencyLimitBurstFactor, + projectId: env.projectId, + organizationId: env.organizationId, + project: { id: env.projectId }, + organization: { id: env.organizationId }, + }; + } + + async resolveWorkerVersion(args: { + environmentId: string; + type: RuntimeEnvironmentType; + workerId?: string; + }): Promise { + return this.#resolver.resolveWorkerVersion({ + environmentId: args.environmentId, + backgroundWorkerId: args.workerId, + type: args.type, + }); + } + + async resolveAuthenticatedEnv(environmentId: string): Promise { + // Delegate to the cache-first, split-aware app resolver (like resolveEnv/resolveWorkerVersion): + // its authenticated-env slot now carries `git`. Keep the deleted-project guard the engine relies + // on — a deleted project's env must not resolve. + const environment = await this.#resolver.resolveAuthenticatedEnv(environmentId); + + if (!environment || environment.project.deletedAt !== null) { + return null; + } + + return environment; + } + + async assertEnvExists(environmentId: string): Promise { + await this.#resolver.assertEnvExists(environmentId); + } +} + +// Module-level singleton over the app resolver singleton. 
+export const runEngineControlPlaneResolver = new RunEngineControlPlaneResolver( + controlPlaneResolver +); diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsCascadeCleanup.server.ts b/apps/webapp/app/v3/runOpsMigration/runOpsCascadeCleanup.server.ts new file mode 100644 index 00000000000..2392d516180 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsCascadeCleanup.server.ts @@ -0,0 +1,275 @@ +import { type PrismaClient } from "@trigger.dev/database"; +import { type RunOpsPrismaClient } from "@internal/run-ops-database"; +import { runOpsLegacyPrisma, runOpsNewPrismaClient } from "~/db.server"; + +/** + * Structural client covering exactly the run-subgraph delegates + WHERE filters the cascade uses on + * a run-ops writer. Both `@trigger.dev/database`'s `PrismaClient` (full schema, legacy writer) and + * `@internal/run-ops-database`'s `RunOpsPrismaClient` (dedicated SUBSET schema, new writer) are + * assignable to it — the two concrete clients are NOT mutually assignable (the subset adds FK-free + * join models the full schema lacks), so a shared structural type is the only common ground. + * + * Crucially it does NOT expose control-plane-resident models (e.g. `bulkActionItem`) nor scalarized + * relations that don't exist on the subset (e.g. `TaskRunWaitpoint.taskRun`), so the compiler now + * rejects the two bugs an `as unknown as PrismaClient` cast would otherwise mask. 
type CountResult = { count: number };

/**
 * Structural view of a run-ops writer: only the run-subgraph delegates and the `where` shapes the
 * cascade issues. It does not expose control-plane-only delegates such as `bulkActionItem`, so
 * code typed against it cannot reach them.
 */
type RunSubgraphCleanupClient = {
  taskRun: {
    findMany(args: {
      where: { runtimeEnvironmentId: string };
      select: { id: true };
    }): Promise<Array<{ id: string }>>;
    deleteMany(args: {
      where: { runtimeEnvironmentId: string } | { projectId: string };
    }): Promise<CountResult>;
  };
  taskRunAttempt: {
    deleteMany(args: {
      where: { runtimeEnvironmentId: string } | { taskRun: { projectId: string } };
    }): Promise<CountResult>;
  };
  taskRunWaitpoint: {
    deleteMany(args: {
      where: { taskRunId: { in: string[] } } | { projectId: string };
    }): Promise<CountResult>;
  };
  taskRunCheckpoint: {
    deleteMany(args: {
      where: { runtimeEnvironmentId: string } | { projectId: string };
    }): Promise<CountResult>;
  };
  checkpoint: {
    deleteMany(args: {
      where: { runtimeEnvironmentId: string } | { projectId: string };
    }): Promise<CountResult>;
  };
  checkpointRestoreEvent: {
    deleteMany(args: {
      where: { runtimeEnvironmentId: string } | { projectId: string };
    }): Promise<CountResult>;
  };
  waitpoint: {
    deleteMany(args: {
      where: { environmentId: string } | { projectId: string };
    }): Promise<CountResult>;
  };
  batchTaskRun: {
    deleteMany(args: {
      where: { runtimeEnvironmentId: string } | { runs: { some: { projectId: string } } };
    }): Promise<CountResult>;
  };
};

// Compile-time assertion that both concrete writers satisfy the structural shape.
const _newWriterAssignable: RunSubgraphCleanupClient = undefined as unknown as RunOpsPrismaClient;
const _legacyWriterAssignable: RunSubgraphCleanupClient = undefined as unknown as PrismaClient;
void _newWriterAssignable;
void _legacyWriterAssignable;

/** Per-table deleted row counts, summed across the distinct run-ops writers actually run. */
type CascadeCleanupResult = Record<string, number>;

/**
 * Upper bound on the number of run ids sent in one `taskRunId IN (...)` delete. An environment can
 * own far more runs than a single statement can reasonably carry as bind parameters, so the id
 * list is split into slices of this size and the per-slice counts are summed.
 */
const TASK_RUN_ID_DELETE_CHUNK_SIZE = 10_000;

type CleanupServiceDeps = {
  /**
   * Run-ops write clients to run the run-subgraph delete pass against. Defaults to the two
   * run-ops writers (`runOpsNewPrismaClient`, `runOpsLegacyPrisma`). De-duplicated by reference,
   * so passing the same client twice runs the pass once.
   */
  runOpsWriters?: RunSubgraphCleanupClient[];
  /**
   * Writer used for `BulkActionItem`, which is cleaned outside the per-writer pass and exactly
   * once per cleanup call. Defaults to `runOpsLegacyPrisma`.
   */
  controlPlaneWriter?: PrismaClient;
};

/**
 * Application-level cascade cleanup for an environment or project delete.
 *
 * `BulkActionItem` rows are removed once through the control-plane writer; the run-subgraph tables
 * are then removed on every distinct run-ops writer, children before parents. Every step is a
 * `deleteMany`, so a scope with no rows simply contributes `0`. The steps are awaited one after
 * another and are not wrapped in a transaction; a failed call can be recovered by calling again.
 */
export class RunOpsCascadeCleanupService {
  #writers: RunSubgraphCleanupClient[];
  #controlPlaneWriter: PrismaClient;

  constructor(deps: CleanupServiceDeps = {}) {
    const writers = deps.runOpsWriters ?? [runOpsNewPrismaClient, runOpsLegacyPrisma];
    // Set preserves insertion order and drops reference-equal duplicates.
    this.#writers = Array.from(new Set(writers));
    this.#controlPlaneWriter = deps.controlPlaneWriter ?? runOpsLegacyPrisma;
  }

  /**
   * Delete all run-ops rows scoped to one environment, across every distinct run-ops writer.
   *
   * @param runtimeEnvironmentId - Id of the environment being deleted.
   * @returns Deleted row counts keyed by table name, summed across writers.
   */
  public async cleanupEnvironment(runtimeEnvironmentId: string): Promise<CascadeCleanupResult> {
    const result: CascadeCleanupResult = {};
    await this.#cleanupBulkActionItemsForEnvironment(runtimeEnvironmentId, result);
    for (const writer of this.#writers) {
      await this.#cleanupEnvironmentOnWriter(writer, runtimeEnvironmentId, result);
    }
    return result;
  }

  /**
   * Delete all run-ops rows scoped to one project, across every distinct run-ops writer.
   *
   * @param projectId - Id of the project being deleted.
   * @returns Deleted row counts keyed by table name, summed across writers.
   */
  public async cleanupProject(projectId: string): Promise<CascadeCleanupResult> {
    const result: CascadeCleanupResult = {};
    await this.#cleanupBulkActionItemsForProject(projectId, result);
    for (const writer of this.#writers) {
      await this.#cleanupProjectOnWriter(writer, projectId, result);
    }
    return result;
  }

  // BulkActionItem has no env column; clean via both run relations (source and destination run
  // are filtered separately). Runs only on the control-plane writer.
  async #cleanupBulkActionItemsForEnvironment(
    runtimeEnvironmentId: string,
    result: CascadeCleanupResult
  ): Promise<void> {
    await this.#accumulate(result, "bulkActionItem", async () => {
      const bySource = await this.#controlPlaneWriter.bulkActionItem.deleteMany({
        where: { sourceRun: { runtimeEnvironmentId } },
      });
      const byDestination = await this.#controlPlaneWriter.bulkActionItem.deleteMany({
        where: { destinationRun: { runtimeEnvironmentId } },
      });
      return bySource.count + byDestination.count;
    });
  }

  // BulkActionItem has no projectId column; clean via both run relations.
  async #cleanupBulkActionItemsForProject(
    projectId: string,
    result: CascadeCleanupResult
  ): Promise<void> {
    await this.#accumulate(result, "bulkActionItem", async () => {
      const bySource = await this.#controlPlaneWriter.bulkActionItem.deleteMany({
        where: { sourceRun: { projectId } },
      });
      const byDestination = await this.#controlPlaneWriter.bulkActionItem.deleteMany({
        where: { destinationRun: { projectId } },
      });
      return bySource.count + byDestination.count;
    });
  }

  // Child-before-parent ordering: TaskRun is deleted last, after every table that points at it.
  async #cleanupEnvironmentOnWriter(
    writer: RunSubgraphCleanupClient,
    runtimeEnvironmentId: string,
    result: CascadeCleanupResult
  ): Promise<void> {
    await this.#accumulate(
      result,
      "checkpointRestoreEvent",
      async () =>
        (await writer.checkpointRestoreEvent.deleteMany({ where: { runtimeEnvironmentId } })).count
    );
    await this.#accumulate(
      result,
      "checkpoint",
      async () => (await writer.checkpoint.deleteMany({ where: { runtimeEnvironmentId } })).count
    );
    await this.#accumulate(
      result,
      "taskRunCheckpoint",
      async () =>
        (await writer.taskRunCheckpoint.deleteMany({ where: { runtimeEnvironmentId } })).count
    );
    // TaskRunWaitpoint is filtered by the scalar `taskRunId`, so the scoped run ids are resolved
    // first. The ids are deleted in bounded slices rather than one unbounded IN list; an empty id
    // list issues no delete and contributes 0.
    await this.#accumulate(result, "taskRunWaitpoint", async () => {
      const runs = await writer.taskRun.findMany({
        where: { runtimeEnvironmentId },
        select: { id: true },
      });
      const runIds = runs.map((run) => run.id);
      let deleted = 0;
      for (let start = 0; start < runIds.length; start += TASK_RUN_ID_DELETE_CHUNK_SIZE) {
        const slice = runIds.slice(start, start + TASK_RUN_ID_DELETE_CHUNK_SIZE);
        const removed = await writer.taskRunWaitpoint.deleteMany({
          where: { taskRunId: { in: slice } },
        });
        deleted += removed.count;
      }
      return deleted;
    });
    // Waitpoint's env column is `environmentId`, NOT `runtimeEnvironmentId`.
    await this.#accumulate(
      result,
      "waitpoint",
      async () =>
        (await writer.waitpoint.deleteMany({ where: { environmentId: runtimeEnvironmentId } }))
          .count
    );
    await this.#accumulate(
      result,
      "taskRunAttempt",
      async () =>
        (await writer.taskRunAttempt.deleteMany({ where: { runtimeEnvironmentId } })).count
    );
    await this.#accumulate(
      result,
      "batchTaskRun",
      async () => (await writer.batchTaskRun.deleteMany({ where: { runtimeEnvironmentId } })).count
    );
    await this.#accumulate(
      result,
      "taskRun",
      async () => (await writer.taskRun.deleteMany({ where: { runtimeEnvironmentId } })).count
    );
  }

  // Same child-before-parent order as the environment pass, scoped by project.
  async #cleanupProjectOnWriter(
    writer: RunSubgraphCleanupClient,
    projectId: string,
    result: CascadeCleanupResult
  ): Promise<void> {
    await this.#accumulate(
      result,
      "checkpointRestoreEvent",
      async () => (await writer.checkpointRestoreEvent.deleteMany({ where: { projectId } })).count
    );
    await this.#accumulate(
      result,
      "checkpoint",
      async () => (await writer.checkpoint.deleteMany({ where: { projectId } })).count
    );
    await this.#accumulate(
      result,
      "taskRunCheckpoint",
      async () => (await writer.taskRunCheckpoint.deleteMany({ where: { projectId } })).count
    );
    await this.#accumulate(
      result,
      "taskRunWaitpoint",
      async () => (await writer.taskRunWaitpoint.deleteMany({ where: { projectId } })).count
    );
    await this.#accumulate(
      result,
      "waitpoint",
      async () => (await writer.waitpoint.deleteMany({ where: { projectId } })).count
    );
    // TaskRunAttempt has no projectId column; clean via its TaskRun relation.
    await this.#accumulate(
      result,
      "taskRunAttempt",
      async () =>
        (await writer.taskRunAttempt.deleteMany({ where: { taskRun: { projectId } } })).count
    );
    // BatchTaskRun has no projectId column; clean via its TaskRun (`runs`) members.
    await this.#accumulate(
      result,
      "batchTaskRun",
      async () =>
        (await writer.batchTaskRun.deleteMany({ where: { runs: { some: { projectId } } } })).count
    );
    await this.#accumulate(
      result,
      "taskRun",
      async () => (await writer.taskRun.deleteMany({ where: { projectId } })).count
    );
  }

  // Runs one delete step and adds its row count to the running total for `table`.
  async #accumulate(
    result: CascadeCleanupResult,
    table: string,
    run: () => Promise<number>
  ): Promise<void> {
    const count = await run();
    result[table] = (result[table] ?? 0) + count;
  }
}
+function makeCachedFlag( + cache: BoundedTtlCache, + liveFlag: () => RunIdMintKind +): (orgId: string) => Promise { + return async (orgId: string) => { + const cached = cache.get(orgId); + if (cached !== undefined) return cached; + const kind = liveFlag(); + cache.set(orgId, kind); + return kind; + }; +} + +const TTL_MS = 30_000; +const env = { organizationId: "org_flip", id: "env_flip" }; + +describe("computeRunIdMintKind flip latency (mintCache TTL window — current behavior LOCK)", () => { + beforeEach(() => vi.useFakeTimers()); + afterEach(() => vi.useRealTimers()); + + it("returns the STALE cached kind within the TTL after the flag flips cuid->ksuid", async () => { + const cache = new BoundedTtlCache(TTL_MS, 100); + let live: RunIdMintKind = "cuid"; + const flag = makeCachedFlag(cache, () => live); + const deps = { masterEnabled: true, splitEnabled: async () => true, flag }; + + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); // populates the cache + + live = "ksuid"; // admin flips the org flag + vi.advanceTimersByTime(TTL_MS - 1); // still inside the window + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); // STALE, as designed + }); + + it("returns the FRESH kind once the TTL expires after a cuid->ksuid flip", async () => { + const cache = new BoundedTtlCache(TTL_MS, 100); + let live: RunIdMintKind = "cuid"; + const flag = makeCachedFlag(cache, () => live); + const deps = { masterEnabled: true, splitEnabled: async () => true, flag }; + + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); + + live = "ksuid"; + vi.advanceTimersByTime(TTL_MS + 1); // past expiry -> entry evicted on read + expect(await computeRunIdMintKind(env, deps)).toBe("ksuid"); // re-reads the live flag + }); + + it("symmetric flip-back ksuid->cuid is also stale within TTL, fresh after", async () => { + const cache = new BoundedTtlCache(TTL_MS, 100); + let live: RunIdMintKind = "ksuid"; + const flag = makeCachedFlag(cache, () => live); + const deps = { 
masterEnabled: true, splitEnabled: async () => true, flag }; + + expect(await computeRunIdMintKind(env, deps)).toBe("ksuid"); + + live = "cuid"; + vi.advanceTimersByTime(TTL_MS - 1); + expect(await computeRunIdMintKind(env, deps)).toBe("ksuid"); // STALE + + vi.advanceTimersByTime(2); // now past expiry + expect(await computeRunIdMintKind(env, deps)).toBe("cuid"); // FRESH + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.test.ts b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.test.ts new file mode 100644 index 00000000000..9d2e575fef8 --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.test.ts @@ -0,0 +1,61 @@ +import { describe, expect, it, vi } from "vitest"; +import { computeRunIdMintKind } from "./runOpsMintKind.server"; + +describe("computeRunIdMintKind (pure)", () => { + it("mints cuid when the master switch is off (never reads the flag)", async () => { + const flag = vi.fn(); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: false, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("cuid"); + expect(flag).not.toHaveBeenCalled(); + }); + + it("mints cuid when split is OFF, even if master + per-org flag say ksuid", async () => { + const flag = vi.fn().mockResolvedValue("ksuid"); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: true, splitEnabled: async () => false, flag } + ); + expect(kind).toBe("cuid"); // the split-enabled gate dominates + expect(flag).not.toHaveBeenCalled(); // split-off short-circuits before any flag read + }); + + it("mints ksuid only when master on AND split on AND per-org flag = ksuid", async () => { + const flag = vi.fn().mockResolvedValue("ksuid"); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("ksuid"); + }); + + 
it("passes the already-loaded org feature flags through to the flag fn (no extra DB read)", async () => { + const flag = vi.fn().mockResolvedValue("ksuid"); + const orgFeatureFlags = { runOpsMintKsuid: "ksuid" }; + await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1", orgFeatureFlags }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(flag).toHaveBeenCalledWith("org_1", orgFeatureFlags); + }); + + it("mints cuid for a non-canary org (per-org flag defaults to cuid)", async () => { + const flag = vi.fn().mockResolvedValue("cuid"); + const kind = await computeRunIdMintKind( + { organizationId: "org_2", id: "env_2" }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("cuid"); + }); + + it("fails safe to cuid when the flag read throws", async () => { + const flag = vi.fn().mockRejectedValue(new Error("db down")); + const kind = await computeRunIdMintKind( + { organizationId: "org_1", id: "env_1" }, + { masterEnabled: true, splitEnabled: async () => true, flag } + ); + expect(kind).toBe("cuid"); // never arm a mint on a flag-read failure + }); +}); diff --git a/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.ts b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.ts new file mode 100644 index 00000000000..c3751c993ce --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/runOpsMintKind.server.ts @@ -0,0 +1,84 @@ +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; +import { singleton } from "~/utils/singleton"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; +import { isSplitEnabled } from "./splitMode.server"; + +export type RunIdMintKind = "cuid" | "ksuid"; + +type MintKindDeps = { + masterEnabled: boolean; + splitEnabled: () => Promise; + // Receives the orgId + the 
type MintKindDeps = {
  masterEnabled: boolean;
  splitEnabled: () => Promise<boolean>;
  // Receives the orgId + the (optional) already-loaded org feature flags. When
  // orgFeatureFlags is provided, the implementation must NOT read the DB for them.
  flag: (orgId: string, orgFeatureFlags: unknown | undefined) => Promise<RunIdMintKind>;
};

/**
 * PURE CORE — decides which id kind to mint for a run. Tests drive this directly.
 *
 * Gate order is load-bearing: master switch → split gate → per-org flag, short-circuiting at the
 * first OFF. Any gate that throws is treated as OFF: the error is logged and `"cuid"` is
 * returned, so a failing gate never arms a ksuid mint and never fails the caller.
 *
 * @param environment - Organization id, environment id and (optionally) the already-loaded org
 *   feature flags, which are passed through to `deps.flag` untouched.
 * @param deps - The three gates. `flag` is not called unless both earlier gates are on.
 * @returns `"ksuid"` only when every gate says so; `"cuid"` otherwise.
 */
export async function computeRunIdMintKind(
  environment: { organizationId: string; id: string; orgFeatureFlags?: unknown },
  deps: MintKindDeps
): Promise<RunIdMintKind> {
  if (!deps.masterEnabled) return "cuid";

  let splitOn: boolean;
  try {
    splitOn = await deps.splitEnabled();
  } catch (error) {
    // A rejected gate is handled like a failed flag read instead of escaping to the caller.
    logger.error("[runOpsMintKind] split gate check failed; minting cuid (fail-safe)", { error });
    return "cuid";
  }
  if (!splitOn) return "cuid";

  try {
    return await deps.flag(environment.organizationId, environment.orgFeatureFlags);
  } catch (error) {
    logger.error("[runOpsMintKind] flag read failed; minting cuid (fail-safe)", { error });
    return "cuid";
  }
}

// ENV-BOUND wrapper — the only place env/$replica/isSplitEnabled are read.
const flagFn = singleton("runOpsMintFlag", () => makeFlag($replica));
// NOTE(review): the type argument was lost in extraction; `<RunIdMintKind>` assumes the cache is
// generic over its value type only — confirm against the BoundedTtlCache declaration.
const mintCache = singleton(
  "runOpsMintCache",
  () =>
    new BoundedTtlCache<RunIdMintKind>(
      env.RUN_OPS_MINT_FLAG_CACHE_TTL_MS,
      env.RUN_OPS_MINT_FLAG_CACHE_MAX_ENTRIES
    )
);

/**
 * Env-bound entry point: resolves the mint kind for an environment using the real master switch,
 * split gate and per-org feature flag, with the per-org answer cached in `mintCache`.
 *
 * @param environment - Pass `environment.organization.featureFlags` as `orgFeatureFlags` from the
 *   trigger call site; when it is omitted the org's flags are read from `$replica`.
 * @returns The id kind to mint for a new run in this environment.
 */
export async function resolveRunIdMintKind(environment: {
  organizationId: string;
  id: string;
  // Pass environment.organization.featureFlags from the trigger call site.
  orgFeatureFlags?: unknown;
}): Promise<RunIdMintKind> {
  return computeRunIdMintKind(environment, {
    masterEnabled: env.RUN_OPS_MINT_KSUID_ENABLED,
    splitEnabled: isSplitEnabled,
    flag: async (orgId, orgFeatureFlags) => {
      // The cache stores only "cuid"|"ksuid" (never undefined), so the cache's
      // "stored-undefined == miss" caveat never applies here.
      const cached = mintCache.get(orgId);
      if (cached !== undefined) return cached;

      // Use the org flags the caller already carries; only fall back to a DB read when the
      // caller did NOT pass them.
      const overrides =
        orgFeatureFlags !== undefined
          ? orgFeatureFlags
          : (
              await $replica.organization.findFirst({
                where: { id: orgId },
                select: { featureFlags: true },
              })
            )?.featureFlags;

      const kind = await flagFn({
        key: FEATURE_FLAG.runOpsMintKsuid,
        defaultValue: "cuid",
        overrides: (overrides as Record<string, unknown>) ?? {},
      });
      mintCache.set(orgId, kind);
      return kind;
    },
  });
}

/**
 * Pure run-ops split READ gate. Reads are split only when the NEW replica is a client distinct
 * from both control-plane handles and both run-ops URLs are configured. The legacy handle is not
 * compared: only the NEW client's distinctness gates.
 *
 * @returns `true` when split reads may be enabled.
 */
export function computeRunOpsSplitReadEnabled(args: {
  newReplica: unknown;
  controlPlaneWriter: unknown;
  controlPlaneReplica: unknown;
  hasNewUrl: boolean;
  hasLegacyUrl: boolean;
}): boolean {
  const newIsDistinctDedicatedClient =
    args.newReplica !== args.controlPlaneWriter && args.newReplica !== args.controlPlaneReplica;

  return newIsDistinctDedicatedClient && args.hasNewUrl && args.hasLegacyUrl;
}
export type SplitModeConfig = {
  flagEnabled: boolean;
  legacyUrl?: string;
  newUrl?: string;
};

export type SplitModeDeps = {
  probe?: typeof defaultProbe;
  logger?: { warn: (msg: string, meta?: Record<string, unknown>) => void };
};

/**
 * Decides whether split mode is on for the given configuration.
 *
 * @param config - The opt-in flag and the two run-ops database URLs.
 * @param deps - Optional probe and logger overrides; the probe defaults to `defaultProbe`.
 * @returns `true` only when the flag is on, both URLs are set, and the probe reports
 *   `distinct === true`. The probe is never called when the flag is off or a URL is missing.
 */
export async function computeSplitEnabled(
  config: SplitModeConfig,
  deps: SplitModeDeps = {}
): Promise<boolean> {
  // Hard gate #1: explicit positive opt-in. OFF by default -> never probe.
  if (!config.flagEnabled) {
    return false;
  }
  // Both URLs are required to even consider a split.
  if (!config.legacyUrl || !config.newUrl) {
    deps.logger?.warn(
      "RUN_OPS_SPLIT_ENABLED is on but TASK_RUN_LEGACY_DATABASE_URL / TASK_RUN_DATABASE_URL are not both set; staying single-DB."
    );
    return false;
  }
  // Hard gate #2: runtime sentinel must confirm physically-distinct DBs.
  const probe = deps.probe ?? defaultProbe;
  const result = await probe(config.legacyUrl, config.newUrl, { logger: deps.logger });
  return result.distinct === true;
}

export type SplitRealtimeInterlockConfig = {
  splitEnabled: boolean;
  nativeRealtimeEnabled: boolean;
};

/**
 * Boot-time realtime interlock (pure predicate). Refuses split mode unless the native realtime
 * backend is on; split-off is always allowed regardless of the realtime backend.
 *
 * @throws Error when `splitEnabled` is true and `nativeRealtimeEnabled` is false.
 */
export function assertSplitRealtimeInterlock(config: SplitRealtimeInterlockConfig): void {
  if (!config.splitEnabled) {
    return;
  }
  if (!config.nativeRealtimeEnabled) {
    throw new Error(
      "RUN_OPS_SPLIT_ENABLED is on but the native realtime backend (REALTIME_BACKEND_NATIVE_ENABLED) is not enabled — Electric cannot serve NEW-resident runs; refusing to enable split."
    );
  }
}

let cached: Promise<boolean> | undefined;

/**
 * Process-wide, memoised split-mode gate built from `env`.
 *
 * The first call starts the computation and every later call returns the same promise. A
 * rejected computation is not kept: the memo is cleared when it rejects, so the caller that hit
 * the failure still sees the rejection but the next call probes again instead of replaying it.
 *
 * @returns A promise for whether split mode is enabled.
 */
export function isSplitEnabled(): Promise<boolean> {
  if (!cached) {
    const attempt: Promise<boolean> = computeSplitEnabled(
      {
        flagEnabled: env.RUN_OPS_SPLIT_ENABLED,
        legacyUrl: env.TASK_RUN_LEGACY_DATABASE_URL,
        newUrl: env.TASK_RUN_DATABASE_URL,
      },
      { logger }
    ).catch((error: unknown) => {
      // Only clear the memo if it still points at this attempt (it may have been reset).
      if (cached === attempt) {
        cached = undefined;
      }
      throw error;
    });
    cached = attempt;
  }
  return cached;
}

/** Test hook: forgets the memoised result so the next `isSplitEnabled()` recomputes it. */
export function __resetSplitModeCacheForTests(): void {
  cached = undefined;
}
*/ + residency: RunOpsResidency; + routeKind: UnblockRouteKind; + pinnedReason?: "non-tree-owned" | "cross-tree-idempotency" | "legacy-parent-descendant"; +} diff --git a/apps/webapp/app/v3/runOpsMigration/unblockRouteCatalog.ts b/apps/webapp/app/v3/runOpsMigration/unblockRouteCatalog.ts new file mode 100644 index 00000000000..3296569ce0b --- /dev/null +++ b/apps/webapp/app/v3/runOpsMigration/unblockRouteCatalog.ts @@ -0,0 +1,96 @@ +// If you add a `completeWaitpoint(` call site in the run-engine, add a matching +// entry here or `apps/webapp/test/crossSeamGuard.proof.test.ts` fails. Entries are +// one-per-textual-call-site (so the per-file count matches the source), anchored by +// method name, not line number. The `kind` is the dominant route kind — store +// selection is driven by residency, not kind, so a disputed kind label is cosmetic. +// +// PURE module — no engine import, no env, no Prisma. +import type { UnblockRouteKind } from "./types"; + +export interface UnblockRoute { + id: string; + kind: UnblockRouteKind; + /** The relative source path, e.g. "internal-packages/run-engine/src/engine/index.ts". */ + site: string; + /** Enclosing method/symbol name — NEVER a line number. 
*/ + symbol: string; +} + +const INDEX = "internal-packages/run-engine/src/engine/index.ts"; +const WAITPOINT_SYSTEM = "internal-packages/run-engine/src/engine/systems/waitpointSystem.ts"; +const TTL_SYSTEM = "internal-packages/run-engine/src/engine/systems/ttlSystem.ts"; +const RUN_ATTEMPT_SYSTEM = "internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts"; +const BATCH_SYSTEM = "internal-packages/run-engine/src/engine/systems/batchSystem.ts"; + +export const UNBLOCK_ROUTES: readonly UnblockRoute[] = [ + { + id: "index.public", + kind: "RESUME_TOKEN", + site: INDEX, + symbol: "completeWaitpoint (public declaration)", + }, + { + id: "index.public.delegate", + kind: "RESUME_TOKEN", + site: INDEX, + symbol: "completeWaitpoint (delegation to waitpointSystem)", + }, + { + id: "index.finishWaitpoint", + kind: "DATETIME", + site: INDEX, + symbol: "finishWaitpoint redis job", + }, + { + id: "wp.sink", + kind: "RUN", + site: WAITPOINT_SYSTEM, + symbol: "completeWaitpoint (sink declaration)", + }, + { + id: "wp.blockAndComplete", + kind: "RUN", + site: WAITPOINT_SYSTEM, + symbol: "blockRunAndCompleteWaitpoint", + }, + { + id: "wp.getOrCreate", + kind: "IDEMPOTENCY_REUSE", + site: WAITPOINT_SYSTEM, + symbol: "getOrCreateRunWaitpoint", + }, + { + id: "batch.tryCompleteBatch", + kind: "RUN", + site: BATCH_SYSTEM, + symbol: "#tryCompleteBatch", + }, + { + id: "ttl.expireRun", + kind: "RUN", + site: TTL_SYSTEM, + symbol: "expireRun", + }, + { + id: "runAttempt.succeeded", + kind: "RUN", + site: RUN_ATTEMPT_SYSTEM, + symbol: "attemptSucceeded", + }, + { + id: "runAttempt.cancel", + kind: "RUN", + site: RUN_ATTEMPT_SYSTEM, + symbol: "cancelRun", + }, + { + id: "runAttempt.permanentlyFail", + kind: "RUN", + site: RUN_ATTEMPT_SYSTEM, + symbol: "#permanentlyFailRun", + }, +]; + +export function expectedCompleteWaitpointCallSites(): { site: string; symbol: string }[] { + return UNBLOCK_ROUTES.map((r) => ({ site: r.site, symbol: r.symbol })); +} diff --git 
a/apps/webapp/app/v3/runStore.server.test.ts b/apps/webapp/app/v3/runStore.server.test.ts new file mode 100644 index 00000000000..065d8dbbc2e --- /dev/null +++ b/apps/webapp/app/v3/runStore.server.test.ts @@ -0,0 +1,267 @@ +import { heteroPostgresTest, heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore, RoutingRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { buildRunStore } from "./runStore.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +// 25-char internal id -> cuid -> LEGACY; 27-char internal id -> ksuid -> NEW. +const CUID_25 = "c".repeat(25); +const KSUID_27 = "k".repeat(27); + +async function seedEnvironment(prisma: PrismaClient, slugSuffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${slugSuffix}`, + slug: `project-${slugSuffix}`, + externalRef: `proj_${slugSuffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${slugSuffix}`, + pkApiKey: `pk_dev_${slugSuffix}`, + shortcode: `short_${slugSuffix}`, + }, + }); + return { organization, project, environment }; +} + +function createRunInput(params: { + runId: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; +}) { + return { + data: { + id: params.runId, + engine: "V2" as const, + status: "PENDING" as const, + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT" as const, + organizationId: params.organizationId, + projectId: params.projectId, + taskIdentifier: "my-task", + payload: 
'{"hello":"world"}', + payloadType: "application/json", + context: { foo: "bar" }, + traceContext: { trace: "ctx" }, + traceId: "trace_1", + spanId: "span_1", + runTags: ["alpha"], + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + snapshot: { + engine: "V2" as const, + executionStatus: "RUN_CREATED" as const, + description: "Run was created", + runStatus: "PENDING" as const, + environmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT" as const, + projectId: params.projectId, + organizationId: params.organizationId, + }, + }; +} + +describe("T24 — findRun resolves ksuid run on dedicated DB", () => { + heteroRunOpsPostgresTest( + "split ON: findRun({friendlyId, runtimeEnvironmentId}, {select}) finds a ksuid run on the new store", + async ({ prisma14, prisma17 }) => { + const ENV_ID = "env_t24_ksuid_probe"; + const WORKER_ID = "worker_t24_lock"; + await prisma17.taskRun.create({ + data: { + id: KSUID_27, + engine: "V2", + status: "EXECUTING", + friendlyId: "run_t24_ksuid", + runtimeEnvironmentId: ENV_ID, + environmentType: "DEVELOPMENT", + organizationId: "org_t24", + projectId: "proj_t24", + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceId: "trace_t24", + spanId: "span_t24", + queue: "task/my-task", + lockedToVersionId: WORKER_ID, + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + }); + + const store = buildRunStore({ + splitEnabled: true, + newWriter: prisma17, + newReplica: prisma17, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + const run = await store.findRun( + { friendlyId: "run_t24_ksuid", runtimeEnvironmentId: ENV_ID }, + { select: { lockedToVersionId: true } } + ); + + expect(run).not.toBeNull(); + expect(run?.lockedToVersionId).toBe(WORKER_ID); + expect(await prisma14.taskRun.findUnique({ where: { id: KSUID_27 } })).toBeNull(); + } + 
); +}); + +describe("buildRunStore", () => { + heteroPostgresTest( + "split OFF returns a passthrough PostgresRunStore that writes only to the single DB", + async ({ prisma14, prisma17 }) => { + // Single-DB: every handle is prisma14. prisma17 must stay untouched. + const store = buildRunStore({ + splitEnabled: false, + newWriter: prisma14, + newReplica: prisma14, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + expect(store).toBeInstanceOf(PostgresRunStore); + + const seed = await seedEnvironment(prisma14, "off"); + // A ksuid id (would route to NEW under split) must still land on the single DB. + const runId = KSUID_27; + await store.createRun( + createRunInput({ + runId, + friendlyId: "run_off", + organizationId: seed.organization.id, + projectId: seed.project.id, + runtimeEnvironmentId: seed.environment.id, + }) + ); + + expect(await prisma14.taskRun.findUnique({ where: { id: runId } })).not.toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: runId } })).toBeNull(); + } + ); + + heteroPostgresTest( + "split ON routes a NEW-classified create to the new store and a LEGACY-classified create to the legacy store", + async ({ prisma14, prisma17 }) => { + // legacy = PG14, new = PG17. 
+ const store = buildRunStore({ + splitEnabled: true, + newWriter: prisma17, + newReplica: prisma17, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + expect(store).toBeInstanceOf(RoutingRunStore); + + const seedNew = await seedEnvironment(prisma17, "on_new"); + const seedLegacy = await seedEnvironment(prisma14, "on_legacy"); + + // ksuid -> NEW (PG17) + await store.createRun( + createRunInput({ + runId: KSUID_27, + friendlyId: "run_new", + organizationId: seedNew.organization.id, + projectId: seedNew.project.id, + runtimeEnvironmentId: seedNew.environment.id, + }) + ); + expect(await prisma17.taskRun.findUnique({ where: { id: KSUID_27 } })).not.toBeNull(); + expect(await prisma14.taskRun.findUnique({ where: { id: KSUID_27 } })).toBeNull(); + + // cuid -> LEGACY (PG14) + await store.createRun( + createRunInput({ + runId: CUID_25, + friendlyId: "run_legacy", + organizationId: seedLegacy.organization.id, + projectId: seedLegacy.project.id, + runtimeEnvironmentId: seedLegacy.environment.id, + }) + ); + expect(await prisma14.taskRun.findUnique({ where: { id: CUID_25 } })).not.toBeNull(); + expect(await prisma17.taskRun.findUnique({ where: { id: CUID_25 } })).toBeNull(); + } + ); + + heteroPostgresTest( + "split ON keeps a write on a LEGACY-classified id on the legacy store", + async ({ prisma14, prisma17 }) => { + // Routing is pure id-shape, so a cuid write stays LEGACY. + const store = buildRunStore({ + splitEnabled: true, + newWriter: prisma17, + newReplica: prisma17, + legacyWriter: prisma14, + legacyReplica: prisma14, + singleWriter: prisma14, + singleReplica: prisma14, + }); + + const seedLegacy = await seedEnvironment(prisma14, "no_marker_legacy"); + // The run lives on LEGACY (PG14); seed it directly. 
+ await prisma14.taskRun.create({ + data: { + id: CUID_25, + engine: "V2", + status: "EXECUTING", + friendlyId: "run_no_marker", + runtimeEnvironmentId: seedLegacy.environment.id, + environmentType: "DEVELOPMENT", + organizationId: seedLegacy.organization.id, + projectId: seedLegacy.project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceId: "t", + spanId: "s", + queue: "task/my-task", + createdAt: new Date("2024-01-01T00:00:00.000Z"), + }, + }); + + const updated = await store.updateMetadata( + CUID_25, + { + metadata: '{"k":"v"}', + metadataType: "application/json", + metadataVersion: { increment: 1 }, + updatedAt: new Date("2024-01-02T00:00:00.000Z"), + }, + {} + ); + expect(updated.count).toBe(1); + + const onLegacy = await prisma14.taskRun.findUnique({ where: { id: CUID_25 } }); + expect(onLegacy?.metadata).toBe('{"k":"v"}'); + expect(await prisma17.taskRun.findUnique({ where: { id: CUID_25 } })).toBeNull(); + } + ); +}); diff --git a/apps/webapp/app/v3/runStore.server.ts b/apps/webapp/app/v3/runStore.server.ts index 2993597ea17..4173fc55eaa 100644 --- a/apps/webapp/app/v3/runStore.server.ts +++ b/apps/webapp/app/v3/runStore.server.ts @@ -1,8 +1,121 @@ -import { PostgresRunStore } from "@internal/run-store"; -import { $replica, prisma } from "~/db.server"; +import { PostgresRunStore, RoutingRunStore, type RunStore } from "@internal/run-store"; +import { ownerEngine, type Residency } from "@trigger.dev/core/v3/isomorphic"; +import type { PrismaClient, PrismaReplicaClient } from "@trigger.dev/database"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import { + $replica, + prisma, + runOpsLegacyPrisma, + runOpsLegacyReplica, + runOpsNewPrismaClient, + runOpsNewReplicaClient, +} from "~/db.server"; +import { env } from "~/env.server"; import { singleton } from "~/utils/singleton"; -export const runStore = singleton( - "PostgresRunStore", - () => new PostgresRunStore({ prisma, readOnlyPrisma: 
$replica }) -); +type BuildRunStoreDeps = { + /** Selects the store shape: false builds the single-DB passthrough, true builds a RoutingRunStore. The module-level runStore singleton passes true whenever both run-ops DB URLs are configured and the handles resolve, independent of RUN_OPS_SPLIT_ENABLED. */ + splitEnabled: boolean; + /** Split-only handles. Required when splitEnabled is true; omitted entirely when OFF + * so single-DB callers never touch the run-ops clients (keeps mocks/passthrough clean). */ + newWriter?: RunOpsPrismaClient; + newReplica?: RunOpsPrismaClient; + legacyWriter?: PrismaClient; + legacyReplica?: PrismaReplicaClient; + /** Single-DB store handles (control-plane pair). Used verbatim when split is OFF. */ + singleWriter: PrismaClient; + singleReplica: PrismaReplicaClient; + /** Residency classifier; buildRunStore falls back to ownerEngine when it is omitted. */ + classify?: (id: string) => Residency; +}; + +/** + * Pure run-store builder (no env / no boot side effects — webapp testability rule). + * + * Split OFF (default / self-host): returns the exact passthrough PostgresRunStore we + * have always returned, built from the single control-plane handles. No second store + * is constructed and no marker predicate is consulted, so behavior is byte-identical + * to single-DB today. + * + * Split ON: returns a RoutingRunStore that selects between a NEW store (where new runs + * are born) and a LEGACY store (draining) by run-id residency (id shape). There is no cuid + * migration, so a LEGACY-classified id is always LEGACY-resident. + */ +export function buildRunStore(deps: BuildRunStoreDeps): RunStore { + if (!deps.splitEnabled) { + return new PostgresRunStore({ + prisma: deps.singleWriter, + readOnlyPrisma: deps.singleReplica, + }); + } + + if (!deps.newWriter || !deps.newReplica || !deps.legacyWriter || !deps.legacyReplica) { + throw new Error("buildRunStore: split is enabled but run-ops store handles are missing"); + } + // The NEW store is backed by the dedicated RunOpsPrismaClient (subset schema): relation-shaped + // ops branch onto FK-free scalars + explicit join models.
The LEGACY store keeps the default + // "legacy" variant (full @trigger.dev/database schema with implicit M2M + @relations). + const newStore = new PostgresRunStore({ + prisma: deps.newWriter, + readOnlyPrisma: deps.newReplica, + schemaVariant: "dedicated", + }); + const legacyStore = new PostgresRunStore({ + prisma: deps.legacyWriter, + readOnlyPrisma: deps.legacyReplica, + }); + + return new RoutingRunStore({ + new: newStore, + legacy: legacyStore, + classify: deps.classify ?? ownerEngine, + }); +} + +// Build the routing store whenever BOTH run-ops DBs are configured, independent of +// RUN_OPS_SPLIT_ENABLED. Reads must fan out across both DBs so a run that lives on the new +// DB stays visible even with the flag off (matches the db.server topology factory). The flag +// governs write/mint residency + migration via isSplitEnabled(), not read visibility. +const ROUTING_ENABLED = !!env.TASK_RUN_DATABASE_URL && !!env.TASK_RUN_LEGACY_DATABASE_URL; + +// Resolve the run-ops handles, tolerating contexts where they are absent — tests that mock +// ~/db.server minimally omit them, and accessing a missing export under vi.mock throws. A +// miss means "no run-ops handles here" and we fall back to single-store. +function tryResolveRunOpsHandles() { + try { + if ( + !runOpsNewPrismaClient || + !runOpsNewReplicaClient || + !runOpsLegacyPrisma || + !runOpsLegacyReplica + ) { + return null; + } + return { + newWriter: runOpsNewPrismaClient, + newReplica: runOpsNewReplicaClient, + legacyWriter: runOpsLegacyPrisma, + legacyReplica: runOpsLegacyReplica, + }; + } catch { + return null; + } +} + +export const runStore: RunStore = singleton("RunStore", () => { + const handles = ROUTING_ENABLED ? tryResolveRunOpsHandles() : null; + // Single-store passthrough: self-host (one DB), or a context without run-ops handles. 
+ if (!handles) { + return buildRunStore({ + splitEnabled: false, + singleWriter: prisma, + singleReplica: $replica, + }); + } + return buildRunStore({ + splitEnabled: true, + ...handles, + singleWriter: prisma, + singleReplica: $replica, + }); +}); diff --git a/apps/webapp/app/v3/services/allocateConcurrency.server.ts b/apps/webapp/app/v3/services/allocateConcurrency.server.ts index 83fb69d0623..b9a28daba60 100644 --- a/apps/webapp/app/v3/services/allocateConcurrency.server.ts +++ b/apps/webapp/app/v3/services/allocateConcurrency.server.ts @@ -2,6 +2,7 @@ import { tryCatch } from "@trigger.dev/core"; import { ManageConcurrencyPresenter } from "~/presenters/v3/ManageConcurrencyPresenter.server"; import { BaseService } from "./baseService.server"; import { updateEnvConcurrencyLimits } from "../runQueue.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; type Input = { userId: string; @@ -88,6 +89,9 @@ export class AllocateConcurrencyService extends BaseService { if (!updatedEnvironment.paused) { await updateEnvConcurrencyLimits(updatedEnvironment); } + + // maximumConcurrencyLimit changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environment.id); } return { diff --git a/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts b/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts index 61238cd8a1c..0b59d4c7fae 100644 --- a/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts +++ b/apps/webapp/app/v3/services/billingLimit/billingLimitConvergeEnvironments.server.ts @@ -7,6 +7,7 @@ import { } from "@trigger.dev/database"; import { prisma } from "~/db.server"; import { logger } from "~/services/logger.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { BILLABLE_ENVIRONMENT_TYPES, BILLING_LIMIT_CONVERGE_BATCH_SIZE, @@ -177,6 +178,9 @@ async function pauseEnvironmentForBillingLimit( data: { paused: false, pauseSource: null }, }); throw error; + } finally { + // The env's paused state changed (or was rolled back); drop any cached copy either way. + controlPlaneResolver.invalidateEnvironment(environment.id); } } @@ -208,5 +212,8 @@ async function resumeEnvironmentFromBillingLimit( }, }); throw error; + } finally { + // The env's paused state changed (or was rolled back); drop any cached copy either way. 
+ controlPlaneResolver.invalidateEnvironment(environment.id); } } diff --git a/apps/webapp/app/v3/services/pauseEnvironment.server.ts b/apps/webapp/app/v3/services/pauseEnvironment.server.ts index 4cafbac1405..af9edff856c 100644 --- a/apps/webapp/app/v3/services/pauseEnvironment.server.ts +++ b/apps/webapp/app/v3/services/pauseEnvironment.server.ts @@ -5,6 +5,7 @@ import { getManualPauseEnvironmentResult } from "~/v3/services/billingLimit/manu import { updateEnvConcurrencyLimits } from "../runQueue.server"; import { WithRunEngine } from "./baseService.server"; import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; export type PauseStatus = "paused" | "resumed"; @@ -127,9 +128,14 @@ export class PauseEnvironmentService extends WithRunEngine { pauseSource: previousPauseState?.pauseSource ?? null, }, }); + // Rollback still wrote the env row; drop any cached copy before rethrowing. + controlPlaneResolver.invalidateEnvironment(environment.id); throw error; } + // The env's `paused` state changed in the control-plane; drop any cached copy. 
+ controlPlaneResolver.invalidateEnvironment(environment.id); + return { success: true, state: action, diff --git a/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts b/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts index f18d2e5aa67..c7b2c4aebe4 100644 --- a/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts +++ b/apps/webapp/app/v3/taskRunHeartbeatFailed.server.ts @@ -8,6 +8,7 @@ import type { PrismaClientOrTransaction } from "~/db.server"; import { workerQueue } from "~/services/worker.server"; import { socketIo } from "./handleSocketIo.server"; import { TaskRunErrorCodes } from "@trigger.dev/core/v3"; +import { controlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; import { isV3Disabled } from "./engineDeprecation.server"; export class TaskRunHeartbeatFailedService extends BaseService { @@ -23,16 +24,8 @@ export class TaskRunHeartbeatFailedService extends BaseService { friendlyId: true, status: true, lockedAt: true, - runtimeEnvironment: { - select: { - type: true, - }, - }, - lockedToVersion: { - select: { - supportsLazyAttempts: true, - }, - }, + runtimeEnvironmentId: true, + lockedToVersionId: true, _count: { select: { attempts: true, @@ -60,6 +53,16 @@ export class TaskRunHeartbeatFailedService extends BaseService { return; } + const env = await controlPlaneResolver.resolveEnv(taskRun.runtimeEnvironmentId); + if (!env) { + logger.debug("TaskRunHeartbeatFailedService: environment not found", { runId }); + return; + } + // Resolve the locked worker only after the environment is known to exist. + const lockedWorker = await controlPlaneResolver.resolveRunLockedWorker({ + lockedToVersionId: taskRun.lockedToVersionId, + }); + const service = new FailedTaskRunService(); switch (taskRun.status) { @@ -143,7 +146,7 @@ export class TaskRunHeartbeatFailedService extends BaseService { ); try { - if (taskRun.runtimeEnvironment.type === "DEVELOPMENT") { + if (env.type === "DEVELOPMENT") { return; } @@ -152,7 +155,7 @@ export class TaskRunHeartbeatFailedService extends BaseService { version: "v1", runId:
taskRun.id, // Give the run a few seconds to exit to complete any flushing etc - delayInMs: taskRun.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined, + delayInMs: lockedWorker?.lockedToVersion?.supportsLazyAttempts ? 5_000 : undefined, }); } catch (error) { logger.error("[TaskRunHeartbeatFailedService] Error signaling run cancellation", { diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 638fbda5896..643093624b4 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -59,6 +59,7 @@ "@internal/llm-model-catalog": "workspace:*", "@internal/redis": "workspace:*", "@internal/run-engine": "workspace:*", + "@internal/run-ops-database": "workspace:*", "@internal/run-store": "workspace:*", "@internal/schedule-engine": "workspace:*", "@internal/tracing": "workspace:*", @@ -249,6 +250,7 @@ "@swc/helpers": "^0.4.11", "@tailwindcss/forms": "^0.5.3", "@tailwindcss/typography": "^0.5.9", + "@testcontainers/postgresql": "^11.14.0", "@total-typescript/ts-reset": "^0.4.2", "@types/bcryptjs": "^2.4.2", "@types/compression": "^1.7.2", diff --git a/apps/webapp/test/findEnvironmentFromRun.readthrough.test.ts b/apps/webapp/test/findEnvironmentFromRun.readthrough.test.ts new file mode 100644 index 00000000000..7231205ea8d --- /dev/null +++ b/apps/webapp/test/findEnvironmentFromRun.readthrough.test.ts @@ -0,0 +1,135 @@ +// Real PG14 (control-plane) + PG17 (run-ops) proof for findEnvironmentFromRun. +// The env (slug/project/org) lives on PG14; the run-ops scalar row on PG17 with cross-seam +// FKs dropped. A PostgresRunStore over PG17 reads run scalars; the ControlPlaneResolver over +// PG14 resolves the env. The DB is never mocked. The .count() proof shows neither DB joins +// the other. 
+import { heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 120_000, hookTimeout: 120_000 }); + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const constraint of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe( + `ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${constraint}"` + ); + } +} + +let seedCounter = 0; + +async function seedControlPlane(prisma: PrismaClient) { + const n = seedCounter++; + const organization = await prisma.organization.create({ + data: { title: `Org ${n}`, slug: `org-${n}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${n}`, + slug: `project-${n}`, + externalRef: `proj_${n}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${n}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_prod_${n}`, + pkApiKey: `pk_prod_${n}`, + shortcode: `short_${n}`, + }, + }); + return { organization, project, environment }; +} + +async function seedRun( + prisma: PrismaClient, + ids: { runtimeEnvironmentId: string; projectId: string; organizationId: string }, + opts?: { runTags?: string[] } +) { + const n = seedCounter++; + return prisma.taskRun.create({ + data: { + id: `run_${n}_pg17`, + engine: "V2", + status: "PENDING", + friendlyId: `run_friendly_${n}`, + runtimeEnvironmentId: ids.runtimeEnvironmentId, + organizationId: ids.organizationId, 
+ projectId: ids.projectId, + taskIdentifier: "fefr-task", + payload: "{}", + payloadType: "application/json", + queue: "task/fefr-task", + traceId: `trace_${n}`, + spanId: `span_${n}`, + workerQueue: "main", + runTags: opts?.runTags ?? ["a", "b"], + }, + }); +} + +function buildResolver(controlPlane: PrismaClient) { + return new ControlPlaneResolver({ + controlPlanePrimary: controlPlane, + controlPlaneReplica: controlPlane, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => false, + }); +} + +describe("findEnvironmentFromRun cross-DB read-through", () => { + heteroPostgresTest( + "resolves env from PG14 while run scalars resolve from PG17 (no cross-DB join)", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedControlPlane(prisma14 as unknown as PrismaClient); + const run = await seedRun( + prisma17 as unknown as PrismaClient, + { + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + }, + { runTags: ["x", "y"] } + ); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = buildResolver(prisma14 as unknown as PrismaClient); + + // The decomposed findEnvironmentFromRun: run scalars from the store + env from the resolver. 
+ const taskRun = await runStore.findRun( + { id: run.id }, + { select: { runtimeEnvironmentId: true, runTags: true, batchId: true } }, + prisma17 as unknown as PrismaClient + ); + expect(taskRun).not.toBeNull(); + const environment = await resolver.resolveAuthenticatedEnv(taskRun!.runtimeEnvironmentId); + expect(environment).not.toBeNull(); + expect(environment!.id).toBe(cp.environment.id); + expect(environment!.slug).toBe(cp.environment.slug); + expect(environment!.project.id).toBe(cp.project.id); + expect(taskRun!.runTags).toEqual(["x", "y"]); + + // Inversion proof: PG17 (run-ops) has no env rows; PG14 (control-plane) has no run rows. + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/apps/webapp/test/routeLoaders.controlPlane.readthrough.test.ts b/apps/webapp/test/routeLoaders.controlPlane.readthrough.test.ts new file mode 100644 index 00000000000..a6437a3e016 --- /dev/null +++ b/apps/webapp/test/routeLoaders.controlPlane.readthrough.test.ts @@ -0,0 +1,164 @@ +// Real PG14 (control-plane) + PG17 (run-ops) proof for the run route loaders that were +// decomposed onto the ControlPlaneResolver. The env (slug/project/org) and the +// locked worker/deployment live on PG14; the run-ops scalar row on PG17 with cross-seam FKs +// dropped (including the lockedById / lockedToVersionId FKs). A PostgresRunStore over PG17 +// reads run scalars; the ControlPlaneResolver over PG14 resolves env + lockedBy.worker.deployment. +// The DB is never mocked. The .count() proof shows neither DB joins the other. 
+import { heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 60_000, hookTimeout: 60_000 }); + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", + "TaskRun_lockedById_fkey", + "TaskRun_lockedToVersionId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const c of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe(`ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${c}"`); + } +} + +let n = 0; +async function seedAll(prisma: PrismaClient) { + const s = n++; + const organization = await prisma.organization.create({ + data: { title: `Org ${s}`, slug: `org-${s}` }, + }); + const project = await prisma.project.create({ + data: { + name: `P ${s}`, + slug: `p-${s}`, + externalRef: `proj_${s}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${s}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${s}`, + pkApiKey: `pk_${s}`, + shortcode: `sc_${s}`, + }, + }); + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: `worker_${s}`, + contentHash: `hash_${s}`, + projectId: project.id, + runtimeEnvironmentId: environment.id, + version: `2024.1.${s}`, + metadata: {}, + engine: "V2", + }, + }); + const deployment = await prisma.workerDeployment.create({ + data: { + friendlyId: `dep_${s}`, + contentHash: `hash_${s}`, + version: worker.version, + shortCode: `dc_${s}`, + type: "MANAGED", + status: "DEPLOYED", + projectId: 
project.id, + environmentId: environment.id, + workerId: worker.id, + git: { commitSha: `sha_${s}` }, + }, + }); + const task = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: `task_${s}`, + slug: `t-${s}`, + filePath: "src/index.ts", + exportName: "myTask", + workerId: worker.id, + runtimeEnvironmentId: environment.id, + projectId: project.id, + }, + }); + return { organization, project, environment, worker, deployment, task }; +} + +describe("run route loader cross-DB read-through", () => { + heteroPostgresTest( + "resources.runs.$runParam env + lockedBy.worker.deployment.git resolve from PG14", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedAll(prisma14 as unknown as PrismaClient); + + const run = await (prisma17 as unknown as PrismaClient).taskRun.create({ + data: { + id: `run_${n++}_pg17`, + engine: "V2", + status: "COMPLETED_SUCCESSFULLY", + friendlyId: `run_rl_${n}`, + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + lockedById: cp.task.id, + lockedToVersionId: cp.worker.id, + taskIdentifier: "rl-task", + payload: "{}", + payloadType: "application/json", + queue: "task/rl-task", + traceId: "tr_rl", + spanId: "sp_rl", + workerQueue: "main", + }, + }); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma14 as unknown as PrismaClient, + controlPlaneReplica: prisma14 as unknown as PrismaClient, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { + select: { + id: true, + runtimeEnvironmentId: true, + lockedById: true, + lockedToVersionId: true, + }, + }, + prisma17 as unknown as PrismaClient + ); + const env = await 
resolver.resolveAuthenticatedEnv(found!.runtimeEnvironmentId); + expect(env!.slug).toBe(cp.environment.slug); + expect(env!.organization.title).toBe(cp.organization.title); + expect(env!.project.externalRef).toBe(cp.project.externalRef); + + const locked = await resolver.resolveRunLockedWorker({ + lockedById: found!.lockedById, + lockedToVersionId: found!.lockedToVersionId, + }); + expect(locked!.lockedToVersion!.version).toBe(cp.worker.version); + expect(locked!.lockedBy!.worker.deployment!.git).toEqual({ + commitSha: cp.deployment.git ? (cp.deployment.git as any).commitSha : undefined, + }); + expect(locked!.lockedBy!.worker.deployment!.friendlyId).toBe(cp.deployment.friendlyId); + + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts b/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts new file mode 100644 index 00000000000..d2bf3d6e193 --- /dev/null +++ b/apps/webapp/test/runDetailLoaders.controlPlane.readthrough.test.ts @@ -0,0 +1,190 @@ +// Dedicated run-ops proof: the run-detail page loaders read the run by friendlyId on the dedicated +// run-ops client (PG17, subset schema with no control-plane tables), then authorize membership + +// resolve env on PG14. Neither DB joins the other. 
+import { heteroRunOpsPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { RunOpsPrismaClient } from "@internal/run-ops-database"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 60_000, hookTimeout: 60_000 }); + +let n = 0; +async function seedAll(prisma: PrismaClient) { + const s = n++; + const organization = await prisma.organization.create({ + data: { title: `Org ${s}`, slug: `org-${s}` }, + }); + const project = await prisma.project.create({ + data: { + name: `P ${s}`, + slug: `p-${s}`, + externalRef: `proj_${s}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${s}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${s}`, + pkApiKey: `pk_${s}`, + shortcode: `sc_${s}`, + }, + }); + const member = await prisma.user.create({ + data: { email: `u-${s}@example.com`, name: `U ${s}`, authenticationMethod: "MAGIC_LINK" }, + }); + await prisma.orgMember.create({ + data: { organizationId: organization.id, userId: member.id, role: "ADMIN" }, + }); + const stranger = await prisma.user.create({ + data: { email: `x-${s}@example.com`, name: `X ${s}`, authenticationMethod: "MAGIC_LINK" }, + }); + return { organization, project, environment, member, stranger }; +} + +// The run lives on the dedicated run-ops client; its control-plane FKs are synthetic scalar ids +// pointing at rows that exist only on PG14 (the dedicated DB has no such tables). 
+async function seedKsuidRun(prisma17: RunOpsPrismaClient, cp: Awaited<ReturnType<typeof seedAll>>) { + const k = n++; + return prisma17.taskRun.create({ + data: { + id: `run_2abcDEF${k}ghijkLMNOPqrstuv`, + engine: "V2", + status: "COMPLETED_SUCCESSFULLY", + friendlyId: `run_2abcDEF${k}ghijkLMNOPqrstuv`, + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + taskIdentifier: "run-detail-task", + payload: "{}", + payloadType: "application/json", + queue: "task/run-detail-task", + idempotencyKey: "idem-1", + spanId: `sp_${k}`, + traceId: `tr_${k}`, + number: 1, + workerQueue: "main", + }, + }); +} + +function wire(prisma14: PrismaClient, prisma17: RunOpsPrismaClient) { + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + schemaVariant: "dedicated", + }); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma14, + controlPlaneReplica: prisma14, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + return { runStore, resolver }; +} + +describe("run-detail loaders cross-DB read-through (dedicated run-ops client)", () => { + heteroRunOpsPostgresTest( + "ksuid run resolves: friendlyId read on the dedicated run-ops DB + membership/env auth on PG14 (resources.runs.$runParam shape)", + async ({ prisma14, prisma17 }) => { + const cp14 = prisma14 as unknown as PrismaClient; + const cp = await seedAll(cp14); + const run = await seedKsuidRun(prisma17, cp); + const { runStore, resolver } = wire(cp14, prisma17); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { + select: { + id: true, + traceId: true, + projectId: true, + runtimeEnvironmentId: true, + status: true, + queue: true, + spanId: true, + idempotencyKey: true, + taskIdentifier: true, + }, + } + ); + expect(found).not.toBeNull(); + expect(found!.id).toBe(run.id); + + const authorized = await cp14.project.findFirst({ + where: { + id:
found!.projectId, + organization: { members: { some: { userId: cp.member.id } } }, + }, + select: { id: true }, + }); + expect(authorized).not.toBeNull(); + + const env = await resolver.resolveAuthenticatedEnv(found!.runtimeEnvironmentId); + expect(env!.slug).toBe(cp.environment.slug); + expect(env!.project.slug).toBe(cp.project.slug); + expect(env!.organization.slug).toBe(cp.organization.slug); + + // Inversion proof: no run on PG14 (control-plane). + expect(await cp14.taskRun.count()).toBe(0); + } + ); + + heteroRunOpsPostgresTest( + "non-member is denied: membership findFirst returns null (404/redirect path)", + async ({ prisma14, prisma17 }) => { + const cp14 = prisma14 as unknown as PrismaClient; + const cp = await seedAll(cp14); + const run = await seedKsuidRun(prisma17, cp); + const { runStore } = wire(cp14, prisma17); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { select: { id: true, projectId: true, runtimeEnvironmentId: true } } + ); + expect(found).not.toBeNull(); + + const authorized = await cp14.project.findFirst({ + where: { + id: found!.projectId, + organization: { members: { some: { userId: cp.stranger.id } } }, + }, + select: { id: true }, + }); + expect(authorized).toBeNull(); + } + ); + + heteroRunOpsPostgresTest( + "env-slug-scoped routes: idempotencyKey.reset re-imposes env slug on the resolved env", + async ({ prisma14, prisma17 }) => { + const cp14 = prisma14 as unknown as PrismaClient; + const cp = await seedAll(cp14); + const run = await seedKsuidRun(prisma17, cp); + const { runStore, resolver } = wire(cp14, prisma17); + + const found = await runStore.findRun( + { friendlyId: run.friendlyId }, + { + select: { + id: true, + idempotencyKey: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + }, + } + ); + const env = await resolver.resolveAuthenticatedEnv(found!.runtimeEnvironmentId); + expect(env!.slug).toBe(cp.environment.slug); + expect(env!.slug === 
"does-not-match").toBe(false); + expect(found!.idempotencyKey).toBe("idem-1"); + } + ); +}); diff --git a/apps/webapp/test/runEngineHandlers.test.ts b/apps/webapp/test/runEngineHandlers.test.ts new file mode 100644 index 00000000000..2c57d87506e --- /dev/null +++ b/apps/webapp/test/runEngineHandlers.test.ts @@ -0,0 +1,671 @@ +import { containerTest, heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import type { CompleteBatchResult } from "@internal/run-engine"; +import { describe, expect, vi } from "vitest"; +import { + handleBatchCompletion, + readRunForEvent, + readRunForEventOrThrow, + resolveBatchRunOpsWriter, + type BatchCompletionDeps, + type EventReadDeps, +} from "~/v3/runEngineHandlersShared.server"; + +vi.setConfig({ testTimeout: 60_000 }); + +// Proves two routing properties against REAL Postgres (never mocked): +// 1. the 7 TaskRun event reads resolve run-ops new-or-old via read-through; +// 2. the batch update + error-createMany transaction commits entirely on the +// run-ops writer that owns the BatchTaskRun row (no boundary-spanning txn). 
+ +const EVENT_SELECT = { + id: true, + friendlyId: true, + traceId: true, + spanId: true, + parentSpanId: true, + createdAt: true, + completedAt: true, + taskIdentifier: true, + projectId: true, + runtimeEnvironmentId: true, + environmentType: true, + isTest: true, + organizationId: true, + taskEventStore: true, + runTags: true, + batchId: true, +} as const; + +async function seedEnvironment(prisma: PrismaClient, slugSuffix: string) { + const organization = await prisma.organization.create({ + data: { title: `Org ${slugSuffix}`, slug: `org-${slugSuffix}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${slugSuffix}`, + slug: `project-${slugSuffix}`, + externalRef: `proj_${slugSuffix}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "DEVELOPMENT", + slug: "dev", + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_dev_${slugSuffix}`, + pkApiKey: `pk_dev_${slugSuffix}`, + shortcode: `short_${slugSuffix}`, + }, + }); + return { organization, project, environment }; +} + +async function seedTaskRun( + prisma: PrismaClient, + params: { + id: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; + runTags?: string[]; + } +) { + return prisma.taskRun.create({ + data: { + id: params.id, + engine: "V2", + status: "COMPLETED_SUCCESSFULLY", + friendlyId: params.friendlyId, + taskIdentifier: "my-task", + payload: '{"hello":"world"}', + payloadType: "application/json", + traceId: "trace_1", + spanId: "span_1", + queue: "task/my-task", + runtimeEnvironmentId: params.runtimeEnvironmentId, + projectId: params.projectId, + organizationId: params.organizationId, + environmentType: "DEVELOPMENT", + isTest: false, + taskEventStore: "taskEvent", + runTags: params.runTags ?? 
["alpha", "beta"], + createdAt: new Date("2024-01-01T00:00:00.000Z"), + completedAt: new Date("2024-01-01T00:01:00.000Z"), + }, + }); +} + +async function seedBatch( + prisma: PrismaClient, + params: { id: string; friendlyId: string; runtimeEnvironmentId: string } +) { + return prisma.batchTaskRun.create({ + data: { + id: params.id, + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + status: "PENDING", + }, + }); +} + +function makeBatchDeps( + overrides: { + splitEnabled?: boolean; + newReplica?: PrismaClient; + newWriter?: PrismaClient; + legacyWriter?: PrismaClient; + legacyReplica?: PrismaClient; + } & { single?: PrismaClient } +): BatchCompletionDeps & { tryCompleteBatchCalls: string[] } { + const single = overrides.single; + const tryCompleteBatchCalls: string[] = []; + return { + splitEnabled: overrides.splitEnabled ?? false, + newReplica: (overrides.newReplica ?? single)!, + newWriter: (overrides.newWriter ?? single)!, + legacyWriter: (overrides.legacyWriter ?? single)!, + tryCompleteBatch: async (batchId: string) => { + tryCompleteBatchCalls.push(batchId); + }, + tryCompleteBatchCalls, + }; +} + +function failure(index: number, errorCode: string, extra?: Record<string, unknown>) { + return { + index, + taskIdentifier: "my-task", + payload: '{"item":' + index + "}", + options: { foo: "bar" }, + error: `error ${index}`, + errorCode, + timestamp: Date.now(), + ...extra, + }; +} + +describe("runEngineHandlers read-through", () => { + // A NEW run resolves via read-through against the new store. 
+ containerTest("event read resolves a NEW run via read-through", async ({ prisma }) => { + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const { organization, project, environment } = await seedEnvironment(prisma, "a"); + await seedTaskRun(prisma, { + id: "run_new_a", + friendlyId: "run_friendly_a", + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + runTags: ["x", "y"], + }); + + const deps: EventReadDeps = { + store, + newReplica: prisma, + legacyReplica: prisma, + splitEnabled: false, + }; + + const run = await readRunForEvent("run_new_a", environment.id, EVENT_SELECT, deps); + + expect(run).not.toBeNull(); + expect(run!.id).toBe("run_new_a"); + expect(run!.friendlyId).toBe("run_friendly_a"); + expect(run!.runTags).toEqual(["x", "y"]); + expect(run!.organizationId).toBe(organization.id); + expect(run!.taskEventStore).toBe("taskEvent"); + }); + + // Single-DB short-circuit — readLegacy must never be invoked. + containerTest("single-DB short-circuit never touches a legacy handle", async ({ prisma }) => { + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const { organization, project, environment } = await seedEnvironment(prisma, "c"); + await seedTaskRun(prisma, { + id: "run_single_c", + friendlyId: "run_friendly_c", + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + // A legacy replica that THROWS if read — proves the short-circuit. 
+ const exploding = new Proxy( + {}, + { + get() { + throw new Error("legacy replica must not be touched in single-DB mode"); + }, + } + ) as unknown as PrismaClient; + + const deps: EventReadDeps = { + store, + newReplica: prisma, + legacyReplica: exploding, + splitEnabled: false, + }; + + const run = await readRunForEvent("run_single_c", environment.id, EVENT_SELECT, deps); + expect(run!.id).toBe("run_single_c"); + }); + + // readRunForEventOrThrow reproduces the not-found-as-error semantics. + containerTest("readRunForEventOrThrow throws on a missing run", async ({ prisma }) => { + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + await seedEnvironment(prisma, "nf"); + + const deps: EventReadDeps = { + store, + newReplica: prisma, + legacyReplica: prisma, + splitEnabled: false, + }; + + await expect( + readRunForEventOrThrow("run_missing", "env_x", EVENT_SELECT, deps) + ).rejects.toThrow(); + + // Nullable helper returns null instead of throwing for the same input. + const run = await readRunForEvent("run_missing", "env_x", EVENT_SELECT, deps); + expect(run).toBeNull(); + }); +}); + +describe("runEngineHandlers read-through cross-version", () => { + // An OLD in-retention run is served off the LEGACY REPLICA only, and the legacy + // primary/writer is structurally absent. + heteroPostgresTest( + "event read resolves an OLD in-retention run via the legacy replica", + async ({ prisma14, prisma17 }) => { + const newStore = new PostgresRunStore({ prisma: prisma17, readOnlyPrisma: prisma17 }); + const legacyStore = new PostgresRunStore({ prisma: prisma14, readOnlyPrisma: prisma14 }); + + const legacySeed = await seedEnvironment(prisma14, "b14"); + // A 25-char cuid id classifies as LEGACY so read-through probes new, misses, + // then falls back to the legacy replica. 
+ const legacyRunId = "c".repeat(25); + const seededRow = await seedTaskRun(prisma14, { + id: legacyRunId, + friendlyId: "run_friendly_b", + organizationId: legacySeed.organization.id, + projectId: legacySeed.project.id, + runtimeEnvironmentId: legacySeed.environment.id, + runTags: ["legacy", "tag"], + }); + + // The read uses the NEW store for the new-DB probe and the LEGACY store for + // the replica fallback, so a hit can only come from the legacy replica. + let legacyReplicaUsed = false; + // A store facade that routes the read to the legacy store when handed the + // legacy client and the new store otherwise — both real DBs, no mocks. + const routedStore = { + ...newStore, + findRun: ((where: any, args: any, client: any) => { + if (client === prisma14) { + legacyReplicaUsed = true; + return legacyStore.findRun(where, args, client); + } + return newStore.findRun(where, args, client); + }) as typeof newStore.findRun, + } as PostgresRunStore; + const routedDeps: EventReadDeps = { + store: routedStore, + newReplica: prisma17, + legacyReplica: prisma14, + splitEnabled: true, + }; + + const run = await readRunForEvent( + legacyRunId, + legacySeed.environment.id, + EVENT_SELECT, + routedDeps + ); + + expect(legacyReplicaUsed).toBe(true); + expect(run).not.toBeNull(); + expect(run!.id).toBe(legacyRunId); + // Byte-identity of the enrichment select across the legacy<->new boundary: + // re-read the same row on the legacy replica directly and deep-equal it. + const direct = await legacyStore.findRun( + { id: legacyRunId }, + { select: EVENT_SELECT }, + prisma14 + ); + expect(run).toEqual(direct); + expect(run!.runTags).toEqual(["legacy", "tag"]); + expect(seededRow.id).toBe(legacyRunId); + + // The new DB has no such run. 
+ const onNew = await newStore.findRun({ id: legacyRunId }, { select: EVENT_SELECT }, prisma17); + expect(onNew).toBeNull(); + } + ); +}); + +describe("runEngineHandlers batch completion", () => { + // Tests D + F: the txn commits whole on a single run-ops writer; rolls back atomically. + containerTest("batch txn commits whole on the run-ops writer", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "d"); + const batchId = "c".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_d", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: ["run_friendly_1", "run_friendly_2"], + successfulRunCount: 2, + failedRunCount: 1, + failures: [failure(0, "TRIGGER_ERROR", { options: { nested: { a: 1, b: [2, 3] } } })], + }; + + await handleBatchCompletion(result, deps); + + const batch = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(batch.status).toBe("PARTIAL_FAILED"); + expect(batch.runIds).toEqual(["run_friendly_1", "run_friendly_2"]); + expect(batch.successfulRunCount).toBe(2); + expect(batch.failedRunCount).toBe(1); + expect(batch.processingCompletedAt).not.toBeNull(); + + const errors = await prisma.batchTaskRunError.findMany({ where: { batchTaskRunId: batchId } }); + expect(errors).toHaveLength(1); + expect(errors[0]!.errorCode).toBe("TRIGGER_ERROR"); + // JSON round-trip of options. + expect(errors[0]!.options).toEqual({ nested: { a: 1, b: [2, 3] } }); + + // PARTIAL_FAILED (not ABORTED) -> tryCompleteBatch is invoked. + expect(deps.tryCompleteBatchCalls).toEqual([batchId]); + }); + + // Atomicity: if the createMany fails, the update rolls back too. 
+ containerTest("batch txn rolls back the update when createMany fails", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "rb"); + const batchId = "d".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_rb", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + // A failure with a null taskIdentifier violates the NOT NULL constraint inside + // the createMany, forcing the whole transaction to roll back. + const result = { + batchId, + runIds: ["run_friendly_1"], + successfulRunCount: 0, + failedRunCount: 1, + failures: [ + { index: 0, taskIdentifier: null as any, payload: "{}", error: "boom", timestamp: 1 }, + ], + } as unknown as CompleteBatchResult; + + await expect(handleBatchCompletion(result, deps)).rejects.toThrow(); + + // The update must NOT have committed — status stays PENDING from the seed. + const batch = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(batch.status).toBe("PENDING"); + expect(batch.processingCompletedAt).toBeNull(); + }); + + // Callback retry is idempotent via skipDuplicates. 
+ containerTest("batch txn is idempotent on callback retry", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "e"); + const batchId = "e".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_e", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: [], + successfulRunCount: 0, + failedRunCount: 2, + failures: [failure(0, "TRIGGER_ERROR"), failure(1, "TRIGGER_ERROR")], + }; + + await handleBatchCompletion(result, deps); + await handleBatchCompletion(result, deps); + + const errors = await prisma.batchTaskRunError.findMany({ where: { batchTaskRunId: batchId } }); + expect(errors).toHaveLength(2); + }); + + // Aggregate fast-path collapses same-errorCode failures to one row. + containerTest("aggregate fast-path collapses queue-size-limit failures", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "i"); + const batchId = "f".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_i", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: [], + successfulRunCount: 0, + failedRunCount: 3, + failures: [ + failure(5, "QUEUE_SIZE_LIMIT_EXCEEDED"), + failure(6, "QUEUE_SIZE_LIMIT_EXCEEDED"), + failure(7, "QUEUE_SIZE_LIMIT_EXCEEDED"), + ], + }; + + await handleBatchCompletion(result, deps); + + const errors = await prisma.batchTaskRunError.findMany({ where: { batchTaskRunId: batchId } }); + expect(errors).toHaveLength(1); + expect(errors[0]!.index).toBe(5); + expect(errors[0]!.error).toContain("(3 items in this batch failed with the same error)"); + }); + + // ABORTED status does not call tryCompleteBatch. 
+ containerTest("ABORTED batch does not call tryCompleteBatch", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "ab"); + const batchId = "g".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_ab", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: [], + successfulRunCount: 0, + failedRunCount: 1, + failures: [failure(0, "TRIGGER_ERROR")], + }; + + await handleBatchCompletion(result, deps); + + const batch = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(batch.status).toBe("ABORTED"); + expect(batch.completedAt).not.toBeNull(); + expect(deps.tryCompleteBatchCalls).toEqual([]); + }); + + // A successful (no-failure) batch is PENDING and calls tryCompleteBatch. + containerTest("successful batch is PENDING and calls tryCompleteBatch", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "ok"); + const batchId = "h".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_ok", + runtimeEnvironmentId: environment.id, + }); + + const deps = makeBatchDeps({ single: prisma, splitEnabled: false }); + const result: CompleteBatchResult = { + batchId, + runIds: ["run_friendly_1"], + successfulRunCount: 1, + failedRunCount: 0, + failures: [], + }; + + await handleBatchCompletion(result, deps); + + const batch = await prisma.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(batch.status).toBe("PENDING"); + expect(deps.tryCompleteBatchCalls).toEqual([batchId]); + }); +}); + +describe("runEngineHandlers batch residency routing", () => { + // True single-DB invariant: the topology's cpFallback makes newReplica and + // legacyWriter the SAME control-plane client, so the probe always resolves to + // that one client regardless of where length-classification would guess. 
+ containerTest("true single-DB resolves to the single client", async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma, "single"); + const batchId = "s".repeat(25); + await seedBatch(prisma, { + id: batchId, + friendlyId: "batch_friendly_single", + runtimeEnvironmentId: environment.id, + }); + + const writer = await resolveBatchRunOpsWriter(batchId, { + newReplica: prisma, + newWriter: prisma, + legacyWriter: prisma, + }); + expect(writer).toBe(prisma); + }); + + // A legacy-resident batch (row only on the legacy DB) commits on the LEGACY writer; + // the NEW DB is left with zero rows for the batch. + heteroPostgresTest( + "legacy-resident batch routes to the LEGACY writer, new DB untouched", + async ({ prisma14, prisma17 }) => { + const legacySeed = await seedEnvironment(prisma14, "g14"); + const batchId = "c".repeat(25); + await seedBatch(prisma14, { + id: batchId, + friendlyId: "batch_friendly_g", + runtimeEnvironmentId: legacySeed.environment.id, + }); + + // The probe misses on new (the new DB has no such batch) and resolves the legacy writer. + const writer = await resolveBatchRunOpsWriter(batchId, { + newReplica: prisma17, + newWriter: prisma17, + legacyWriter: prisma14, + }); + expect(writer).toBe(prisma14); + + const deps: BatchCompletionDeps = { + splitEnabled: true, + newReplica: prisma17, + newWriter: prisma17, + legacyWriter: prisma14, + tryCompleteBatch: async () => {}, + }; + + const result: CompleteBatchResult = { + batchId, + runIds: ["run_friendly_1"], + successfulRunCount: 1, + failedRunCount: 1, + failures: [failure(0, "TRIGGER_ERROR")], + }; + + await handleBatchCompletion(result, deps); + + // Committed on the legacy DB. 
+ const legacyBatch = await prisma14.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(legacyBatch.status).toBe("PARTIAL_FAILED"); + const legacyErrors = await prisma14.batchTaskRunError.findMany({ + where: { batchTaskRunId: batchId }, + }); + expect(legacyErrors).toHaveLength(1); + + // The new DB has zero rows for this batch — no misroute. + const onNew = await prisma17.batchTaskRun.findMany({ where: { id: batchId } }); + expect(onNew).toHaveLength(0); + const newErrors = await prisma17.batchTaskRunError.findMany({ + where: { batchTaskRunId: batchId }, + }); + expect(newErrors).toHaveLength(0); + } + ); + + // Regression: the real "run-ops DB connected, split flag off" state. splitEnabled + // is false, yet newWriter is a DISTINCT (empty) DB while the batch lives on legacy. + // Old code wrote to newWriter -> "No record was found for an update" -> batch hangs. + heteroPostgresTest( + "split-off connected-but-off: legacy-resident batch routes to LEGACY, not newWriter", + async ({ prisma14, prisma17 }) => { + const legacySeed = await seedEnvironment(prisma14, "off14"); + const batchId = "c".repeat(25); + await seedBatch(prisma14, { + id: batchId, + friendlyId: "batch_friendly_off", + runtimeEnvironmentId: legacySeed.environment.id, + }); + + const deps: BatchCompletionDeps = { + splitEnabled: false, + newReplica: prisma17, + newWriter: prisma17, + legacyWriter: prisma14, + tryCompleteBatch: async () => {}, + }; + + const result: CompleteBatchResult = { + batchId, + runIds: ["run_friendly_1"], + successfulRunCount: 1, + failedRunCount: 1, + failures: [failure(0, "TRIGGER_ERROR")], + }; + + await handleBatchCompletion(result, deps); + + // Committed on the legacy DB; the new DB (the distinct newWriter) untouched. 
+ const legacyBatch = await prisma14.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(legacyBatch.status).toBe("PARTIAL_FAILED"); + expect(legacyBatch.processingCompletedAt).not.toBeNull(); + const legacyErrors = await prisma14.batchTaskRunError.findMany({ + where: { batchTaskRunId: batchId }, + }); + expect(legacyErrors).toHaveLength(1); + + const onNew = await prisma17.batchTaskRun.findMany({ where: { id: batchId } }); + expect(onNew).toHaveLength(0); + const newErrors = await prisma17.batchTaskRunError.findMany({ + where: { batchTaskRunId: batchId }, + }); + expect(newErrors).toHaveLength(0); + } + ); + + // A new batch (row only on the new DB) commits on the NEW writer; the LEGACY DB is untouched. + heteroPostgresTest( + "new batch routes to the NEW writer, legacy DB untouched", + async ({ prisma14, prisma17 }) => { + const newSeed = await seedEnvironment(prisma17, "h17"); + const batchId = "d".repeat(25); + await seedBatch(prisma17, { + id: batchId, + friendlyId: "batch_friendly_h", + runtimeEnvironmentId: newSeed.environment.id, + }); + + const writer = await resolveBatchRunOpsWriter(batchId, { + newReplica: prisma17, + newWriter: prisma17, + legacyWriter: prisma14, + }); + expect(writer).toBe(prisma17); + + const deps: BatchCompletionDeps = { + splitEnabled: true, + newReplica: prisma17, + newWriter: prisma17, + legacyWriter: prisma14, + tryCompleteBatch: async () => {}, + }; + + const result: CompleteBatchResult = { + batchId, + runIds: ["run_friendly_1"], + successfulRunCount: 1, + failedRunCount: 1, + failures: [failure(0, "TRIGGER_ERROR", { options: { json: { deep: [1, 2, 3] } } })], + }; + + await handleBatchCompletion(result, deps); + + const newBatch = await prisma17.batchTaskRun.findFirstOrThrow({ where: { id: batchId } }); + expect(newBatch.status).toBe("PARTIAL_FAILED"); + const newErrors = await prisma17.batchTaskRunError.findMany({ + where: { batchTaskRunId: batchId }, + }); + expect(newErrors).toHaveLength(1); + // Batch JSON 
round-trip on the new DB. + expect(newErrors[0]!.options).toEqual({ json: { deep: [1, 2, 3] } }); + + // The legacy DB is untouched. + const onLegacy = await prisma14.batchTaskRun.findMany({ where: { id: batchId } }); + expect(onLegacy).toHaveLength(0); + } + ); +}); diff --git a/apps/webapp/test/runOpsCrossSeamGuard.test.ts b/apps/webapp/test/runOpsCrossSeamGuard.test.ts new file mode 100644 index 00000000000..5f232c08693 --- /dev/null +++ b/apps/webapp/test/runOpsCrossSeamGuard.test.ts @@ -0,0 +1,134 @@ +import { describe, it, expect } from "vitest"; +import { + computeStoreForCompletion, + selectStoreForWaitpoint, +} from "~/v3/runOpsMigration/crossSeamGuard.server"; +import { UnclassifiableRunId } from "@trigger.dev/core/v3/isomorphic"; + +// Real sample ids exercising the genuine run-id residency classifier (no stub). +const NEW = "waitpoint_" + "a".repeat(27); // 27-char ksuid body -> NEW +const LEGACY = "waitpoint_" + "a".repeat(25); // 25-char cuid body -> LEGACY +const AMBIGUOUS = "waitpoint_" + "a".repeat(10); // neither length -> throws + +describe("selectStoreForWaitpoint — happy-path residency routing", () => { + it("MANUAL completion of a NEW waitpoint selects the new store", () => { + const d = selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "MANUAL" }); + expect(d.store).toBe("new"); + expect(d.residency).toBe("NEW"); + }); + + it("RESUME_TOKEN completion of a LEGACY waitpoint selects the legacy store", () => { + const d = selectStoreForWaitpoint({ waitpointId: LEGACY, routeKind: "RESUME_TOKEN" }); + expect(d.store).toBe("legacy"); + expect(d.residency).toBe("LEGACY"); + }); + + it("DATETIME completion of a NEW waitpoint selects the new store", () => { + expect(selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "DATETIME" }).store).toBe("new"); + }); + + it("RUN completion of a NEW waitpoint selects the new store", () => { + expect(selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "RUN" }).store).toBe("new"); + }); + + 
it("IDEMPOTENCY_REUSE of a NEW waitpoint with no pins selects the new store", () => { + const d = selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "IDEMPOTENCY_REUSE" }); + expect(d.store).toBe("new"); + expect(d.pinnedReason).toBeUndefined(); + }); +}); + +describe("selectStoreForWaitpoint — legacy pins", () => { + it("pins a NEW-residency waitpoint to legacy when non-tree-owned", () => { + const d = selectStoreForWaitpoint({ + waitpointId: NEW, + routeKind: "MANUAL", + treeOwnerResidency: "LEGACY", + }); + expect(d.store).toBe("legacy"); + expect(d.pinnedReason).toBe("non-tree-owned"); + }); + + it("pins cross-tree idempotency reuse to legacy", () => { + const d = selectStoreForWaitpoint({ + waitpointId: NEW, + routeKind: "IDEMPOTENCY_REUSE", + isCrossTreeIdempotency: true, + }); + expect(d.store).toBe("legacy"); + expect(d.pinnedReason).toBe("cross-tree-idempotency"); + }); + + it("pins a descendant of a legacy parent to legacy", () => { + const d = selectStoreForWaitpoint({ + waitpointId: NEW, + routeKind: "RUN", + hasLegacyParent: true, + }); + expect(d.store).toBe("legacy"); + expect(d.pinnedReason).toBe("legacy-parent-descendant"); + }); + + it("applies deterministic pin precedence: non-tree-owned wins", () => { + const d = selectStoreForWaitpoint({ + waitpointId: NEW, + routeKind: "RUN", + treeOwnerResidency: "LEGACY", + isCrossTreeIdempotency: true, + hasLegacyParent: true, + }); + expect(d.store).toBe("legacy"); + expect(d.pinnedReason).toBe("non-tree-owned"); + }); + + it("reports the waitpoint's own residency even when pinned to legacy", () => { + const d = selectStoreForWaitpoint({ + waitpointId: NEW, + routeKind: "MANUAL", + treeOwnerResidency: "LEGACY", + }); + expect(d.store).toBe("legacy"); + expect(d.residency).toBe("NEW"); + }); +}); + +describe("selectStoreForWaitpoint — ambiguity and unknown routes are loud", () => { + it("rethrows UnclassifiableRunId for an ambiguous-length id (never silently routes)", () => { + expect(() => 
selectStoreForWaitpoint({ waitpointId: AMBIGUOUS, routeKind: "MANUAL" })).toThrow( + UnclassifiableRunId + ); + }); + + it("throws when an unknown routeKind is supplied", () => { + expect(() => + // @ts-expect-error deliberately invalid kind + selectStoreForWaitpoint({ waitpointId: NEW, routeKind: "WAT" }) + ).toThrow(); + }); +}); + +describe("computeStoreForCompletion — single-DB no-op + flag wrapper", () => { + it("returns the single store without classifying when split is OFF", () => { + const calls: string[] = []; + const d = computeStoreForCompletion( + { waitpointId: AMBIGUOUS, routeKind: "MANUAL" }, + { + splitEnabled: false, + classify: (id) => { + calls.push(id); + return "NEW"; + }, + } + ); + expect(d.store).toBe("legacy"); // the single store + expect(calls).toEqual([]); // classifier never consulted + }); + + it("delegates to selectStoreForWaitpoint when split is ON", () => { + const d = computeStoreForCompletion( + { waitpointId: NEW, routeKind: "MANUAL" }, + { splitEnabled: true } + ); + expect(d.store).toBe("new"); + }); +}); diff --git a/apps/webapp/test/runOpsDbTopology.test.ts b/apps/webapp/test/runOpsDbTopology.test.ts new file mode 100644 index 00000000000..b33c0db7fd3 --- /dev/null +++ b/apps/webapp/test/runOpsDbTopology.test.ts @@ -0,0 +1,121 @@ +import { PostgreSqlContainer } from "@testcontainers/postgresql"; +import { describe, expect, it, vi } from "vitest"; +import { buildReplicaClient, buildWriterClient, selectRunOpsTopology } from "~/db.server"; + +const cp = { writer: {} as any, replica: {} as any }; + +describe("selectRunOpsTopology (pure)", () => { + it("split OFF: all run-ops handles collapse to control-plane and NO client is built", () => { + const buildNewWriter = vi.fn(); + const buildNewReplica = vi.fn(); + const topo = selectRunOpsTopology( + { splitEnabled: false, legacyUrl: "postgres://a", newUrl: "postgres://b" }, + { controlPlane: cp, buildNewWriter, buildNewReplica } + ); + // new run-ops collapses to the 
control-plane client refs (no second connection). + expect(topo.newRunOps.writer).toBe(cp.writer); + expect(topo.newRunOps.replica).toBe(cp.replica); + expect(topo.legacyRunOps).toBe(cp); + expect(topo.controlPlane).toBe(cp); + expect(buildNewWriter).not.toHaveBeenCalled(); // no second connection opened + expect(buildNewReplica).not.toHaveBeenCalled(); + }); + + it("split ON: new-run-ops builds its own writer + replica; cp/legacy reuse cp", () => { + const newWriter = { tag: "nw" } as any; + const newReplica = { tag: "nr" } as any; + const buildNewWriter = vi.fn().mockReturnValue(newWriter); + const buildNewReplica = vi.fn().mockReturnValue(newReplica); + const topo = selectRunOpsTopology( + { + splitEnabled: true, + legacyUrl: "postgres://legacy", + newUrl: "postgres://new", + newReplicaUrl: "postgres://new-r", + }, + { controlPlane: cp, buildNewWriter, buildNewReplica } + ); + expect(topo.newRunOps.writer).toBe(newWriter); + expect(topo.newRunOps.replica).toBe(newReplica); + expect(topo.controlPlane).toBe(cp); + expect(topo.legacyRunOps).toBe(cp); // legacy run-ops shares the control-plane server initially + expect(buildNewWriter).toHaveBeenCalledTimes(1); + }); + + it("split ON without a new replica URL: replica falls back to the new writer", () => { + const newWriter = { tag: "nw" } as any; + const buildNewWriter = vi.fn().mockReturnValue(newWriter); + const buildNewReplica = vi.fn(); + const topo = selectRunOpsTopology( + { splitEnabled: true, legacyUrl: "postgres://legacy", newUrl: "postgres://new" }, + { controlPlane: cp, buildNewWriter, buildNewReplica } + ); + expect(topo.newRunOps.replica).toBe(newWriter); + expect(buildNewReplica).not.toHaveBeenCalled(); + }); +}); + +describe("selectRunOpsTopology (integration, real containers)", () => { + it("split OFF: opens exactly one DB; all run-ops handles share the control-plane client", async () => { + const pg = await new PostgreSqlContainer("docker.io/postgres:14").start(); + try { + const cpWriter = 
buildWriterClient({ url: pg.getConnectionUri(), clientType: "cp" }); + const cp = { writer: cpWriter, replica: cpWriter }; + const builtUrls: string[] = []; + const topo = selectRunOpsTopology( + { splitEnabled: false, legacyUrl: pg.getConnectionUri(), newUrl: pg.getConnectionUri() }, + { + controlPlane: cp, + buildNewWriter: (url) => { + builtUrls.push(url); + return buildWriterClient({ url, clientType: "x" }) as any; + }, + buildNewReplica: (url) => { + builtUrls.push(url); + return buildReplicaClient({ url, clientType: "x" }) as any; + }, + } + ); + expect(builtUrls).toHaveLength(0); // no second connection opened + expect(topo.newRunOps.writer).toBe(cp.writer); + expect(topo.newRunOps.replica).toBe(cp.replica); + expect(topo.legacyRunOps).toBe(cp); + await topo.newRunOps.writer.$queryRawUnsafe("SELECT 1"); + await cpWriter.$disconnect(); + } finally { + await pg.stop(); + } + }, 60_000); + + it("split ON: constructs CP + legacy-run-ops + new-run-ops + replicas (legacy + new)", async () => { + const rds = await new PostgreSqlContainer("docker.io/postgres:14").start(); + const ps = await new PostgreSqlContainer("docker.io/postgres:17").start(); + try { + const cpWriter = buildWriterClient({ url: rds.getConnectionUri(), clientType: "cp" }); + const cp = { writer: cpWriter, replica: cpWriter }; + const topo = selectRunOpsTopology( + { splitEnabled: true, legacyUrl: rds.getConnectionUri(), newUrl: ps.getConnectionUri() }, + { + controlPlane: cp, + buildNewWriter: (url, ct) => buildWriterClient({ url, clientType: ct }) as any, + buildNewReplica: (url, ct) => buildReplicaClient({ url, clientType: ct }) as any, + } + ); + // CP + legacy resolve to the legacy/control-plane pair; new run-ops is the dedicated run-ops box. 
+ expect(topo.controlPlane).toBe(cp); + expect(topo.legacyRunOps).toBe(cp); + expect(topo.newRunOps.writer).not.toBe(cpWriter); + await topo.controlPlane.writer.$queryRawUnsafe("SELECT 1"); + await topo.newRunOps.writer.$queryRawUnsafe("SELECT 1"); + const ver = await topo.newRunOps.writer.$queryRawUnsafe<Array<{ v: string }>>( + "SELECT current_setting('server_version') AS v" + ); + expect(ver[0].v.startsWith("17")).toBe(true); // new run-ops really is the dedicated box + await cpWriter.$disconnect(); + await topo.newRunOps.writer.$disconnect(); + } finally { + await rds.stop(); + await ps.stop(); + } + }, 120_000); +}); diff --git a/apps/webapp/test/runOpsMintCutover.test.ts b/apps/webapp/test/runOpsMintCutover.test.ts new file mode 100644 index 00000000000..d838a382e92 --- /dev/null +++ b/apps/webapp/test/runOpsMintCutover.test.ts @@ -0,0 +1,193 @@ +// Per-env KSUID mint cutover integration proof. +// +// NEVER mocks the DB: the mint decision runs through the pure core `computeRunIdMintKind` +// wired to a REAL `makeFlag(prisma)` that reads the REAL `Organization.featureFlags` / +// `FeatureFlag` rows in a testcontainers Postgres. Only the two boundary knobs +// are injected — `masterEnabled` and the `splitEnabled` boot-boolean — never a +// mocked DB. The KSUID/cuid format + residency are then proven through the SAME isomorphic +// helpers the real trigger path uses (`generateKsuidId` / `RunId.toFriendlyId` / +// `RunId.fromFriendlyId` / `ownerEngine`). 
+import type { PrismaClient } from "@trigger.dev/database"; +import { generateKsuidId, ownerEngine, RunId } from "@trigger.dev/core/v3/isomorphic"; +import { postgresTest } from "@internal/testcontainers"; +import { describe, expect, vi } from "vitest"; +import { + computeRunIdMintKind, + type RunIdMintKind, +} from "~/v3/runOpsMigration/runOpsMintKind.server"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; +import { + createTestOrgProjectWithMember, + createRuntimeEnvironment, + uniqueId, +} from "./fixtures/environmentVariablesFixtures"; + +vi.setConfig({ testTimeout: 60_000 }); + +// The real trigger-path mint helper, copied verbatim from triggerTask.server.ts so the +// test exercises the exact id format a cut-over env produces. +function mintRunKsuidFriendlyId(): string { + return RunId.toFriendlyId(generateKsuidId()); +} + +// Mirrors the real trigger path: resolve the kind, then mint either a KSUID friendlyId or +// the default cuid one (RunId.generate()). +function mintRunFriendlyId(kind: RunIdMintKind): string { + return kind === "ksuid" ? mintRunKsuidFriendlyId() : RunId.generate().friendlyId; +} + +async function seedOrgEnv(prisma: PrismaClient, mintFlag?: RunIdMintKind) { + const { organization, project } = await createTestOrgProjectWithMember(prisma); + const environment = await createRuntimeEnvironment(prisma, { + projectId: project.id, + organizationId: organization.id, + type: "PRODUCTION", + slug: uniqueId("prod"), + }); + if (mintFlag) { + await prisma.organization.update({ + where: { id: organization.id }, + data: { featureFlags: { [FEATURE_FLAG.runOpsMintKsuid]: mintFlag } }, + }); + } + return { organization, environment }; +} + +// Build the env-bound `flag` dependency around a REAL makeFlag(prisma) reading the real +// Organization.featureFlags override store. Pure-core gets the real DB-backed flag; only +// masterEnabled + splitEnabled are injected boundary config. 
+function realFlag(prisma: PrismaClient) { + const flagFn = makeFlag(prisma); + return async (orgId: string, orgFeatureFlags: unknown | undefined): Promise => { + const overrides = + orgFeatureFlags !== undefined + ? orgFeatureFlags + : ( + await prisma.organization.findFirst({ + where: { id: orgId }, + select: { featureFlags: true }, + }) + )?.featureFlags; + return flagFn({ + key: FEATURE_FLAG.runOpsMintKsuid, + defaultValue: "cuid", + overrides: (overrides as Record) ?? {}, + }); + }; +} + +describe("per-env KSUID mint cutover", () => { + postgresTest( + "canary org mints KSUID/NEW; non-canary org mints cuid/LEGACY", + async ({ prisma }) => { + const a = await seedOrgEnv(prisma, "ksuid"); // canary + const b = await seedOrgEnv(prisma); // not cut over + + const flag = realFlag(prisma); + const deps = { masterEnabled: true, splitEnabled: async () => true, flag }; + + const kindA = await computeRunIdMintKind( + { organizationId: a.organization.id, id: a.environment.id }, + deps + ); + const kindB = await computeRunIdMintKind( + { organizationId: b.organization.id, id: b.environment.id }, + deps + ); + + expect(kindA).toBe("ksuid"); + expect(kindB).toBe("cuid"); + + const friendlyA = mintRunFriendlyId(kindA); + const friendlyB = mintRunFriendlyId(kindB); + + expect(RunId.fromFriendlyId(friendlyA).length).toBe(27); + expect(ownerEngine(RunId.fromFriendlyId(friendlyA))).toBe("NEW"); + + expect(RunId.fromFriendlyId(friendlyB).length).toBe(25); + expect(ownerEngine(RunId.fromFriendlyId(friendlyB))).toBe("LEGACY"); + } + ); + + postgresTest( + "split OFF mints cuid even for a flagged-ksuid org (split gate dominates)", + async ({ prisma }) => { + const a = await seedOrgEnv(prisma, "ksuid"); + const flag = vi.fn(realFlag(prisma)); + + const kind = await computeRunIdMintKind( + { organizationId: a.organization.id, id: a.environment.id }, + { masterEnabled: true, splitEnabled: async () => false, flag } + ); + + expect(kind).toBe("cuid"); + 
expect(flag).not.toHaveBeenCalled(); // gated off before any DB read + } + ); + + postgresTest( + "drain-new-forward (D8): flipping back to cuid stops new KSUID mints without reverting existing", + async ({ prisma }) => { + const a = await seedOrgEnv(prisma, "ksuid"); + const flag = realFlag(prisma); + const deps = { masterEnabled: true, splitEnabled: async () => true, flag }; + + // First run is born KSUID/NEW while cut over. + const firstKind = await computeRunIdMintKind( + { organizationId: a.organization.id, id: a.environment.id }, + deps + ); + const firstFriendly = mintRunFriendlyId(firstKind); + expect(firstKind).toBe("ksuid"); + expect(ownerEngine(RunId.fromFriendlyId(firstFriendly))).toBe("NEW"); + + // Roll the org back to cuid (drain-new-forward — set the flag to "cuid"). + await prisma.organization.update({ + where: { id: a.organization.id }, + data: { featureFlags: { [FEATURE_FLAG.runOpsMintKsuid]: "cuid" } }, + }); + + // The NEXT run mints cuid again (the env-bound resolver's TTL cache is not used here, + // so the flip is observed immediately — production waits one cache TTL). + const nextKind = await computeRunIdMintKind( + { organizationId: a.organization.id, id: a.environment.id }, + deps + ); + const nextFriendly = mintRunFriendlyId(nextKind); + expect(nextKind).toBe("cuid"); + expect(ownerEngine(RunId.fromFriendlyId(nextFriendly))).toBe("LEGACY"); + + // The already-minted KSUID run is untouched — drain-new-forward never reverts it. + expect(RunId.fromFriendlyId(firstFriendly).length).toBe(27); + expect(ownerEngine(RunId.fromFriendlyId(firstFriendly))).toBe("NEW"); + } + ); + + postgresTest( + "parent and child re-resolve independently from their own org flag", + async ({ prisma }) => { + // Parent lives in a cut-over org; child is triggered into a NON-cut-over org. 
+ const parentOrg = await seedOrgEnv(prisma, "ksuid"); + const childOrg = await seedOrgEnv(prisma); // not cut over + const flag = realFlag(prisma); + const deps = { masterEnabled: true, splitEnabled: async () => true, flag }; + + const parentKind = await computeRunIdMintKind( + { organizationId: parentOrg.organization.id, id: parentOrg.environment.id }, + deps + ); + const childKind = await computeRunIdMintKind( + { organizationId: childOrg.organization.id, id: childOrg.environment.id }, + deps + ); + + // Observed behavior: the mint decision is resolved per the run's OWN org/env flag — + // it does NOT inherit the parent's residency. A child in a non-cut-over org mints cuid + // even when its parent was born KSUID. If children must inherit, that inheritance + // belongs to the child-trigger path, not this resolver. + expect(parentKind).toBe("ksuid"); + expect(childKind).toBe("cuid"); + } + ); +}); diff --git a/apps/webapp/test/runOpsSplitMode.test.ts b/apps/webapp/test/runOpsSplitMode.test.ts new file mode 100644 index 00000000000..7ce2bec3a5d --- /dev/null +++ b/apps/webapp/test/runOpsSplitMode.test.ts @@ -0,0 +1,121 @@ +import { describe, expect, it, vi } from "vitest"; +// @testcontainers/postgresql resolves because it is declared in apps/webapp/package.json. 
+import { PostgreSqlContainer } from "@testcontainers/postgresql"; +import { + computeSplitEnabled, + assertSplitRealtimeInterlock, +} from "~/v3/runOpsMigration/splitMode.server"; +import { probeDistinctDatabases } from "~/v3/runOpsMigration/distinctDbSentinel.server"; + +describe("computeSplitEnabled (pure)", () => { + it("is OFF by default and never probes when the flag is off", async () => { + const probe = vi.fn(); + const result = await computeSplitEnabled( + { flagEnabled: false, legacyUrl: "postgres://a", newUrl: "postgres://b" }, + { probe } + ); + expect(result).toBe(false); + expect(probe).not.toHaveBeenCalled(); // self-host opens no second connection + }); + + it("stays single-DB when flag is on but URLs are missing", async () => { + const probe = vi.fn(); + expect(await computeSplitEnabled({ flagEnabled: true }, { probe })).toBe(false); + expect(probe).not.toHaveBeenCalled(); + }); + + it("enables split only when flag is on AND sentinel confirms distinct", async () => { + const probe = vi.fn().mockResolvedValue({ distinct: true }); + expect( + await computeSplitEnabled( + { flagEnabled: true, legacyUrl: "postgres://a", newUrl: "postgres://b" }, + { probe } + ) + ).toBe(true); + }); + + it("stays single-DB when sentinel reports NOT distinct", async () => { + const probe = vi.fn().mockResolvedValue({ distinct: false, reason: "same DB" }); + expect( + await computeSplitEnabled( + { flagEnabled: true, legacyUrl: "postgres://a", newUrl: "postgres://b" }, + { probe } + ) + ).toBe(false); + }); + + // Migration-family unreachability proof: with the flag off the gate returns false and + // no probe runs. Downstream migration-family code is required to early-return on + // !isSplitEnabled(); this unit proves the gate's value, each downstream unit's own test + // proves it honors the gate. Split OFF collapsing to a single prisma/$replica pair with + // no second connection opened depends on this no-probe behavior. 
+ it("is provably unreachable (no probe) when the flag is off", async () => { + const probe = vi.fn(); + expect( + await computeSplitEnabled( + { flagEnabled: false, legacyUrl: "postgres://a", newUrl: "postgres://b" }, + { probe } + ) + ).toBe(false); + expect(probe).not.toHaveBeenCalled(); + }); +}); + +describe("assertSplitRealtimeInterlock (pure)", () => { + it("throws when split is on but the native realtime backend is off", () => { + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: true, nativeRealtimeEnabled: false }) + ).toThrowError(/native realtime backend|REALTIME_BACKEND_NATIVE_ENABLED/i); + }); + + it("does not throw when split is on and the native realtime backend is on", () => { + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: true, nativeRealtimeEnabled: true }) + ).not.toThrow(); + }); + + it("does not throw when split is off, regardless of the native realtime backend", () => { + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: false, nativeRealtimeEnabled: false }) + ).not.toThrow(); + expect(() => + assertSplitRealtimeInterlock({ splitEnabled: false, nativeRealtimeEnabled: true }) + ).not.toThrow(); + }); +}); + +describe("distinct-DB sentinel (real Postgres)", () => { + it("reports NOT distinct when both URLs hit the same physical cluster", async () => { + const pg = await new PostgreSqlContainer("docker.io/postgres:14").start(); + try { + const url = pg.getConnectionUri(); + const result = await probeDistinctDatabases(url, url); + expect(result.distinct).toBe(false); // identical URL -> false-split prevented + } finally { + await pg.stop(); + } + }, 60_000); + + it("reports distinct when URLs hit two separate clusters (legacy + new)", async () => { + const legacy = await new PostgreSqlContainer("docker.io/postgres:14").start(); + const next = await new PostgreSqlContainer("docker.io/postgres:17").start(); + try { + const result = await probeDistinctDatabases( + legacy.getConnectionUri(), + 
next.getConnectionUri() + ); + expect(result.distinct).toBe(true); + } finally { + await legacy.stop(); + await next.stop(); + } + }, 120_000); + + it("fails closed (single-DB) when a DB is unreachable", async () => { + const result = await probeDistinctDatabases( + "postgresql://nouser:nopass@127.0.0.1:1/none", + "postgresql://nouser:nopass@127.0.0.1:2/none" + ); + expect(result.distinct).toBe(false); + }, 30_000); +}); diff --git a/apps/webapp/test/runOpsSplitReadGate.test.ts b/apps/webapp/test/runOpsSplitReadGate.test.ts new file mode 100644 index 00000000000..c9238bcff34 --- /dev/null +++ b/apps/webapp/test/runOpsSplitReadGate.test.ts @@ -0,0 +1,75 @@ +import { describe, expect, it } from "vitest"; +import { computeRunOpsSplitReadEnabled } from "~/v3/runOpsMigration/runOpsSplitReadGate"; + +// Distinct sentinel objects standing in for the prisma client singletons. +const cpWriter = { __tag: "cp-writer" }; +const cpReplica = { __tag: "cp-replica" }; +const dedicatedNew = { __tag: "dedicated-new" }; + +describe("computeRunOpsSplitReadEnabled", () => { + it("enables split when a distinct dedicated NEW client is open and both URLs are set", () => { + expect( + computeRunOpsSplitReadEnabled({ + newReplica: dedicatedNew, + controlPlaneWriter: cpWriter, + controlPlaneReplica: cpReplica, + hasNewUrl: true, + hasLegacyUrl: true, + }) + ).toBe(true); + }); + + // Regression: the LEGACY run-ops handle IS the control-plane replica by design. The gate must + // depend only on the NEW client's distinctness — never on the legacy handle differing from CP. + it("stays enabled even though the legacy handle equals the control-plane replica", () => { + // The caller passes controlPlaneReplica (=== legacy handle) for the CP slot; NEW is still + // distinct, so split must remain ON. (A gate that required legacy !== CP would be false here.) 
+ expect( + computeRunOpsSplitReadEnabled({ + newReplica: dedicatedNew, + controlPlaneWriter: cpWriter, + controlPlaneReplica: cpReplica, // legacy run-ops replica is this very object in prod + hasNewUrl: true, + hasLegacyUrl: true, + }) + ).toBe(true); + }); + + it("disables split when NEW falls back to the control-plane client (no dedicated DB)", () => { + expect( + computeRunOpsSplitReadEnabled({ + newReplica: cpReplica, // cpFallback: NEW === control-plane replica + controlPlaneWriter: cpWriter, + controlPlaneReplica: cpReplica, + hasNewUrl: true, + hasLegacyUrl: true, + }) + ).toBe(false); + }); + + it("disables split when NEW equals the control-plane writer", () => { + expect( + computeRunOpsSplitReadEnabled({ + newReplica: cpWriter, + controlPlaneWriter: cpWriter, + controlPlaneReplica: cpReplica, + hasNewUrl: true, + hasLegacyUrl: true, + }) + ).toBe(false); + }); + + it("disables split when either URL is missing, even with a distinct client", () => { + const base = { + newReplica: dedicatedNew, + controlPlaneWriter: cpWriter, + controlPlaneReplica: cpReplica, + }; + expect(computeRunOpsSplitReadEnabled({ ...base, hasNewUrl: false, hasLegacyUrl: true })).toBe( + false + ); + expect(computeRunOpsSplitReadEnabled({ ...base, hasNewUrl: true, hasLegacyUrl: false })).toBe( + false + ); + }); +}); diff --git a/apps/webapp/test/services.controlPlane.readthrough.test.ts b/apps/webapp/test/services.controlPlane.readthrough.test.ts new file mode 100644 index 00000000000..11ddb8dd0bf --- /dev/null +++ b/apps/webapp/test/services.controlPlane.readthrough.test.ts @@ -0,0 +1,115 @@ +// Real PG14 (control-plane) + PG17 (run-ops) proof for the run-rooted services that were +// decomposed onto the ControlPlaneResolver. The env (slug/project/org) lives on PG14; +// the run-ops scalar row on PG17 with cross-seam FKs dropped. A PostgresRunStore over PG17 reads +// run scalars; the ControlPlaneResolver over PG14 resolves the env. The DB is never mocked. 
The +// .count() proof shows neither DB joins the other. +import { heteroPostgresTest } from "@internal/testcontainers"; +import { PostgresRunStore } from "@internal/run-store"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +vi.setConfig({ testTimeout: 60_000, hookTimeout: 60_000 }); + +const TASK_RUN_CROSS_SEAM_FKS = [ + "TaskRun_runtimeEnvironmentId_fkey", + "TaskRun_projectId_fkey", + "TaskRun_organizationId_fkey", +] as const; + +async function dropTaskRunCrossSeamFks(prisma: PrismaClient) { + for (const c of TASK_RUN_CROSS_SEAM_FKS) { + await prisma.$executeRawUnsafe(`ALTER TABLE "TaskRun" DROP CONSTRAINT IF EXISTS "${c}"`); + } +} + +let n = 0; +async function seedControlPlane(prisma: PrismaClient) { + const s = n++; + const organization = await prisma.organization.create({ + data: { title: `Org ${s}`, slug: `org-${s}` }, + }); + const project = await prisma.project.create({ + data: { + name: `P ${s}`, + slug: `p-${s}`, + externalRef: `proj_${s}`, + organizationId: organization.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${s}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_${s}`, + pkApiKey: `pk_${s}`, + shortcode: `sc_${s}`, + }, + }); + return { organization, project, environment }; +} + +async function seedRun( + prisma: PrismaClient, + cp: { environment: { id: string }; project: { id: string }; organization: { id: string } } +) { + const s = n++; + return prisma.taskRun.create({ + data: { + id: `run_${s}_pg17`, + engine: "V2", + status: "PENDING", + friendlyId: `run_${s}`, + runtimeEnvironmentId: cp.environment.id, + projectId: cp.project.id, + organizationId: cp.organization.id, + taskIdentifier: "svc-task", + 
payload: "{}", + payloadType: "application/json", + queue: "task/svc-task", + traceId: `tr_${s}`, + spanId: `sp_${s}`, + workerQueue: "main", + }, + }); +} + +function buildResolver(cp: PrismaClient) { + return new ControlPlaneResolver({ + controlPlanePrimary: cp, + controlPlaneReplica: cp, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); +} + +describe("service control-plane read-through", () => { + heteroPostgresTest( + "expireEnqueuedRun: org id resolves from PG14 via resolveEnv while run scalars resolve from PG17", + async ({ prisma14, prisma17 }) => { + await dropTaskRunCrossSeamFks(prisma17 as unknown as PrismaClient); + const cp = await seedControlPlane(prisma14 as unknown as PrismaClient); + const run = await seedRun(prisma17 as unknown as PrismaClient, cp); + + const runStore = new PostgresRunStore({ + prisma: prisma17 as unknown as PrismaClient, + readOnlyPrisma: prisma17 as unknown as PrismaClient, + }); + const resolver = buildResolver(prisma14 as unknown as PrismaClient); + + const found = await runStore.findRun( + { id: run.id }, + { select: { id: true, runtimeEnvironmentId: true } }, + prisma17 as unknown as PrismaClient + ); + const env = await resolver.resolveEnv(found!.runtimeEnvironmentId); + expect(env!.organizationId).toBe(cp.organization.id); + + expect(await (prisma17 as unknown as PrismaClient).runtimeEnvironment.count()).toBe(0); + expect(await (prisma14 as unknown as PrismaClient).taskRun.count()).toBe(0); + } + ); +}); diff --git a/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts b/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts new file mode 100644 index 00000000000..fff5f848f75 --- /dev/null +++ b/apps/webapp/test/v3/runOpsMigration/controlPlaneRepoint.server.test.ts @@ -0,0 +1,210 @@ +import { heteroPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import { expect, vi } from "vitest"; +import { ControlPlaneCache } 
from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +// Control-plane datasource repoint (legacy DB -> new DB). +// +// Post-repoint the control plane lives on the new DB, so we model the new topology by seeding the +// control-plane rows on the new side (`prisma17`) and injecting it as both the resolver's primary +// and replica. `prisma14` stands in for the pre-repoint legacy source for the cross-version +// transition test. NEVER mock — we seed and read the real testcontainer clients, and we observe +// the DB boundary via a $extends query counter. + +// Cross-DB testcontainer spin-up + queries can exceed the 5s default on the first test. +vi.setConfig({ testTimeout: 60_000 }); + +let seedCounter = 0; + +/** + * Wraps a real testcontainer PrismaClient with a `$extends` query hook that increments a counter + * on every actual operation. NOT a mock: the returned client still issues the real query and + * returns real rows — we only observe the DB boundary (the countQueries pattern). + */ +function countQueries(client: PrismaClient): { client: PrismaClient; reads: () => number } { + let count = 0; + const extended = client.$extends({ + query: { + async $allOperations({ args, query }) { + count++; + return query(args); + }, + }, + }) as unknown as PrismaClient; + return { client: extended, reads: () => count }; +} + +/** Seeds org -> project -> env + a pinned BackgroundWorker (+task) + TaskQueue + TaskSchedule. 
*/ +async function seedControlPlane(prisma: PrismaClient) { + const n = seedCounter++; + const org = await prisma.organization.create({ + data: { title: `Org ${n}`, slug: `org-${n}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${n}`, + slug: `project-${n}`, + externalRef: `proj_${n}`, + organizationId: org.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${n}`, + projectId: project.id, + organizationId: org.id, + apiKey: `tr_prod_${n}`, + pkApiKey: `pk_prod_${n}`, + shortcode: `short_${n}`, + }, + }); + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: `worker_${n}`, + contentHash: `hash_${n}`, + projectId: project.id, + runtimeEnvironmentId: environment.id, + version: `2024.1.${n}`, + metadata: {}, + engine: "V2", + }, + }); + await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: `task_${n}`, + slug: `my-task-${n}`, + filePath: "index.ts", + exportName: "myTask", + workerId: worker.id, + runtimeEnvironmentId: environment.id, + projectId: project.id, + }, + }); + const queue = await prisma.taskQueue.create({ + data: { + friendlyId: `queue_${n}`, + name: `task/my-task-${n}`, + runtimeEnvironmentId: environment.id, + projectId: project.id, + workers: { connect: { id: worker.id } }, + }, + }); + const schedule = await prisma.taskSchedule.create({ + data: { + friendlyId: `schedule_${n}`, + taskIdentifier: `my-task-${n}`, + generatorExpression: "0 * * * *", + projectId: project.id, + }, + }); + return { org, project, environment, worker, queue, schedule }; +} + +// --- Repoint resolution (split ON, CP on the new DB) --------- + +heteroPostgresTest( + "control-plane references resolve against the repointed (new-DB) CP client", + async ({ prisma17 }) => { + const { environment, worker } = await seedControlPlane(prisma17); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: prisma17, + 
controlPlaneReplica: prisma17, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + expect(await resolver.resolveEnv(environment.id)).toMatchObject({ id: environment.id }); + expect( + await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }) + ).not.toBeNull(); + } +); + +// --- Relaxed-cache (no latency regression) ------------------------- + +heteroPostgresTest("relaxed (longer TTL) cache still hits on the new DB", async ({ prisma17 }) => { + const { environment } = await seedControlPlane(prisma17); + const { client: counting, reads } = countQueries(prisma17); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: counting, + controlPlaneReplica: counting, + // Relaxed: a much longer TTL than the default — same-provider resolution is cheap. + cache: new ControlPlaneCache({ ttlMs: 300_000, maxEntries: 10_000 }), + splitEnabled: () => true, + }); + + expect(await resolver.resolveEnv(environment.id)).toMatchObject({ id: environment.id }); + expect(reads()).toBe(1); + // Second read served from the relaxed cache — no extra DB round-trip. + await resolver.resolveEnv(environment.id); + expect(reads()).toBe(1); +}); + +// --- Cross-version transition (legacy DB -> new DB) ----------------------- + +heteroPostgresTest( + "resolution is byte-identical across the legacy-DB -> new-DB host transition", + async ({ prisma14, prisma17, pinnedCollation }) => { + // Seed identical control-plane shapes on the pre-repoint (legacy) and post-repoint + // (new) sides. 
+ const before = await seedControlPlane(prisma14); + const after = await seedControlPlane(prisma17); + + const resolver14 = new ControlPlaneResolver({ + controlPlanePrimary: prisma14, + controlPlaneReplica: prisma14, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + const resolver17 = new ControlPlaneResolver({ + controlPlanePrimary: prisma17, + controlPlaneReplica: prisma17, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const env14 = await resolver14.resolveEnv(before.environment.id); + const env17 = await resolver17.resolveEnv(after.environment.id); + // Same resolution shape across the version boundary (ids differ per-seed; structure identical). + expect(Object.keys(env14 ?? {}).sort()).toEqual(Object.keys(env17 ?? {}).sort()); + expect(env14?.type).toBe(env17?.type); + expect(env14?.archivedAt).toBe(env17?.archivedAt); + + // ORDER BY on a representative text-heavy column must agree across the version boundary, using + // the pinned ICU collation the hetero fixture exposes so the comparison is apples-to-apples. 
+ const slugs = ["banana", "Apple", "cherry", "Äpfel", "apple"]; + const orderBy = async (prisma: PrismaClient) => { + const rows = await prisma.$queryRawUnsafe<{ s: string }[]>( + `SELECT s FROM (VALUES ('${slugs.join("'),('")}')) AS t(s) ORDER BY s COLLATE "${pinnedCollation}"` + ); + return rows.map((r) => r.s); + }; + expect(await orderBy(prisma14)).toEqual(await orderBy(prisma17)); + } +); + +// --- Single-DB no-op (passthrough preserved) ----------------------- + +heteroPostgresTest( + "single-DB passthrough (split OFF) runs plain in-DB joins with no cache", + async ({ prisma17 }) => { + const { environment } = await seedControlPlane(prisma17); + const { client: counting, reads } = countQueries(prisma17); + const resolver = new ControlPlaneResolver({ + controlPlanePrimary: counting, + controlPlaneReplica: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveEnv(environment.id); + await resolver.resolveEnv(environment.id); + // No cache when split is OFF — every call hits the DB, identical to today's passthrough. + expect(reads()).toBe(2); + } +); diff --git a/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts new file mode 100644 index 00000000000..e85843114db --- /dev/null +++ b/apps/webapp/test/v3/runOpsMigration/controlPlaneResolver.server.test.ts @@ -0,0 +1,750 @@ +import { heteroPostgresTest } from "@internal/testcontainers"; +import type { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, it, vi } from "vitest"; +import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server"; +import { + ControlPlaneReferenceError, + ControlPlaneResolver, +} from "~/v3/runOpsMigration/controlPlaneResolver.server"; + +// Cross-DB testcontainer spin-up + queries can exceed the 5s default on the first test. 
+vi.setConfig({ testTimeout: 60_000 }); + +// --- test helpers ---------------------------------------------------------- + +let seedCounter = 0; + +/** + * Wraps a real testcontainer PrismaClient with a `$extends` query hook that increments a + * counter on every actual operation. NOT a mock: the returned client still issues the real + * query and returns real rows — we only observe the DB boundary. + */ +function countQueries(client: PrismaClient): { client: PrismaClient; reads: () => number } { + let count = 0; + const extended = client.$extends({ + query: { + async $allOperations({ args, query }) { + count++; + return query(args); + }, + }, + }) as unknown as PrismaClient; + return { client: extended, reads: () => count }; +} + +async function seedControlPlane(prisma: PrismaClient) { + const n = seedCounter++; + const org = await prisma.organization.create({ + data: { title: `Org ${n}`, slug: `org-${n}` }, + }); + const project = await prisma.project.create({ + data: { + name: `Project ${n}`, + slug: `project-${n}`, + externalRef: `proj_${n}`, + organizationId: org.id, + }, + }); + const environment = await prisma.runtimeEnvironment.create({ + data: { + type: "PRODUCTION", + slug: `env-${n}`, + projectId: project.id, + organizationId: org.id, + apiKey: `tr_prod_${n}`, + pkApiKey: `pk_prod_${n}`, + shortcode: `short_${n}`, + }, + }); + return { org, project, environment }; +} + +async function seedWorker( + prisma: PrismaClient, + ctx: { projectId: string; environmentId: string }, + opts?: { promote?: boolean } +) { + const n = seedCounter++; + const worker = await prisma.backgroundWorker.create({ + data: { + friendlyId: `worker_${n}`, + contentHash: `hash_${n}`, + projectId: ctx.projectId, + runtimeEnvironmentId: ctx.environmentId, + version: `2024.1.${n}`, + metadata: {}, + engine: "V2", + }, + }); + const task = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: `task_${n}`, + slug: `my-task-${n}`, + filePath: "index.ts", + exportName: 
"myTask", + workerId: worker.id, + runtimeEnvironmentId: ctx.environmentId, + projectId: ctx.projectId, + }, + }); + const queue = await prisma.taskQueue.create({ + data: { + friendlyId: `queue_${n}`, + name: `task/my-task-${n}`, + runtimeEnvironmentId: ctx.environmentId, + projectId: ctx.projectId, + workers: { connect: { id: worker.id } }, + }, + }); + let deployment = null; + if (opts?.promote) { + deployment = await prisma.workerDeployment.create({ + data: { + friendlyId: `deployment_${n}`, + contentHash: `hash_${n}`, + version: worker.version, + shortCode: `dep_${n}`, + type: "MANAGED", + status: "DEPLOYED", + projectId: ctx.projectId, + environmentId: ctx.environmentId, + workerId: worker.id, + }, + }); + await prisma.workerDeploymentPromotion.create({ + data: { + label: "current", + deploymentId: deployment.id, + environmentId: ctx.environmentId, + }, + }); + } + return { worker, task, queue, deployment }; +} + +// --- cache unit tests (no DB) ---------------------------------------------- + +describe("ControlPlaneCache", () => { + it("caches null as a confirmed absence (distinct from a miss)", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + expect(cache.getEnv("env_x")).toBeUndefined(); + cache.setEnv("env_x", null); + expect(cache.getEnv("env_x")).toBeNull(); + }); + + it("invalidateEnv drops the entry (next read is a miss)", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + cache.setEnv("env_y", { id: "env_y" } as any); + cache.invalidateEnv("env_y"); + expect(cache.getEnv("env_y")).toBeUndefined(); + }); + + it("invalidating one key does not affect another", () => { + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + cache.setEnv("env_a", { id: "env_a" } as any); + cache.setEnv("env_b", { id: "env_b" } as any); + cache.invalidateEnv("env_a"); + expect(cache.getEnv("env_a")).toBeUndefined(); + expect(cache.getEnv("env_b")).toMatchObject({ id: "env_b" }); + }); 
+}); + +// --- resolveEnv ------------------------------------------------------------- + +heteroPostgresTest( + "resolveEnv returns the cross-DB env row and caches it", + async ({ prisma14 }) => { + const { environment, org } = await seedControlPlane(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + const first = await resolver.resolveEnv(environment.id); + expect(first).toMatchObject({ + id: environment.id, + projectId: environment.projectId, + organizationId: org.id, + type: "PRODUCTION", + archivedAt: null, + }); + expect(reads()).toBe(1); + + const second = await resolver.resolveEnv(environment.id); + expect(second).toEqual(first); + expect(reads()).toBe(1); + } +); + +heteroPostgresTest("resolveEnv caches a null absence", async ({ prisma14 }) => { + const cache = new ControlPlaneCache(); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + expect(await resolver.resolveEnv("env_does_not_exist")).toBeNull(); + expect(reads()).toBe(1); + expect(await resolver.resolveEnv("env_does_not_exist")).toBeNull(); + expect(reads()).toBe(1); +}); + +heteroPostgresTest( + "resolveEnv passthrough (split OFF) hits the DB every time, no cache", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveEnv(environment.id); + await resolver.resolveEnv(environment.id); 
+ expect(reads()).toBe(2); + } +); + +// --- resolveWorkerVersion --------------------------------------------------- + +heteroPostgresTest( + "resolveWorkerVersion (pinned) returns worker/tasks/queues and caches it", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, task, queue } = await seedWorker(prisma14, { + projectId: project.id, + environmentId: environment.id, + }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const first = await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + expect(first?.worker.id).toBe(worker.id); + expect(first?.tasks.map((t) => t.id)).toContain(task.id); + expect(first?.queues.map((q) => q.id)).toContain(queue.id); + expect(first?.deployment).toBeNull(); + const readsAfterFirst = reads(); + expect(readsAfterFirst).toBeGreaterThanOrEqual(1); + + const second = await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + expect(second?.worker.id).toBe(worker.id); + expect(reads()).toBe(readsAfterFirst); + } +); + +heteroPostgresTest( + "resolveWorkerVersion (current deployment) resolves the promoted worker", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, deployment } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const first = await resolver.resolveWorkerVersion({ environmentId: 
environment.id }); + expect(first?.worker.id).toBe(worker.id); + expect(first?.deployment?.id).toBe(deployment?.id); + const readsAfterFirst = reads(); + + const second = await resolver.resolveWorkerVersion({ environmentId: environment.id }); + expect(second?.worker.id).toBe(worker.id); + expect(reads()).toBe(readsAfterFirst); + } +); + +heteroPostgresTest( + "resolveWorkerVersion passthrough (split OFF) re-reads every call", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker } = await seedWorker(prisma14, { + projectId: project.id, + environmentId: environment.id, + }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + const readsAfterFirst = reads(); + await resolver.resolveWorkerVersion({ + environmentId: environment.id, + backgroundWorkerId: worker.id, + }); + expect(reads()).toBe(readsAfterFirst * 2); + } +); + +// --- assertEnvExists -------------------------------------------------------- + +heteroPostgresTest( + "assertEnvExists resolves for a seeded env, caches, and throws for a missing one", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + await expect(resolver.assertEnvExists(environment.id)).resolves.toBeUndefined(); + expect(reads()).toBe(1); + await expect(resolver.assertEnvExists(environment.id)).resolves.toBeUndefined(); + expect(reads()).toBe(1); + + await 
expect(resolver.assertEnvExists("env_missing")).rejects.toBeInstanceOf( + ControlPlaneReferenceError + ); + } +); + +heteroPostgresTest( + "assertEnvExists passthrough (split OFF) is a no-op: never reads, never throws", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + // Split OFF = single DB, run and env co-located, so there is nothing to assert + // and the hot-path read is skipped entirely — resolves for present and missing. + await expect(resolver.assertEnvExists(environment.id)).resolves.toBeUndefined(); + await expect(resolver.assertEnvExists("env_missing")).resolves.toBeUndefined(); + expect(reads()).toBe(0); + } +); + +// --- resolveAuthenticatedEnv ------------------------------------------------ + +heteroPostgresTest( + "resolveAuthenticatedEnv returns the toAuthenticated shape and caches it", + async ({ prisma14 }) => { + const { environment, project, org } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + const first = await resolver.resolveAuthenticatedEnv(environment.id); + expect(first).not.toBeNull(); + expect(first!.id).toBe(environment.id); + expect(first!.slug).toBe(environment.slug); + expect(first!.type).toBe("PRODUCTION"); + expect(first!.organizationId).toBe(org.id); + expect(first!.projectId).toBe(project.id); + expect(first!.project.id).toBe(project.id); + expect(first!.project.externalRef).toBe(project.externalRef); + expect(first!.organization.id).toBe(org.id); + 
expect(first!.organization.title).toBe(org.title); + // concurrencyLimitBurstFactor is coerced to a plain number by toAuthenticated(). + expect(typeof first!.concurrencyLimitBurstFactor).toBe("number"); + expect(reads()).toBe(1); + + const second = await resolver.resolveAuthenticatedEnv(environment.id); + expect(second).toEqual(first); + expect(reads()).toBe(1); + + expect(await resolver.resolveAuthenticatedEnv("env_missing")).toBeNull(); + } +); + +heteroPostgresTest( + "resolveAuthenticatedEnv populates parentEnvironment { id, apiKey } for a branch env", + async ({ prisma14 }) => { + const m = seedCounter++; + const org = await prisma14.organization.create({ + data: { title: `Org wp ${m}`, slug: `org-wp-${m}` }, + }); + const project = await prisma14.project.create({ + data: { + name: `P wp ${m}`, + slug: `p-wp-${m}`, + externalRef: `proj_wp_${m}`, + organizationId: org.id, + }, + }); + const parent = await prisma14.runtimeEnvironment.create({ + data: { + type: "PREVIEW", + slug: `preview-parent-${m}`, + projectId: project.id, + organizationId: org.id, + apiKey: `tr_parent_key_${m}`, + pkApiKey: `pk_parent_${m}`, + shortcode: `sc_parent_${m}`, + }, + }); + const branch = await prisma14.runtimeEnvironment.create({ + data: { + type: "PREVIEW", + slug: `preview-branch-${m}`, + branchName: "feat/x", + projectId: project.id, + organizationId: org.id, + apiKey: `tr_branch_key_${m}`, + pkApiKey: `pk_branch_${m}`, + shortcode: `sc_branch_${m}`, + parentEnvironmentId: parent.id, + }, + }); + + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const env = await resolver.resolveAuthenticatedEnv(branch.id); + expect(env).not.toBeNull(); + expect(env!.apiKey).toBe(`tr_branch_key_${m}`); + expect(env!.parentEnvironment).not.toBeNull(); + expect(env!.parentEnvironment!.id).toBe(parent.id); + 
expect(env!.parentEnvironment!.apiKey).toBe(`tr_parent_key_${m}`); + + const noParent = await resolver.resolveAuthenticatedEnv(parent.id); + expect(noParent!.parentEnvironment).toBeNull(); + } +); + +heteroPostgresTest( + "resolveAuthenticatedEnv passthrough (split OFF) hits the DB every time, no cache", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveAuthenticatedEnv(environment.id); + await resolver.resolveAuthenticatedEnv(environment.id); + expect(reads()).toBe(2); + } +); + +heteroPostgresTest( + "resolveAuthenticatedEnv carries the `git` column (cached across calls)", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const gitMeta = { commitSha: "abc123", branchName: "main" }; + await prisma14.runtimeEnvironment.update({ + where: { id: environment.id }, + data: { git: gitMeta }, + }); + + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const first = await resolver.resolveAuthenticatedEnv(environment.id); + expect(first).not.toBeNull(); + expect(first!.git).toEqual(gitMeta); + expect(reads()).toBe(1); + + // Served from cache, still carrying `git`. 
+ const second = await resolver.resolveAuthenticatedEnv(environment.id); + expect(second!.git).toEqual(gitMeta); + expect(reads()).toBe(1); + } +); + +// --- invalidation over the DB boundary ------------------------------------- + +heteroPostgresTest( + "invalidateEnvironment forces resolveEnv/resolveAuthenticatedEnv to re-read after a write", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache, + splitEnabled: () => true, + }); + + // Warm both env-scoped slots. + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).not.toBe(999); + expect((await resolver.resolveAuthenticatedEnv(environment.id))!.paused).toBe(false); + + // Control-plane write + invalidation (as a write site would do). + await prisma14.runtimeEnvironment.update({ + where: { id: environment.id }, + data: { maximumConcurrencyLimit: 999, paused: true }, + }); + resolver.invalidateEnvironment(environment.id); + + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).toBe(999); + expect((await resolver.resolveAuthenticatedEnv(environment.id))!.paused).toBe(true); + } +); + +heteroPostgresTest( + "without invalidation a cached env stays stale after a control-plane write (fail-before contrast)", + async ({ prisma14 }) => { + const { environment } = await seedControlPlane(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const before = (await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit; + await prisma14.runtimeEnvironment.update({ + where: { id: environment.id }, + data: { maximumConcurrencyLimit: 777 }, + }); + + // No invalidation: the 
cache still serves the pre-write value (this is the bug the + // write-site invalidation fixes). + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).toBe(before); + + // And with invalidation it re-reads. + resolver.invalidateEnvironment(environment.id); + expect((await resolver.resolveEnv(environment.id))!.maximumConcurrencyLimit).toBe(777); + } +); + +heteroPostgresTest( + "invalidateOrganization forces every env of the org to re-read after an org write", + async ({ prisma14 }) => { + const { org: organization, project } = await seedControlPlane(prisma14); + // A second env in the same org. + const m = seedCounter++; + const secondEnv = await prisma14.runtimeEnvironment.create({ + data: { + type: "STAGING", + slug: `env-second-${m}`, + projectId: project.id, + organizationId: organization.id, + apiKey: `tr_stg_${m}`, + pkApiKey: `pk_stg_${m}`, + shortcode: `short_stg_${m}`, + }, + }); + const firstEnv = await prisma14.runtimeEnvironment.findFirstOrThrow({ + where: { projectId: project.id, type: "PRODUCTION" }, + }); + + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: prisma14, + controlPlanePrimary: prisma14, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + // Warm both envs' authEnv slots. + expect((await resolver.resolveAuthenticatedEnv(firstEnv.id))!.organization.runsEnabled).toBe( + true + ); + expect((await resolver.resolveAuthenticatedEnv(secondEnv.id))!.organization.runsEnabled).toBe( + true + ); + + // Org-level write (runsEnabled) + a single org invalidation. + await prisma14.organization.update({ + where: { id: organization.id }, + data: { runsEnabled: false }, + }); + resolver.invalidateOrganization(organization.id); + + // BOTH envs re-read and now observe the org change, with no reverse org->env index. 
+ expect((await resolver.resolveAuthenticatedEnv(firstEnv.id))!.organization.runsEnabled).toBe( + false + ); + expect((await resolver.resolveAuthenticatedEnv(secondEnv.id))!.organization.runsEnabled).toBe( + false + ); + } +); + +// --- resolveRunLockedWorker ------------------------------------------------- + +heteroPostgresTest( + "resolveRunLockedWorker returns lockedBy (task+worker+deployment) and lockedToVersion, caches it", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, task, deployment } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting, reads } = countQueries(prisma14); + const cache = new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache, + splitEnabled: () => true, + }); + + const first = await resolver.resolveRunLockedWorker({ + lockedById: task.id, + lockedToVersionId: worker.id, + }); + expect(first).not.toBeNull(); + expect(first!.lockedBy!.id).toBe(task.id); + expect(first!.lockedBy!.filePath).toBe(task.filePath); + expect(first!.lockedBy!.slug).toBe(task.slug); + expect(first!.lockedBy!.exportName).toBe(task.exportName); + expect(first!.lockedBy!.machineConfig).toEqual(task.machineConfig); + expect(first!.lockedBy!.worker.id).toBe(worker.id); + expect(first!.lockedBy!.worker.version).toBe(worker.version); + expect(first!.lockedBy!.worker.deployment!.friendlyId).toBe(deployment!.friendlyId); + expect(first!.lockedToVersion!.version).toBe(worker.version); + expect(first!.lockedToVersion!.supportsLazyAttempts).toBe(worker.supportsLazyAttempts); + const readsAfterFirst = reads(); + expect(readsAfterFirst).toBeGreaterThanOrEqual(1); + + const second = await resolver.resolveRunLockedWorker({ + lockedById: task.id, + lockedToVersionId: worker.id, + }); + 
expect(second).toEqual(first); + expect(reads()).toBe(readsAfterFirst); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker returns null lockedBy/lockedToVersion when ids are absent", + async ({ prisma14 }) => { + const { client: counting } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => true, + }); + + const resolved = await resolver.resolveRunLockedWorker({ + lockedById: null, + lockedToVersionId: null, + }); + expect(resolved).not.toBeNull(); + expect(resolved!.lockedBy).toBeNull(); + expect(resolved!.lockedToVersion).toBeNull(); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker resolves lockedBy only when lockedToVersionId is absent", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { task } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const result = await resolver.resolveRunLockedWorker({ lockedById: task.id }); + expect(result).not.toBeNull(); + expect(result!.lockedBy!.id).toBe(task.id); + expect(result!.lockedBy!.slug).toBe(task.slug); + expect(result!.lockedToVersion).toBeNull(); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker resolves lockedToVersion only when lockedById is absent", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker } = await seedWorker( + prisma14, + { projectId: project.id, environmentId: environment.id }, + { promote: true } + ); + const { client: counting } = countQueries(prisma14); + const resolver = new 
ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }), + splitEnabled: () => true, + }); + + const result = await resolver.resolveRunLockedWorker({ lockedToVersionId: worker.id }); + expect(result).not.toBeNull(); + expect(result!.lockedToVersion!.version).toBe(worker.version); + expect(result!.lockedBy).toBeNull(); + } +); + +heteroPostgresTest( + "resolveRunLockedWorker passthrough (split OFF) re-reads every call", + async ({ prisma14 }) => { + const { environment, project } = await seedControlPlane(prisma14); + const { worker, task } = await seedWorker(prisma14, { + projectId: project.id, + environmentId: environment.id, + }); + const { client: counting, reads } = countQueries(prisma14); + const resolver = new ControlPlaneResolver({ + controlPlaneReplica: counting, + controlPlanePrimary: counting, + cache: new ControlPlaneCache(), + splitEnabled: () => false, + }); + + await resolver.resolveRunLockedWorker({ lockedById: task.id, lockedToVersionId: worker.id }); + const readsAfterFirst = reads(); + await resolver.resolveRunLockedWorker({ lockedById: task.id, lockedToVersionId: worker.id }); + expect(reads()).toBe(readsAfterFirst * 2); + } +); diff --git a/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts b/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts new file mode 100644 index 00000000000..d2baaa6404a --- /dev/null +++ b/apps/webapp/test/v3/runOpsMigration/distinctDbSentinel.server.test.ts @@ -0,0 +1,64 @@ +import { heteroPostgresTest } from "@internal/testcontainers"; +import { PrismaClient } from "@trigger.dev/database"; +import { describe, expect, vi } from "vitest"; +import { probeDistinctDatabases } from "~/v3/runOpsMigration/distinctDbSentinel.server"; + +// Spinning up two separate postgres clusters and probing each can exceed the 5s default. 
+vi.setConfig({ testTimeout: 60_000 });
+
+/**
+ * Returns `uri` with its path replaced so that it addresses `database`. Everything else
+ * in the URL (credentials, host, port, query string) is carried over unchanged.
+ *
+ * The name is percent-encoded so that a database name containing `/`, `%` or another
+ * reserved character stays a single path segment instead of reshaping the URL. Names
+ * made only of letters, digits and `_` (all names used in this file) encode to themselves.
+ */
+function urlWithDatabase(uri: string, database: string): string {
+  const url = new URL(uri);
+  url.pathname = `/${encodeURIComponent(database)}`;
+  return url.toString();
+}
+
+describe("probeDistinctDatabases", () => {
+  heteroPostgresTest(
+    "reports distinct for two separate physical clusters",
+    async ({ uri14, uri17 }) => {
+      const result = await probeDistinctDatabases(uri14, uri17);
+      expect(result).toEqual({ distinct: true });
+    }
+  );
+
+  heteroPostgresTest(
+    "reports NOT distinct, citing the same physical database, when both URLs point at it",
+    async ({ uri14 }) => {
+      const result = await probeDistinctDatabases(uri14, uri14);
+      expect(result.distinct).toBe(false);
+      // Presumably this guard exists to narrow the result type before `reason` is read;
+      // the assertion above already fails the test when `distinct` is not false.
+      if (result.distinct === false) {
+        expect(result.reason).toMatch(/same physical database/i);
+      }
+    }
+  );
+
+  heteroPostgresTest(
+    "reports distinct for two databases in the SAME cluster",
+    async ({ postgresContainer14, uri14 }) => {
+      const otherDb = `sentinel_other_${Date.now()}`;
+      // Connect to the `postgres` database of the same container to create a sibling database.
+      const admin = new PrismaClient({
+        datasources: {
+          db: { url: urlWithDatabase(postgresContainer14.getConnectionUri(), "postgres") },
+        },
+      });
+      // Disconnect the admin client whether or not CREATE DATABASE succeeded.
+      try {
+        await admin.$executeRawUnsafe(`CREATE DATABASE "${otherDb}"`);
+      } finally {
+        await admin.$disconnect();
+      }
+
+      const otherUrl = urlWithDatabase(uri14, otherDb);
+      const result = await probeDistinctDatabases(uri14, otherUrl);
+      expect(result).toEqual({ distinct: true });
+    }
+  );
+
+  heteroPostgresTest(
+    "fails closed to NOT distinct when a probe cannot reach a database",
+    async ({ uri14 }) => {
+      const unreachable = "postgresql://nobody:nobody@127.0.0.1:1/does_not_exist";
+      const result = await probeDistinctDatabases(uri14, unreachable);
+      expect(result.distinct).toBe(false);
+    }
+  );
+});
diff --git a/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts
new file mode 100644
index 00000000000..5e1c6902f7c
--- /dev/null
+++ b/apps/webapp/test/v3/runOpsMigration/runEngineControlPlaneResolver.server.test.ts
@@ -0,0 +1,282 @@
+// The webapp adapter presents the cross-DB app ControlPlaneResolver as the run-engine seam.
+// Proven over real testcontainers (never mocked): resolveEnv maps onto the MinimalAuthenticatedEnv
+// superset; resolveWorkerVersion forwards the env type so the engine dequeue dispatch (DEV
+// most-recent / MANAGED promotion) runs; assertEnvExists delegates and rejects on a missing env.
+import { heteroPostgresTest } from "@internal/testcontainers";
+import type { PrismaClient } from "@trigger.dev/database";
+import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic";
+import { describe, expect, vi } from "vitest";
+import { ControlPlaneCache } from "~/v3/runOpsMigration/controlPlaneCache.server";
+import { ControlPlaneResolver } from "~/v3/runOpsMigration/controlPlaneResolver.server";
+import { RunEngineControlPlaneResolver } from "~/v3/runOpsMigration/runEngineControlPlaneResolver.server";
+
+vi.setConfig({ testTimeout: 60_000 });
+
+// Monotonic counter; seedEnv uses it to give every seeded row a unique suffix.
+let n = 0;
+
+/**
+ * Builds the app-level resolver over one client (used as both primary and replica) with a
+ * fresh 60s / 100-entry cache. The split is OFF unless `opts.splitEnabled` is true.
+ */
+function buildAppResolver(controlPlane: PrismaClient, opts?: { splitEnabled?: boolean }) {
+  return new ControlPlaneResolver({
+    controlPlanePrimary: controlPlane,
+    controlPlaneReplica: controlPlane,
+    cache: new ControlPlaneCache({ ttlMs: 60_000, maxEntries: 100 }),
+    splitEnabled: () => opts?.splitEnabled ?? false,
+  });
+}
+
+/**
+ * Wraps a real testcontainer PrismaClient with a `$extends` query hook counting DB operations.
+ * Not a mock — the real query still runs; we only observe the boundary to prove cache hits.
+ */
+function countQueries(client: PrismaClient): { client: PrismaClient; reads: () => number } {
+  let count = 0;
+  const extended = client.$extends({
+    query: {
+      async $allOperations({ args, query }) {
+        count++;
+        return query(args);
+      },
+    },
+  }) as unknown as PrismaClient;
+  return { client: extended, reads: () => count };
+}
+
+/**
+ * Seeds an organization, a project and one runtime environment of the given `type`
+ * (maximumConcurrencyLimit 9), each named with a fresh `re-N` suffix that is returned too.
+ */
+async function seedEnv(prisma: PrismaClient, type: "PRODUCTION" | "DEVELOPMENT") {
+  const suffix = `re-${n++}`;
+  const organization = await prisma.organization.create({
+    data: { title: `Org ${suffix}`, slug: `org-${suffix}` },
+  });
+  const project = await prisma.project.create({
+    data: {
+      name: `Project ${suffix}`,
+      slug: `project-${suffix}`,
+      externalRef: `proj_${suffix}`,
+      organizationId: organization.id,
+    },
+  });
+  const environment = await prisma.runtimeEnvironment.create({
+    data: {
+      type,
+      slug: suffix,
+      projectId: project.id,
+      organizationId: organization.id,
+      apiKey: `tr_${suffix}`,
+      pkApiKey: `pk_${suffix}`,
+      shortcode: `short_${suffix}`,
+      maximumConcurrencyLimit: 9,
+    },
+  });
+  return { organization, project, environment, suffix };
+}
+
+/**
+ * Seeds a V2 background worker plus one task and one VIRTUAL queue connected to both.
+ * With `opts.deploy` it also creates a DEPLOYED, MANAGED deployment for the worker and
+ * returns it; `opts.promote` only takes effect together with `opts.deploy`.
+ */
+async function seedWorker(
+  prisma: PrismaClient,
+  ctx: { projectId: string; environmentId: string; suffix: string },
+  opts: { promote?: boolean; deploy?: boolean }
+) {
+  const worker = await prisma.backgroundWorker.create({
+    data: {
+      friendlyId: `worker_${ctx.suffix}`,
+      contentHash: `hash_${ctx.suffix}`,
+      projectId: ctx.projectId,
+      runtimeEnvironmentId: ctx.environmentId,
+      version: `2024.1.${ctx.suffix}`,
+      metadata: {},
+      engine: "V2",
+    },
+  });
+  const task = await prisma.backgroundWorkerTask.create({
+    data: {
+      friendlyId: `task_${ctx.suffix}`,
+      slug: "my-task",
+      filePath: "index.ts",
+      exportName: "myTask",
+      workerId: worker.id,
+      runtimeEnvironmentId: ctx.environmentId,
+      projectId: ctx.projectId,
+    },
+  });
+  const queue = await prisma.taskQueue.create({
+    data: {
+      friendlyId: `queue_${ctx.suffix}`,
+      name: "task/my-task",
+      runtimeEnvironmentId: ctx.environmentId,
+      projectId: ctx.projectId,
+      type: "VIRTUAL",
+      workers: { connect: { id: worker.id } },
+      tasks: { connect: { id: task.id } },
+    },
+  });
+  if (opts.deploy) {
+    const deployment = await prisma.workerDeployment.create({
+      data: {
+        friendlyId: `deployment_${ctx.suffix}`,
+        contentHash: worker.contentHash,
+        version: worker.version,
+        shortCode: `short_${ctx.suffix}`,
+        imageReference: `image:${ctx.suffix}`,
+        status: "DEPLOYED",
+        projectId: ctx.projectId,
+        environmentId: ctx.environmentId,
+        workerId: worker.id,
+        type: "MANAGED",
+      },
+    });
+    if (opts.promote) {
+      await prisma.workerDeploymentPromotion.create({
+        data: {
+          label: CURRENT_DEPLOYMENT_LABEL,
+          deploymentId: deployment.id,
+          environmentId: ctx.environmentId,
+        },
+      });
+    }
+    return { worker, task, queue, deployment };
+  }
+  return { worker, task, queue };
+}
+
+describe("RunEngineControlPlaneResolver adapter", () => {
+  heteroPostgresTest(
+    "resolveEnv maps app ResolvedEnv onto ResolvedEngineEnv",
+    async ({ prisma14 }) => {
+      const { organization, project, environment } = await seedEnv(prisma14, "PRODUCTION");
+      const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14));
+
+      const env = await adapter.resolveEnv(environment.id);
+      expect(env).not.toBeNull();
+      expect(env!.id).toBe(environment.id);
+      expect(env!.type).toBe("PRODUCTION");
+      expect(env!.projectId).toBe(project.id);
+      expect(env!.organizationId).toBe(organization.id);
+      // Nested + concurrency fields the run-engine MinimalAuthenticatedEnvironment requires.
+      expect(env!.project.id).toBe(project.id);
+      expect(env!.organization.id).toBe(organization.id);
+      expect(env!.maximumConcurrencyLimit).toBe(9);
+      expect(env!.concurrencyLimitBurstFactor.toNumber()).toBe(2);
+      expect(env!.archivedAt).toBeNull();
+
+      expect(await adapter.resolveEnv("env_missing")).toBeNull();
+    }
+  );
+
+  heteroPostgresTest(
+    "resolveWorkerVersion (deployed, no workerId) resolves the promoted MANAGED deployment",
+    async ({ prisma14 }) => {
+      const { project, environment, suffix } = await seedEnv(prisma14, "PRODUCTION");
+      const seeded = await seedWorker(
+        prisma14,
+        { projectId: project.id, environmentId: environment.id, suffix },
+        { deploy: true, promote: true }
+      );
+      const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14));
+
+      const version = await adapter.resolveWorkerVersion({
+        environmentId: environment.id,
+        type: "PRODUCTION",
+      });
+      expect(version).not.toBeNull();
+      expect(version!.worker.id).toBe(seeded.worker.id);
+      // Guard instead of falling back to `undefined`: a missing seeded deployment would
+      // otherwise compare equal to a resolver result that carries no deployment either.
+      if (!("deployment" in seeded)) {
+        throw new Error("seedWorker({ deploy: true }) did not return a deployment");
+      }
+      expect(version!.deployment?.id).toBe(seeded.deployment.id);
+      expect(version!.tasks.map((t) => t.slug)).toContain("my-task");
+    }
+  );
+
+  heteroPostgresTest(
+    "resolveWorkerVersion (DEVELOPMENT, no workerId) resolves the most-recent worker (no deployment)",
+    async ({ prisma14 }) => {
+      const { project, environment, suffix } = await seedEnv(prisma14, "DEVELOPMENT");
+      const seeded = await seedWorker(
+        prisma14,
+        { projectId: project.id, environmentId: environment.id, suffix },
+        { deploy: false }
+      );
+      const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14));
+
+      const version = await adapter.resolveWorkerVersion({
+        environmentId: environment.id,
+        type: "DEVELOPMENT",
+      });
+      expect(version).not.toBeNull();
+      expect(version!.worker.id).toBe(seeded.worker.id);
+      expect(version!.deployment).toBeNull();
+    }
+  );
+
+  heteroPostgresTest(
+    "assertEnvExists (split ON) resolves for a present env, rejects for a missing one",
+    async ({ prisma14 }) => {
+      const { environment } = await seedEnv(prisma14, "PRODUCTION");
+      // split ON: the only mode where assertEnvExists asserts (split OFF is a no-op,
+      // covered in controlPlaneResolver.server.test.ts).
+      const adapter = new RunEngineControlPlaneResolver(
+        buildAppResolver(prisma14, { splitEnabled: true })
+      );
+
+      await expect(adapter.assertEnvExists(environment.id)).resolves.toBeUndefined();
+      await expect(adapter.assertEnvExists("env_missing")).rejects.toThrow();
+    }
+  );
+
+  heteroPostgresTest(
+    "resolveAuthenticatedEnv delegates to the app resolver, returns `git`, and is cached",
+    async ({ prisma14 }) => {
+      const { environment } = await seedEnv(prisma14, "PRODUCTION");
+      const gitMeta = { commitSha: "deadbeef", branchName: "main" };
+      await prisma14.runtimeEnvironment.update({
+        where: { id: environment.id },
+        data: { git: gitMeta },
+      });
+
+      // split ON so the delegated app resolver caches; the counter proves the second call
+      // is a cache hit rather than re-querying $replica directly (the pre-fix behavior).
+      const { client: counting, reads } = countQueries(prisma14);
+      const adapter = new RunEngineControlPlaneResolver(
+        buildAppResolver(counting, { splitEnabled: true })
+      );
+
+      const first = await adapter.resolveAuthenticatedEnv(environment.id);
+      expect(first).not.toBeNull();
+      expect(first!.id).toBe(environment.id);
+      expect(first!.git).toEqual(gitMeta);
+      expect(reads()).toBe(1);
+
+      const second = await adapter.resolveAuthenticatedEnv(environment.id);
+      expect(second!.git).toEqual(gitMeta);
+      expect(reads()).toBe(1);
+    }
+  );
+
+  heteroPostgresTest(
+    "resolveAuthenticatedEnv returns null for a deleted project",
+    async ({ prisma14 }) => {
+      const { environment, project } = await seedEnv(prisma14, "PRODUCTION");
+      await prisma14.project.update({
+        where: { id: project.id },
+        data: { deletedAt: new Date() },
+      });
+
+      const adapter = new RunEngineControlPlaneResolver(buildAppResolver(prisma14));
+
+      expect(await adapter.resolveAuthenticatedEnv(environment.id)).toBeNull();
+    }
+  );
+});
diff --git a/apps/webapp/vitest.config.ts b/apps/webapp/vitest.config.ts
index 69eb980732f..8e05aec1ebc 100644
--- a/apps/webapp/vitest.config.ts
+++ b/apps/webapp/vitest.config.ts
@@ -5,7 +5,18 @@ import tsconfigPaths from "vite-tsconfig-paths";
 export default defineConfig({
   test: {
     sequence: { sequencer: DurationShardingSequencer },
-    include: ["test/**/*.test.ts"],
+    // Webapp tests live under test/**; the run-ops migration family
+    // colocates its *.server.test.ts next to source under app/v3/runOpsMigration/.
+    // The run-store seam test colocates next to its source at app/v3/runStore.server.test.ts.
+    // Pure unit tests for runEngine concerns colocate next to their source file.
+    include: [
+      "test/**/*.test.ts",
+      "app/v3/runOpsMigration/**/*.test.ts",
+      "app/v3/runStore.server.test.ts",
+      "app/v3/services/bulk/**/*.test.ts",
+      "app/runEngine/concerns/**/*.test.ts",
+      "app/runEngine/services/**/*.test.ts",
+    ],
     // *.e2e.test.ts: smoke matrix, run via vitest.e2e.config.ts.
// *.e2e.full.test.ts: full auth suite, runs via vitest.e2e.full.config.ts // (needs a globalSetup-spawned webapp + Postgres container). diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 034f7d76ef4..1a56a054f42 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -371,6 +371,9 @@ importers: '@internal/run-engine': specifier: workspace:* version: link:../../internal-packages/run-engine + '@internal/run-ops-database': + specifier: workspace:* + version: link:../../internal-packages/run-ops-database '@internal/run-store': specifier: workspace:* version: link:../../internal-packages/run-store @@ -936,6 +939,9 @@ importers: '@tailwindcss/typography': specifier: ^0.5.9 version: 0.5.9(tailwindcss@3.4.1) + '@testcontainers/postgresql': + specifier: ^11.14.0 + version: 11.14.0 '@total-typescript/ts-reset': specifier: ^0.4.2 version: 0.4.2