From b229cd3cd0042cbbcc1585ea08497a6839c9ed7e Mon Sep 17 00:00:00 2001 From: Katia Bulatova Date: Wed, 1 Jul 2026 12:16:05 +0000 Subject: [PATCH] feat(webapp): notify customers of incidents via BetterStack webhook When a status report is published on the status page, an inbound BetterStack webhook fans the update out over Slack, email, and Discord. Fires only on published incident updates, not monitor auto-alerts. Deduped per update; each channel no-ops unless its own config is present. --- .server-changes/incident-notifications.md | 12 + apps/webapp/app/env.server.ts | 16 ++ .../webhooks.v1.betterstack-incidents.ts | 72 +++++ .../services/betterstack/incidentWebhook.ts | 96 +++++++ apps/webapp/app/utils/redactUrl.ts | 28 ++ apps/webapp/app/v3/alertsWorker.server.ts | 69 +++++ .../deliverDiscord.server.ts | 27 ++ .../deliverEmail.server.ts | 66 +++++ .../deliverSlack.server.ts | 91 +++++++ .../incidentNotifications/fanout.server.ts | 26 ++ .../alerts/incidentNotifications/messages.ts | 100 +++++++ .../recipients.server.ts | 47 ++++ apps/webapp/app/v3/tracer.server.ts | 39 ++- apps/webapp/server.ts | 38 ++- .../webapp/test/incidentNotifications.test.ts | 247 ++++++++++++++++++ .../emails/emails/incident-notification.tsx | 66 +++++ internal-packages/emails/src/index.tsx | 10 + 17 files changed, 1048 insertions(+), 2 deletions(-) create mode 100644 .server-changes/incident-notifications.md create mode 100644 apps/webapp/app/routes/webhooks.v1.betterstack-incidents.ts create mode 100644 apps/webapp/app/services/betterstack/incidentWebhook.ts create mode 100644 apps/webapp/app/utils/redactUrl.ts create mode 100644 apps/webapp/app/v3/services/alerts/incidentNotifications/deliverDiscord.server.ts create mode 100644 apps/webapp/app/v3/services/alerts/incidentNotifications/deliverEmail.server.ts create mode 100644 apps/webapp/app/v3/services/alerts/incidentNotifications/deliverSlack.server.ts create mode 100644 apps/webapp/app/v3/services/alerts/incidentNotifications/fanout.server.ts create mode 100644 apps/webapp/app/v3/services/alerts/incidentNotifications/messages.ts create mode 100644 apps/webapp/app/v3/services/alerts/incidentNotifications/recipients.server.ts create mode 100644 apps/webapp/test/incidentNotifications.test.ts create mode 100644 internal-packages/emails/emails/incident-notification.tsx diff --git a/.server-changes/incident-notifications.md b/.server-changes/incident-notifications.md new file mode 100644 index 00000000000..118da39bbd4 --- /dev/null +++ b/.server-changes/incident-notifications.md @@ -0,0 +1,12 @@ +--- +area: webapp +type: feature +--- + +Add an inbound webhook (`POST /webhooks/v1/betterstack-incidents`) that receives +status-page incident updates and proactively notifies customers over Slack +(channels matching a configurable name prefix), email (org admins, via the +alerts email transport), and Discord (an incoming webhook). Delivery runs on the +alerts redis-worker with per-surface jobs and is deduped on the incident update +id. Gated by `INCIDENT_NOTIFY_ENABLED` plus a shared-secret token in the webhook +URL; each surface no-ops unless its own config is present. diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 7c57733ad8f..2850e840921 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1606,6 +1606,14 @@ const EnvironmentSchema = z BETTERSTACK_API_KEY: z.string().optional(), BETTERSTACK_STATUS_PAGE_ID: z.string().optional(), + // Incident notifications: fan a published status report out over + // Slack/email/Discord. Each surface no-ops unless configured; the unsigned + // webhook is gated by a shared secret in the URL. + INCIDENT_NOTIFY_ENABLED: z.string().default("0"), + BETTERSTACK_INCIDENT_WEBHOOK_SECRET: z.string().optional(), + INCIDENT_NOTIFY_SLACK_CHANNEL_PREFIX: z.string().optional(), + INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL: z.string().optional(), + RUN_REPLICATION_REDIS_HOST: z .string() .optional() @@ -2010,6 +2018,14 @@ const EnvironmentSchema = z .and(GithubAppEnvSchema) .and(S2EnvSchema) .superRefine((env, ctx) => { + if (env.INCIDENT_NOTIFY_ENABLED === "1" && !env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + path: ["BETTERSTACK_INCIDENT_WEBHOOK_SECRET"], + message: "BETTERSTACK_INCIDENT_WEBHOOK_SECRET is required when INCIDENT_NOTIFY_ENABLED=1", + }); + } + const presets = new Set(env.COMPUTE_TEMPLATE_MACHINE_PRESETS); for (const required of env.COMPUTE_TEMPLATE_MACHINE_PRESETS_REQUIRED) { if (!presets.has(required)) { diff --git a/apps/webapp/app/routes/webhooks.v1.betterstack-incidents.ts b/apps/webapp/app/routes/webhooks.v1.betterstack-incidents.ts new file mode 100644 index 00000000000..42dba7613d5 --- /dev/null +++ b/apps/webapp/app/routes/webhooks.v1.betterstack-incidents.ts @@ -0,0 +1,72 @@ +import { type ActionFunctionArgs, json } from "@remix-run/server-runtime"; +import { createHash, timingSafeEqual } from "node:crypto"; +import { env } from "~/env.server"; +import { + IncidentWebhookSchema, + isCustomerNotifiableEvent, + normalizeIncidentUpdate, +} from "~/services/betterstack/incidentWebhook"; +import { logger } from "~/services/logger.server"; +import { alertsWorker } from "~/v3/alertsWorker.server"; + +// Inbound status-page webhook. BetterStack can't send custom headers, so we +// auth via a `?token=` shared secret (redacted from logs at ingress). 404 when +// disabled or unconfigured. We 200 fast and hand off to the worker; the enqueue +// is deduped on the update id since BetterStack redelivers on failure. +export async function action({ request }: ActionFunctionArgs) { + if (request.method.toUpperCase() !== "POST") { + return json({ error: "Method not allowed" }, { status: 405 }); + } + + const secret = env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET; + if (env.INCIDENT_NOTIFY_ENABLED !== "1" || !secret) { + return json({ error: "Not found" }, { status: 404 }); + } + + const token = new URL(request.url).searchParams.get("token") ?? ""; + if (!secretsMatch(token, secret)) { + return json({ error: "Invalid token" }, { status: 401 }); + } + + const rawBody = await request.text(); + + let parsed: unknown; + try { + parsed = JSON.parse(rawBody); + } catch { + return json({ error: "Invalid JSON" }, { status: 400 }); + } + + const payload = IncidentWebhookSchema.safeParse(parsed); + if (!payload.success) { + logger.warn("BetterStack incident webhook: invalid payload", { + issues: payload.error.issues, + }); + return json({ error: "Invalid payload", issues: payload.error.issues }, { status: 400 }); + } + + // Maintenance and component-update events are not customer incidents. + if (!isCustomerNotifiableEvent(payload.data)) { + return json({ ignored: true, reason: "non_incident_event" }, { status: 200 }); + } + + const update = normalizeIncidentUpdate(payload.data); + if (!update) { + return json({ ignored: true, reason: "no_updates" }, { status: 200 }); + } + + await alertsWorker.enqueueOnce({ + id: `incident-notify:${update.updateId}`, + job: "v3.fanoutIncidentNotification", + payload: update, + }); + + return json({ received: true }, { status: 200 }); +} + +// Hash both sides so timingSafeEqual gets equal-length buffers without leaking length. +function secretsMatch(a: string, b: string): boolean { + const aHash = createHash("sha256").update(a).digest(); + const bHash = createHash("sha256").update(b).digest(); + return timingSafeEqual(aHash, bHash); +} diff --git a/apps/webapp/app/services/betterstack/incidentWebhook.ts b/apps/webapp/app/services/betterstack/incidentWebhook.ts new file mode 100644 index 00000000000..178967ce1ff --- /dev/null +++ b/apps/webapp/app/services/betterstack/incidentWebhook.ts @@ -0,0 +1,96 @@ +import { z } from "zod"; + +// Payload for the BetterStack status-page webhook. The endpoint is unsigned, so +// the route auths via a shared secret in the URL. + +// BetterStack sends ids as numbers; accept either and normalize to string. +const IdSchema = z.union([z.string(), z.number()]).transform((v) => String(v)); + +export const IncidentUpdateSchema = z.object({ + id: IdSchema, + status_report_id: IdSchema.optional(), + body: z.string().nullish(), + created_at: z.string().nullish(), + updated_at: z.string().nullish(), +}); + +export const IncidentWebhookSchema = z.object({ + event_type: z.string(), + page: z + .object({ + id: IdSchema.optional(), + status_indicator: z.string().nullish(), + status_description: z.string().nullish(), + }) + .optional(), + // Optional so non-incident callbacks (maintenance/component) parse and are + // ignored instead of 400ing. + incident: z + .object({ + id: IdSchema, + name: z.string().nullish(), + created_at: z.string().nullish(), + updated_at: z.string().nullish(), + shortlink: z.string().nullish(), + incident_updates: z.array(IncidentUpdateSchema).default([]), + }) + .optional(), +}); + +export type IncidentWebhook = z.infer; + +export const NormalizedIncidentUpdateSchema = z.object({ + incidentId: z.string(), + updateId: z.string(), + name: z.string(), + statusIndicator: z.string(), + body: z.string(), + shortlink: z.string().nullable(), + updatedAt: z.string().nullable(), +}); + +export type NormalizedIncidentUpdate = { + incidentId: string; + /** The specific update id — our idempotency key. */ + updateId: string; + name: string; + /** operational | degraded | downtime | maintenance */ + statusIndicator: string; + body: string; + shortlink: string | null; + updatedAt: string | null; +}; + +/** Only published "incident" events notify customers, not monitor auto-alerts. */ +export function isCustomerNotifiableEvent(payload: IncidentWebhook): boolean { + return payload.event_type === "incident" && !!payload.incident; +} + +/** Reduce the webhook to its most recent update, or null if there are none. */ +export function normalizeIncidentUpdate(payload: IncidentWebhook): NormalizedIncidentUpdate | null { + if (!payload.incident) { + return null; + } + + const updates = payload.incident.incident_updates; + if (updates.length === 0) { + return null; + } + + // Sort by created_at so we don't rely on BetterStack's ordering. + const mostRecent = [...updates].sort((a, b) => { + const aTime = a.created_at ? Date.parse(a.created_at) : 0; + const bTime = b.created_at ? Date.parse(b.created_at) : 0; + return bTime - aTime; + })[0]; + + return { + incidentId: payload.incident.id, + updateId: mostRecent.id, + name: payload.incident.name?.trim() || "Service incident", + statusIndicator: payload.page?.status_indicator?.trim() || "downtime", + body: mostRecent.body?.trim() || "", + shortlink: payload.incident.shortlink?.trim() || null, + updatedAt: mostRecent.created_at ?? payload.incident.updated_at ?? null, + }; +} diff --git a/apps/webapp/app/utils/redactUrl.ts b/apps/webapp/app/utils/redactUrl.ts new file mode 100644 index 00000000000..5563ad8de61 --- /dev/null +++ b/apps/webapp/app/utils/redactUrl.ts @@ -0,0 +1,28 @@ +// Credential query params redacted from logs and traces (some webhooks can only +// auth via a URL token). +export const SENSITIVE_QUERY_PARAMS = ["token", "secret", "access_token", "api_key"]; + +/** + * Replace sensitive query param values with `[redacted]`. Accepts absolute or + * path+query URLs; returns malformed input unchanged (never throws). + */ +export function redactSensitiveQueryParams(url: string): string { + const queryStart = url.indexOf("?"); + if (queryStart === -1) { + return url; + } + + try { + const params = new URLSearchParams(url.slice(queryStart + 1)); + let didRedact = false; + for (const key of SENSITIVE_QUERY_PARAMS) { + if (params.has(key)) { + params.set(key, "[redacted]"); + didRedact = true; + } + } + return didRedact ? `${url.slice(0, queryStart)}?${params.toString()}` : url; + } catch { + return url; + } +} diff --git a/apps/webapp/app/v3/alertsWorker.server.ts b/apps/webapp/app/v3/alertsWorker.server.ts index 693b16b738a..ded073865de 100644 --- a/apps/webapp/app/v3/alertsWorker.server.ts +++ b/apps/webapp/app/v3/alertsWorker.server.ts @@ -4,9 +4,17 @@ import { z } from "zod"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import { singleton } from "~/utils/singleton"; +import { NormalizedIncidentUpdateSchema } from "~/services/betterstack/incidentWebhook"; import { DeliverAlertService } from "./services/alerts/deliverAlert.server"; import { DeliverErrorGroupAlertService } from "./services/alerts/deliverErrorGroupAlert.server"; import { ErrorAlertEvaluator } from "./services/alerts/errorAlertEvaluator.server"; +import { deliverIncidentToDiscord } from "./services/alerts/incidentNotifications/deliverDiscord.server"; +import { + deliverIncidentEmailPage, + deliverIncidentEmailToRecipient, +} from "./services/alerts/incidentNotifications/deliverEmail.server"; +import { deliverIncidentToSlack } from "./services/alerts/incidentNotifications/deliverSlack.server"; +import { fanoutIncidentNotification } from "./services/alerts/incidentNotifications/fanout.server"; import { PerformDeploymentAlertsService } from "./services/alerts/performDeploymentAlerts.server"; import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server"; @@ -93,6 +101,52 @@ function initializeWorker() { }, logErrors: true, }, + "v3.fanoutIncidentNotification": { + schema: NormalizedIncidentUpdateSchema, + visibilityTimeoutMs: 30_000, + retry: { + maxAttempts: 3, + }, + logErrors: true, + }, + "v3.deliverIncidentSlack": { + schema: z.object({ update: NormalizedIncidentUpdateSchema }), + visibilityTimeoutMs: 60_000, + retry: { + maxAttempts: 3, + }, + logErrors: true, + }, + "v3.deliverIncidentDiscord": { + schema: z.object({ update: NormalizedIncidentUpdateSchema }), + visibilityTimeoutMs: 30_000, + retry: { + maxAttempts: 3, + }, + logErrors: true, + }, + "v3.deliverIncidentEmail": { + schema: z.object({ + update: NormalizedIncidentUpdateSchema, + cursor: z.string().nullable(), + }), + visibilityTimeoutMs: 60_000, + retry: { + maxAttempts: 3, + }, + logErrors: true, + }, + "v3.deliverIncidentEmailRecipient": { + schema: z.object({ + update: NormalizedIncidentUpdateSchema, + recipient: z.object({ userId: z.string(), email: z.string() }), + }), + visibilityTimeoutMs: 30_000, + retry: { + maxAttempts: 3, + }, + logErrors: true, + }, }, concurrency: { workers: env.ALERTS_WORKER_CONCURRENCY_WORKERS, @@ -126,6 +180,21 @@ function initializeWorker() { const service = new DeliverErrorGroupAlertService(); await service.call(payload); }, + "v3.fanoutIncidentNotification": async ({ payload }) => { + await fanoutIncidentNotification(payload); + }, + "v3.deliverIncidentSlack": async ({ payload }) => { + await deliverIncidentToSlack(payload.update); + }, + "v3.deliverIncidentDiscord": async ({ payload }) => { + await deliverIncidentToDiscord(payload.update); + }, + "v3.deliverIncidentEmail": async ({ payload }) => { + await deliverIncidentEmailPage(payload); + }, + "v3.deliverIncidentEmailRecipient": async ({ payload }) => { + await deliverIncidentEmailToRecipient(payload); + }, }, }); diff --git a/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverDiscord.server.ts b/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverDiscord.server.ts new file mode 100644 index 00000000000..b2091c9ce23 --- /dev/null +++ b/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverDiscord.server.ts @@ -0,0 +1,27 @@ +import { env } from "~/env.server"; +import { type NormalizedIncidentUpdate } from "~/services/betterstack/incidentWebhook"; +import { logger } from "~/services/logger.server"; +import { buildDiscordPayload } from "./messages"; + +/** Post to the Discord webhook. No-op if unconfigured; throws on non-2xx to retry. */ +export async function deliverIncidentToDiscord(update: NormalizedIncidentUpdate): Promise { + const webhookUrl = env.INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL; + if (!webhookUrl) { + logger.debug("Incident Discord delivery skipped: no webhook URL configured"); + return; + } + + const response = await fetch(webhookUrl, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(buildDiscordPayload(update)), + signal: AbortSignal.timeout(10_000), + }); + + if (!response.ok) { + const detail = await response.text().catch(() => ""); + throw new Error(`Discord webhook returned ${response.status}: ${detail.slice(0, 200)}`); + } + + logger.info("Incident Discord delivery complete", { updateId: update.updateId }); +} diff --git a/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverEmail.server.ts b/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverEmail.server.ts new file mode 100644 index 00000000000..9c2752e4a79 --- /dev/null +++ b/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverEmail.server.ts @@ -0,0 +1,66 @@ +import { sendAlertEmail } from "~/services/email.server"; +import { type NormalizedIncidentUpdate } from "~/services/betterstack/incidentWebhook"; +import { logger } from "~/services/logger.server"; +import { alertsWorker } from "~/v3/alertsWorker.server"; +import { incidentUrl, presentStatus } from "./messages"; +import { getIncidentEmailRecipientsPage, type IncidentEmailRecipient } from "./recipients.server"; + +export type DeliverIncidentEmailPayload = { + update: NormalizedIncidentUpdate; + cursor: string | null; +}; + +export type DeliverIncidentEmailToRecipientPayload = { + update: NormalizedIncidentUpdate; + recipient: IncidentEmailRecipient; +}; + +/** + * Fan one page of admin recipients out into deduped per-recipient send jobs, + * then enqueue the next page. Does no sending itself, so a retry of this job + * only re-enqueues (idempotent) rather than re-mailing anyone. + */ +export async function deliverIncidentEmailPage( + payload: DeliverIncidentEmailPayload +): Promise { + const { update, cursor } = payload; + const { recipients, nextCursor } = await getIncidentEmailRecipientsPage(cursor); + + for (const recipient of recipients) { + await alertsWorker.enqueueOnce({ + id: `incident-notify:${update.updateId}:email:recipient:${recipient.userId}`, + job: "v3.deliverIncidentEmailRecipient", + payload: { update, recipient }, + }); + } + + if (nextCursor) { + await alertsWorker.enqueueOnce({ + id: `incident-notify:${update.updateId}:email:${nextCursor}`, + job: "v3.deliverIncidentEmail", + payload: { update, cursor: nextCursor }, + }); + } + + logger.info("Incident email page fanned out", { + page: recipients.length, + hasMore: nextCursor !== null, + updateId: update.updateId, + }); +} + +/** Send the incident email to one recipient; throws so the worker retries. */ +export async function deliverIncidentEmailToRecipient( + payload: DeliverIncidentEmailToRecipientPayload +): Promise { + const { update, recipient } = payload; + const status = presentStatus(update.statusIndicator); + await sendAlertEmail({ + email: "incident-notification", + to: recipient.email, + name: update.name, + statusLabel: status.label, + body: update.body, + url: incidentUrl(update), + }); +} diff --git a/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverSlack.server.ts b/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverSlack.server.ts new file mode 100644 index 00000000000..96952de7bee --- /dev/null +++ b/apps/webapp/app/v3/services/alerts/incidentNotifications/deliverSlack.server.ts @@ -0,0 +1,91 @@ +import { WebClient } from "@slack/web-api"; +import { env } from "~/env.server"; +import { type NormalizedIncidentUpdate } from "~/services/betterstack/incidentWebhook"; +import { logger } from "~/services/logger.server"; +import { singleton } from "~/utils/singleton"; +import { buildSlackBlocks, buildSubject } from "./messages"; + +const slack = singleton("incident-slack-client", () => new WebClient(env.SLACK_BOT_TOKEN)); + +type Channel = { id?: string; name?: string; is_archived?: boolean }; + +/** + * Post to every Slack channel whose name starts with the configured prefix. + * Best-effort per channel. No-op if the bot token or prefix isn't configured. + */ +export async function deliverIncidentToSlack(update: NormalizedIncidentUpdate): Promise { + if (!env.SLACK_BOT_TOKEN) { + logger.debug("Incident Slack delivery skipped: no bot token configured"); + return; + } + + // Blank prefix would match every channel (startsWith("")), so treat as unset. + const prefix = env.INCIDENT_NOTIFY_SLACK_CHANNEL_PREFIX?.trim(); + if (!prefix) { + logger.debug("Incident Slack delivery skipped: no channel prefix configured"); + return; + } + + const channels = await listChannelsWithPrefix(prefix); + + if (channels.length === 0) { + logger.warn("Incident Slack delivery: no matching channels", { prefix }); + return; + } + + const subject = buildSubject(update); + const blocks = buildSlackBlocks(update); + + let delivered = 0; + for (const channel of channels) { + if (!channel.id) { + continue; + } + + try { + await slack.chat.postMessage({ + channel: channel.id, + text: subject, + blocks, + unfurl_links: false, + }); + delivered += 1; + } catch (error) { + // The bot may not be a member of every matching channel — log and move on. + logger.warn("Incident Slack delivery failed for channel", { + channel: channel.name, + error: error instanceof Error ? error.message : error, + }); + } + } + + logger.info("Incident Slack delivery complete", { + matched: channels.length, + delivered, + updateId: update.updateId, + }); +} + +async function listChannelsWithPrefix(prefix: string): Promise { + const matched: Channel[] = []; + let cursor: string | undefined = undefined; + + do { + const response = await slack.conversations.list({ + types: "public_channel,private_channel", + exclude_archived: true, + cursor, + limit: 999, + }); + + for (const channel of response.channels ?? []) { + if (!channel.is_archived && channel.name?.startsWith(prefix)) { + matched.push(channel); + } + } + + cursor = response.response_metadata?.next_cursor || undefined; + } while (cursor); + + return matched; +} diff --git a/apps/webapp/app/v3/services/alerts/incidentNotifications/fanout.server.ts b/apps/webapp/app/v3/services/alerts/incidentNotifications/fanout.server.ts new file mode 100644 index 00000000000..fd8df9a9e28 --- /dev/null +++ b/apps/webapp/app/v3/services/alerts/incidentNotifications/fanout.server.ts @@ -0,0 +1,26 @@ +import { type NormalizedIncidentUpdate } from "~/services/betterstack/incidentWebhook"; +import { alertsWorker } from "~/v3/alertsWorker.server"; + +/** + * Fan an update out into one job per surface, deduped on update id + surface so + * retries never double-notify. Unconfigured surfaces no-op in their handler. + */ +export async function fanoutIncidentNotification(update: NormalizedIncidentUpdate): Promise { + await Promise.all([ + alertsWorker.enqueueOnce({ + id: `incident-notify:${update.updateId}:slack`, + job: "v3.deliverIncidentSlack", + payload: { update }, + }), + alertsWorker.enqueueOnce({ + id: `incident-notify:${update.updateId}:discord`, + job: "v3.deliverIncidentDiscord", + payload: { update }, + }), + alertsWorker.enqueueOnce({ + id: `incident-notify:${update.updateId}:email:start`, + job: "v3.deliverIncidentEmail", + payload: { update, cursor: null }, + }), + ]); +} diff --git a/apps/webapp/app/v3/services/alerts/incidentNotifications/messages.ts b/apps/webapp/app/v3/services/alerts/incidentNotifications/messages.ts new file mode 100644 index 00000000000..55a3203bfeb --- /dev/null +++ b/apps/webapp/app/v3/services/alerts/incidentNotifications/messages.ts @@ -0,0 +1,100 @@ +import { type KnownBlock } from "@slack/web-api"; +import { type NormalizedIncidentUpdate } from "~/services/betterstack/incidentWebhook"; + +// Pure, IO-free formatting helpers shared by every delivery surface. + +const STATUS_PAGE_URL = "https://status.trigger.dev"; + +type StatusPresentation = { + label: string; + /** Emoji used in Slack/Discord text. */ + emoji: string; + /** Whether this update represents a recovery. */ + resolved: boolean; +}; + +export function presentStatus(statusIndicator: string): StatusPresentation { + switch (statusIndicator.toLowerCase()) { + case "operational": + return { label: "Resolved", emoji: "✅", resolved: true }; + case "degraded": + return { label: "Degraded performance", emoji: "⚠️", resolved: false }; + case "maintenance": + return { label: "Maintenance", emoji: "🔧", resolved: false }; + case "downtime": + default: + return { label: "Outage", emoji: "🔴", resolved: false }; + } +} + +export function buildSubject(update: NormalizedIncidentUpdate): string { + const status = presentStatus(update.statusIndicator); + return `[Trigger.dev ${status.label}] ${update.name}`; +} + +/** Link to the incident on the status page, falling back to the page root. */ +export function incidentUrl(update: NormalizedIncidentUpdate): string { + return update.shortlink ?? STATUS_PAGE_URL; +} + +export function buildPlainTextBody(update: NormalizedIncidentUpdate): string { + const status = presentStatus(update.statusIndicator); + const lines = [`${status.emoji} ${update.name} — ${status.label}`]; + + if (update.body) { + lines.push("", update.body); + } + + lines.push("", `Status page: ${incidentUrl(update)}`); + return lines.join("\n"); +} + +export function buildSlackBlocks(update: NormalizedIncidentUpdate): KnownBlock[] { + const status = presentStatus(update.statusIndicator); + + const blocks: KnownBlock[] = [ + { + type: "header", + text: { type: "plain_text", text: `${status.emoji} ${update.name}`, emoji: true }, + }, + { + type: "section", + text: { type: "mrkdwn", text: `*Status:* ${status.label}` }, + }, + ]; + + if (update.body) { + blocks.push({ + type: "section", + text: { type: "mrkdwn", text: update.body }, + }); + } + + blocks.push({ + type: "context", + elements: [{ type: "mrkdwn", text: `<${incidentUrl(update)}|View status page>` }], + }); + + return blocks; +} + +export function buildDiscordPayload(update: NormalizedIncidentUpdate) { + const status = presentStatus(update.statusIndicator); + // Lowercase to match presentStatus so "Degraded" gets amber, not outage red. + const normalizedStatus = update.statusIndicator.toLowerCase(); + + // Green when resolved, amber when degraded, else red. + const color = status.resolved ? 0x2ecc71 : normalizedStatus === "degraded" ? 0xf1c40f : 0xe74c3c; + + return { + embeds: [ + { + title: `${status.emoji} ${update.name}`, + description: update.body || status.label, + url: incidentUrl(update), + color, + footer: { text: `Status: ${status.label}` }, + }, + ], + }; +} diff --git a/apps/webapp/app/v3/services/alerts/incidentNotifications/recipients.server.ts b/apps/webapp/app/v3/services/alerts/incidentNotifications/recipients.server.ts new file mode 100644 index 00000000000..9a63c4d65ed --- /dev/null +++ b/apps/webapp/app/v3/services/alerts/incidentNotifications/recipients.server.ts @@ -0,0 +1,47 @@ +import { prisma, type PrismaClientOrTransaction } from "~/db.server"; + +export const INCIDENT_EMAIL_PAGE_SIZE = 100; + +export type IncidentEmailRecipient = { + userId: string; + email: string; +}; + +export type IncidentEmailRecipientsPage = { + recipients: IncidentEmailRecipient[]; + /** Cursor (last user id) to pass to the next page, or null when done. */ + nextCursor: string | null; +}; + +/** + * One page of recipients: distinct users who ADMIN at least one non-deleted org. + * Transactional, so we don't filter on the marketing-email preference. + */ +export async function getIncidentEmailRecipientsPage( + cursor: string | null, + take: number = INCIDENT_EMAIL_PAGE_SIZE, + db: PrismaClientOrTransaction = prisma +): Promise { + const users = await db.user.findMany({ + where: { + orgMemberships: { + some: { + role: "ADMIN", + organization: { deletedAt: null }, + }, + }, + }, + select: { id: true, email: true }, + orderBy: { id: "asc" }, + take: take + 1, + ...(cursor ? { cursor: { id: cursor }, skip: 1 } : {}), + }); + + const hasMore = users.length > take; + const page = hasMore ? users.slice(0, take) : users; + + return { + recipients: page.map((u) => ({ userId: u.id, email: u.email })), + nextCursor: hasMore ? page[page.length - 1].id : null, + }; +} diff --git a/apps/webapp/app/v3/tracer.server.ts b/apps/webapp/app/v3/tracer.server.ts index 3b924ff8a19..dec23fe29b4 100644 --- a/apps/webapp/app/v3/tracer.server.ts +++ b/apps/webapp/app/v3/tracer.server.ts @@ -52,6 +52,7 @@ import { import { env } from "~/env.server"; import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { singleton } from "~/utils/singleton"; +import { redactSensitiveQueryParams } from "~/utils/redactUrl"; import { LoggerSpanExporter } from "./telemetry/loggerExporter.server"; import { CompactMetricExporter } from "./telemetry/compactMetricExporter.server"; import { logger } from "~/services/logger.server"; @@ -309,7 +310,43 @@ function setupTelemetry() { ]; if (!env.DISABLE_HTTP_INSTRUMENTATION) { - instrumentations.unshift(new HttpInstrumentation(), new ExpressInstrumentation()); + instrumentations.unshift( + new HttpInstrumentation({ + // Redact credential query params from span URL attributes (both legacy + // http.* and stable url.* semconv, in case dual-emit is enabled). + applyCustomAttributesOnSpan: (span, request) => { + // `url` only exists on incoming requests; narrows to IncomingMessage. + if (!("url" in request) || typeof request.url !== "string") { + return; + } + + const rawTarget = request.url; + const redactedTarget = redactSensitiveQueryParams(rawTarget); + if (redactedTarget === rawTarget) { + return; // nothing sensitive to redact + } + + span.setAttribute("http.target", redactedTarget); + const queryStart = redactedTarget.indexOf("?"); + if (queryStart !== -1) { + span.setAttribute("url.query", redactedTarget.slice(queryStart + 1)); + } + + const host = request.headers?.host; + if (host) { + const forwardedProto = request.headers?.["x-forwarded-proto"]; + const forwarded = Array.isArray(forwardedProto) ? forwardedProto[0] : forwardedProto; + const proto = + forwarded?.split(",")[0] ?? + ((request.socket as { encrypted?: boolean })?.encrypted ? "https" : "http"); + const redactedUrl = redactSensitiveQueryParams(`${proto}://${host}${rawTarget}`); + span.setAttribute("http.url", redactedUrl); + span.setAttribute("url.full", redactedUrl); + } + }, + }), + new ExpressInstrumentation() + ); } if (env.INTERNAL_OTEL_TRACE_INSTRUMENT_PRISMA_ENABLED === "1") { diff --git a/apps/webapp/server.ts b/apps/webapp/server.ts index 9d8646b4ca8..c84da1f4c8f 100644 --- a/apps/webapp/server.ts +++ b/apps/webapp/server.ts @@ -24,6 +24,32 @@ const WORKERS = const HTTP_KEEPALIVE_TIMEOUT_MS = Number.parseInt(process.env.HTTP_KEEPALIVE_TIMEOUT_MS || "", 10) || 65 * 1000; +// Local copy of app/utils/redactUrl.ts — server.ts is built without bundling or +// `~` alias resolution, so it can't import app modules. Keep in sync. +const SENSITIVE_QUERY_PARAMS = ["token", "secret", "access_token", "api_key"]; + +function redactSensitiveQueryParams(url: string): string { + const queryStart = url.indexOf("?"); + if (queryStart === -1) { + return url; + } + + try { + const params = new URLSearchParams(url.slice(queryStart + 1)); + let didRedact = false; + for (const key of SENSITIVE_QUERY_PARAMS) { + if (params.has(key)) { + params.set(key, "[redacted]"); + didRedact = true; + } + } + return didRedact ? `${url.slice(0, queryStart)}?${params.toString()}` : url; + } catch { + // Never let redaction (or a malformed query) break request handling. + return url; + } +} + function forkWorkers() { for (let i = 0; i < WORKERS; i++) { cluster.fork(); @@ -116,6 +142,10 @@ if (ENABLE_CLUSTER && cluster.isPrimary) { // log dominates log volume. HTTP_ACCESS_LOG_DISABLED suppresses successful // (2xx) access logs; non-2xx responses are always logged so errors stay visible. const suppressSuccessfulAccessLogs = process.env.HTTP_ACCESS_LOG_DISABLED === "1"; + // Redact credential query params from the access log :url. + morgan.token("url", (req: express.Request) => + redactSensitiveQueryParams(req.originalUrl || req.url) + ); app.use( morgan("tiny", { skip: (_req, res) => @@ -168,7 +198,13 @@ if (ENABLE_CLUSTER && cluster.isPrimary) { res.on("close", () => abortController.abort()); runWithHttpContext( - { requestId, path: req.url, host: req.hostname, method: req.method, abortController }, + { + requestId, + path: redactSensitiveQueryParams(req.url), + host: req.hostname, + method: req.method, + abortController, + }, next ); }); diff --git a/apps/webapp/test/incidentNotifications.test.ts b/apps/webapp/test/incidentNotifications.test.ts new file mode 100644 index 00000000000..a4cd0398e2f --- /dev/null +++ b/apps/webapp/test/incidentNotifications.test.ts @@ -0,0 +1,247 @@ +import { postgresTest } from "@internal/testcontainers"; +import { describe, expect, it } from "vitest"; +import { + IncidentWebhookSchema, + isCustomerNotifiableEvent, + normalizeIncidentUpdate, +} from "~/services/betterstack/incidentWebhook"; +import { + buildDiscordPayload, + buildSubject, + presentStatus, +} from "~/v3/services/alerts/incidentNotifications/messages"; +import { getIncidentEmailRecipientsPage } from "~/v3/services/alerts/incidentNotifications/recipients.server"; + +function samplePayload(overrides: Record = {}) { + return { + event_type: "incident", + page: { id: "page_1", status_indicator: "downtime" }, + incident: { + id: "inc_1", + name: "Elevated error rates", + shortlink: "https://status.trigger.dev/i/abc", + incident_updates: [ + { id: "upd_2", body: "Identified", created_at: "2026-06-30T10:05:00Z" }, + { id: "upd_1", body: "Investigating", created_at: "2026-06-30T10:00:00Z" }, + ], + }, + ...overrides, + }; +} + +describe("incident webhook payload", () => { + it("parses a status-page incident payload", () => { + const parsed = IncidentWebhookSchema.safeParse(samplePayload()); + expect(parsed.success).toBe(true); + }); + + it("accepts numeric ids (BetterStack sends ids as numbers) and normalizes them to strings", () => { + const parsed = IncidentWebhookSchema.safeParse({ + event_type: "incident", + page: { id: 12345, status_indicator: "downtime" }, + incident: { + id: 67890, + name: "Numeric ids", + incident_updates: [ + { id: 111, status_report_id: 222, body: "x", created_at: "2026-06-30T10:00:00Z" }, + ], + }, + }); + + expect(parsed.success).toBe(true); + if (parsed.success) { + const normalized = normalizeIncidentUpdate(parsed.data); + expect(normalized?.incidentId).toBe("67890"); + expect(normalized?.updateId).toBe("111"); + } + }); + + it("only treats incident events as customer-notifiable", () => { + const incident = IncidentWebhookSchema.parse(samplePayload()); + expect(isCustomerNotifiableEvent(incident)).toBe(true); + + const maintenance = IncidentWebhookSchema.parse(samplePayload({ event_type: "maintenance" })); + expect(isCustomerNotifiableEvent(maintenance)).toBe(false); + + const component = IncidentWebhookSchema.parse( + samplePayload({ event_type: "component_update" }) + ); + expect(isCustomerNotifiableEvent(component)).toBe(false); + }); + + it("parses non-incident callbacks that omit the incident payload", () => { + // Maintenance/component callbacks arrive without an `incident` field. They + // must still parse (so the route can 200 + ignore them) rather than 400. + const parsed = IncidentWebhookSchema.safeParse({ + event_type: "maintenance", + page: { id: "page_1", status_indicator: "maintenance" }, + }); + + expect(parsed.success).toBe(true); + if (parsed.success) { + expect(isCustomerNotifiableEvent(parsed.data)).toBe(false); + expect(normalizeIncidentUpdate(parsed.data)).toBeNull(); + } + }); + + it("normalizes to the most recent update", () => { + const payload = IncidentWebhookSchema.parse(samplePayload()); + const normalized = normalizeIncidentUpdate(payload); + + expect(normalized).not.toBeNull(); + expect(normalized?.updateId).toBe("upd_2"); + expect(normalized?.body).toBe("Identified"); + expect(normalized?.incidentId).toBe("inc_1"); + expect(normalized?.statusIndicator).toBe("downtime"); + expect(normalized?.shortlink).toBe("https://status.trigger.dev/i/abc"); + }); + + it("returns null when there are no updates", () => { + const payload = IncidentWebhookSchema.parse( + samplePayload({ + incident: { id: "inc_1", name: "x", incident_updates: [] }, + }) + ); + expect(normalizeIncidentUpdate(payload)).toBeNull(); + }); + + it("falls back to defaults for missing name/status", () => { + const payload = IncidentWebhookSchema.parse( + samplePayload({ + page: {}, + incident: { + id: "inc_2", + name: null, + incident_updates: [{ id: "u", body: null, created_at: null }], + }, + }) + ); + const normalized = normalizeIncidentUpdate(payload); + expect(normalized?.name).toBe("Service incident"); + expect(normalized?.statusIndicator).toBe("downtime"); + expect(normalized?.body).toBe(""); + }); +}); + +describe("incident message formatting", () => { + it("maps status indicators to presentation", () => { + expect(presentStatus("operational").resolved).toBe(true); + expect(presentStatus("downtime").resolved).toBe(false); + expect(presentStatus("degraded").label).toBe("Degraded performance"); + expect(presentStatus("maintenance").label).toBe("Maintenance"); + }); + + it("builds a subject with the status label", () => { + const update = normalizeIncidentUpdate(IncidentWebhookSchema.parse(samplePayload()))!; + expect(buildSubject(update)).toBe("[Trigger.dev Outage] Elevated error rates"); + }); + + it("colors the discord embed by severity", () => { + const resolved = buildDiscordPayload({ + incidentId: "i", + updateId: "u", + name: "n", + statusIndicator: "operational", + body: "b", + shortlink: null, + updatedAt: null, + }); + expect(resolved.embeds[0].color).toBe(0x2ecc71); + + const outage = buildDiscordPayload({ + incidentId: "i", + updateId: "u", + name: "n", + statusIndicator: "downtime", + body: "b", + shortlink: null, + updatedAt: null, + }); + expect(outage.embeds[0].color).toBe(0xe74c3c); + }); +}); + +describe("incident email recipients", () => { + postgresTest( + "returns distinct org admins, excludes members and deleted orgs", + async ({ prisma }) => { + const admin = await prisma.user.create({ + data: { email: "admin@example.com", authenticationMethod: "MAGIC_LINK" }, + }); + const member = await prisma.user.create({ + data: { email: "member@example.com", authenticationMethod: "MAGIC_LINK" }, + }); + + // Admin of two orgs -> should be deduped to a single recipient. + await prisma.organization.create({ + data: { + title: "Org A", + slug: "org-a", + members: { create: { userId: admin.id, role: "ADMIN" } }, + }, + }); + await prisma.organization.create({ + data: { + title: "Org B", + slug: "org-b", + members: { + create: [ + { userId: admin.id, role: "ADMIN" }, + { userId: member.id, role: "MEMBER" }, + ], + }, + }, + }); + + // Admin of a deleted org only -> should be excluded. + const deletedOrgAdmin = await prisma.user.create({ + data: { email: "deleted@example.com", authenticationMethod: "MAGIC_LINK" }, + }); + await prisma.organization.create({ + data: { + title: "Org C", + slug: "org-c", + deletedAt: new Date(), + members: { create: { userId: deletedOrgAdmin.id, role: "ADMIN" } }, + }, + }); + + const { recipients, nextCursor } = await getIncidentEmailRecipientsPage(null, 100, prisma); + const emails = recipients.map((r) => r.email).sort(); + + expect(emails).toEqual(["admin@example.com"]); + expect(nextCursor).toBeNull(); + }, + 30_000 + ); + + postgresTest( + "paginates with a cursor", + async ({ prisma }) => { + for (const slug of ["p-1", "p-2", "p-3"]) { + const user = await prisma.user.create({ + data: { email: `${slug}@example.com`, authenticationMethod: "MAGIC_LINK" }, + }); + await prisma.organization.create({ + data: { + title: `Org ${slug}`, + slug, + members: { create: { userId: user.id, role: "ADMIN" } }, + }, + }); + } + + const first = await getIncidentEmailRecipientsPage(null, 2, prisma); + expect(first.recipients).toHaveLength(2); + expect(first.nextCursor).not.toBeNull(); + + const second = await getIncidentEmailRecipientsPage(first.nextCursor, 2, prisma); + expect(second.recipients).toHaveLength(1); + expect(second.nextCursor).toBeNull(); + + const allEmails = [...first.recipients, ...second.recipients].map((r) => r.email); + expect(new Set(allEmails).size).toBe(3); + }, + 30_000 + ); +}); diff --git a/internal-packages/emails/emails/incident-notification.tsx b/internal-packages/emails/emails/incident-notification.tsx new file mode 100644 index 00000000000..8639daa7866 --- /dev/null +++ b/internal-packages/emails/emails/incident-notification.tsx @@ -0,0 +1,66 @@ +import { Body, Container, Head, Html, Preview, Text } from "@react-email/components"; +import { Button } from "@react-email/components"; +import { z } from "zod"; +import { Footer } from "./components/Footer"; +import { Image } from "./components/Image"; +import { container, h1, main, paragraphLight, sans } from "./components/styles"; + +export const IncidentNotificationEmailSchema = z.object({ + email: z.literal("incident-notification"), + name: z.string(), + statusLabel: z.string(), + body: z.string(), + url: z.string().url(), +}); + +type IncidentNotificationEmailProps = z.infer; + +const previewDefaults: IncidentNotificationEmailProps = { + email: "incident-notification", + name: "Test notification — all systems calm", + statusLabel: "Resolved", + body: "This is a test of the incident notification email. Nothing is wrong — everything is operational.", + url: "https://status.trigger.dev", +}; + +export default function Email(props: IncidentNotificationEmailProps) { + const { name, statusLabel, body, url } = { ...previewDefaults, ...props }; + + return ( + + + {`Trigger.dev ${statusLabel}: ${name}`} + + + {name} + + Status: {statusLabel} + + + {body ? {body} : null} + + + + Trigger.dev +