From cc4a2382c644eb8de50959cefde1df55a2ee0afd Mon Sep 17 00:00:00 2001
From: blaisadmin
Date: Fri, 5 Jun 2026 22:28:27 -0400
Subject: [PATCH] Add production health checks

---
 README.md                                 | 17 +++++
 backend/src/app.ts                        | 81 ++++++++++++++++++++++-
 backend/src/healthcheck.ts                | 64 ++++++++++++++++++
 backend/src/queues/adoptionReportQueue.ts |  2 +
 docker-compose.prod.yml                   | 18 +++++
 docs/API_REFERENCE.md                     | 37 +++++++++-
 frontend/nginx.conf                       |  6 ++
 7 files changed, 220 insertions(+), 5 deletions(-)
 create mode 100644 backend/src/healthcheck.ts

diff --git a/README.md b/README.md
index 1fdd36e..d91cdf3 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,23 @@ The default `docker-compose.yml` is development-only. It mounts source files, in
 
 ## Operations
 
+### Health checks
+
+Monitor these production checks:
+
+- Frontend: `GET https://your-host/healthz`
+  - Verifies Nginx is serving the frontend container.
+- Backend liveness: `GET https://your-host/api/health/live`
+  - Verifies the API process is running.
+- Backend readiness: `GET https://your-host/api/health/ready`
+  - Verifies the API can reach Postgres and Redis. Returns `503` if either dependency is unavailable.
+- Backend metrics: `GET https://your-host/api/metrics`
+  - Admin-authenticated process, request, and queue metrics.
+- Postgres and Redis:
+  - Use the Docker health checks in `docker-compose.prod.yml`.
+- Worker:
+  - Use the Docker health check in `docker-compose.prod.yml`; it validates worker dependencies. The worker does not expose HTTP.
+
 ### Backups
 
 Create a compressed Postgres backup from the Docker Compose Postgres service:
diff --git a/backend/src/app.ts b/backend/src/app.ts
index 98138ca..a52a689 100644
--- a/backend/src/app.ts
+++ b/backend/src/app.ts
@@ -12,8 +12,9 @@ import nodemailer, { type SendMailOptions } from 'nodemailer';
 import Stripe from 'stripe';
 import { z } from 'zod';
 
+import { db } from './db/client.js';
 import { ensureSchema } from './db/schema.js';
-import { adoptionReportQueueEvents, enqueueAdoptionReportJob } from './queues/adoptionReportQueue.js';
+import { adoptionReportQueueEvents, enqueueAdoptionReportJob, getAdoptionReportQueueCounts } from './queues/adoptionReportQueue.js';
 import { enqueueBirdMilestoneReminderJob, getBirdMilestoneReminderQueueCounts } from './queues/birdMilestoneReminderQueue.js';
 import { enqueueMedicationReminderJob, getMedicationReminderQueueCounts } from './queues/medicationReminderQueue.js';
 import {
@@ -2265,6 +2266,56 @@ const ensureBirdWritable = (bird: BirdRow, res: Response) => {
   return false;
 };
 
+/** Outcome of probing one dependency; reported verbatim by the readiness endpoints. */
+type HealthCheckResult = {
+  ok: boolean;
+  latencyMs?: number;
+  error?: string;
+};
+
+/**
+ * Resolves with `operation`, or rejects with "Health check timed out" after `timeoutMs`.
+ * Only the wait is bounded; the underlying operation is not cancelled.
+ */
+const withHealthTimeout = async <T>(operation: Promise<T>, timeoutMs = 2_000): Promise<T> => {
+  let timeout: NodeJS.Timeout | undefined;
+
+  try {
+    return await Promise.race([
+      operation,
+      new Promise<never>((_resolve, reject) => {
+        timeout = setTimeout(() => reject(new Error('Health check timed out')), timeoutMs);
+      }),
+    ]);
+  } finally {
+    if (timeout) {
+      clearTimeout(timeout);
+    }
+  }
+};
+
+/** Runs `probe` under the health timeout and reports the outcome as a result object; never throws. */
+const runHealthCheck = async (probe: () => Promise<unknown>, fallbackError: string): Promise<HealthCheckResult> => {
+  const startedAt = Date.now();
+
+  try {
+    await withHealthTimeout(probe());
+    return { ok: true, latencyMs: Date.now() - startedAt };
+  } catch (error) {
+    return {
+      ok: false,
+      latencyMs: Date.now() - startedAt,
+      error: error instanceof Error ? error.message : fallbackError,
+    };
+  }
+};
+
+/** Checks that Postgres answers a trivial `SELECT 1`. */
+const checkPostgresHealth = (): Promise<HealthCheckResult> => runHealthCheck(() => db.query('SELECT 1'), 'Postgres health check failed');
+
+/** Probes Redis indirectly by reading the bird-milestone reminder queue counts. */
+const checkRedisHealth = (): Promise<HealthCheckResult> => runHealthCheck(() => getBirdMilestoneReminderQueueCounts(), 'Redis health check failed');
+
 const writeAuditLog = async (
   auth: AuthContext,
   action: string,
@@ -2293,8 +2344,31 @@ const isBillingOnlyWorkspaceUpdate = (
   payload: z.infer,
 ) => workspace.workspace_type === 'standard' && payload.workspaceType === 'standard' && payload.name === workspace.name;
 
-app.get('/api/health', (_req: Request, res: Response) => {
-  res.json({ ok: true });
+app.get('/api/health/live', (_req: Request, res: Response) => {
+  res.json({
+    ok: true,
+    service: 'flockpal-backend',
+    status: 'live',
+    uptimeSeconds: Math.round(process.uptime()),
+    checkedAt: new Date().toISOString(),
+  });
+});
+
+// `/api/health` existed before the live/ready split; it is served by the same readiness handler.
+app.get(['/api/health/ready', '/api/health'], async (_req: Request, res: Response) => {
+  const [postgres, redis] = await Promise.all([checkPostgresHealth(), checkRedisHealth()]);
+  const ok = postgres.ok && redis.ok;
+
+  res.status(ok ? 200 : 503).json({
+    ok,
+    service: 'flockpal-backend',
+    status: ok ? 'ready' : 'degraded',
+    checkedAt: new Date().toISOString(),
+    dependencies: {
+      postgres,
+      redis,
+    },
+  });
 });
 
 app.get('/api/metrics', requireAuth, requireAdmin, async (_req: Request, res: Response, next: NextFunction) => {
@@ -2326,6 +2400,7 @@ app.get('/api/metrics', requireAuth, requireAdmin, async (_req: Request, res: Re
       queues: {
         birdMilestoneReminders: birdMilestoneReminderQueueCounts,
         medicationReminders: medicationReminderQueueCounts,
+        adoptionReports: await getAdoptionReportQueueCounts(),
       },
     });
   } catch (error) {
diff --git a/backend/src/healthcheck.ts b/backend/src/healthcheck.ts
new file mode 100644
index 0000000..a3ff9ba
--- /dev/null
+++ b/backend/src/healthcheck.ts
@@ -0,0 +1,64 @@
+import { db } from './db/client.js';
+import { closeBirdMilestoneReminderQueue, getBirdMilestoneReminderQueueCounts } from './queues/birdMilestoneReminderQueue.js';
+
+const DEFAULT_TIMEOUT_MS = 5_000;
+
+// HEALTHCHECK_TIMEOUT_MS may be unset, empty, or non-numeric; a NaN or zero delay would make
+// setTimeout fire almost immediately and fail every probe, so fall back to the default instead.
+const configuredTimeoutMs = Number(process.env.HEALTHCHECK_TIMEOUT_MS);
+const timeoutMs = Number.isFinite(configuredTimeoutMs) && configuredTimeoutMs > 0 ? configuredTimeoutMs : DEFAULT_TIMEOUT_MS;
+
+/** Resolves with `operation`, or rejects with "<label> timed out" once `timeoutMs` has elapsed. */
+const withTimeout = async <T>(operation: Promise<T>, label: string): Promise<T> => {
+  let timeout: NodeJS.Timeout | undefined;
+
+  try {
+    return await Promise.race([
+      operation,
+      new Promise<never>((_resolve, reject) => {
+        timeout = setTimeout(() => reject(new Error(`${label} timed out`)), timeoutMs);
+      }),
+    ]);
+  } finally {
+    if (timeout) {
+      clearTimeout(timeout);
+    }
+  }
+};
+
+/** Requests `path` from the API on 127.0.0.1 and throws unless the response status is 2xx. */
+const checkHttp = async (path: string) => {
+  const port = process.env.PORT ?? '5000';
+  const response = await withTimeout(fetch(`http://127.0.0.1:${port}${path}`), path);
+
+  if (!response.ok) {
+    throw new Error(`${path} returned ${response.status}`);
+  }
+};
+
+/** Checks the worker's dependencies directly, because the worker exposes no HTTP endpoint. */
+const checkWorkerDependencies = async () => {
+  await withTimeout(db.query('SELECT 1'), 'postgres');
+  await withTimeout(getBirdMilestoneReminderQueueCounts(), 'redis');
+};
+
+// The mode is the first CLI argument: 'api-live', 'api-ready' (default), or 'worker'.
+const mode = process.argv[2] ?? 'api-ready';
+
+try {
+  if (mode === 'api-live') {
+    await checkHttp('/api/health/live');
+  } else if (mode === 'api-ready') {
+    await checkHttp('/api/health/ready');
+  } else if (mode === 'worker') {
+    await checkWorkerDependencies();
+  } else {
+    throw new Error(`Unknown healthcheck mode: ${mode}`);
+  }
+} catch (error) {
+  console.error(error instanceof Error ? error.message : error);
+  process.exitCode = 1;
+} finally {
+  // Close the queue and database handles in every mode; allSettled ensures both closes are attempted.
+  await Promise.allSettled([closeBirdMilestoneReminderQueue(), db.close()]);
+}
diff --git a/backend/src/queues/adoptionReportQueue.ts b/backend/src/queues/adoptionReportQueue.ts
index a1c60d6..be2897f 100644
--- a/backend/src/queues/adoptionReportQueue.ts
+++ b/backend/src/queues/adoptionReportQueue.ts
@@ -40,3 +40,5 @@ export const closeAdoptionReportQueue = async () => {
   await adoptionReportQueue.close();
   await adoptionReportQueueEvents.close();
 };
+
+export const getAdoptionReportQueueCounts = () => adoptionReportQueue.getJobCounts('waiting', 'active', 'delayed', 'completed', 'failed');
diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 618df0e..5032ebb 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -96,6 +96,12 @@ services:
         condition: service_healthy
       redis:
         condition: service_healthy
+    healthcheck:
+      test: ["CMD", "node", "dist/healthcheck.js", "api-ready"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
     labels:
       - traefik.enable=true
       - traefik.docker.network=traefik
@@ -154,6 +160,12 @@ services:
         condition: service_healthy
       redis:
         condition: service_healthy
+    healthcheck:
+      test: ["CMD", "node", "dist/healthcheck.js", "worker"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
     restart: unless-stopped
 
   frontend:
@@ -165,6 +177,12 @@ services:
     container_name: flockpal-frontend
     depends_on:
       - backend
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1/healthz"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
     labels:
       - traefik.enable=true
- traefik.docker.network=traefik diff --git a/docs/API_REFERENCE.md b/docs/API_REFERENCE.md index 68a8ecc..e077570 100644 --- a/docs/API_REFERENCE.md +++ b/docs/API_REFERENCE.md @@ -319,14 +319,47 @@ Validation failures return `400` with this shape: #### `GET /api/health` -Public health check. +Public readiness-compatible health check. Verifies backend dependencies. Response `200`: ```json -{ "ok": true } +{ + "ok": true, + "service": "flockpal-backend", + "status": "ready", + "checkedAt": "2026-06-06T00:00:00.000Z", + "dependencies": { + "postgres": { "ok": true, "latencyMs": 3 }, + "redis": { "ok": true, "latencyMs": 4 } + } +} ``` +Response `503` when Postgres or Redis is unavailable. + +#### `GET /api/health/live` + +Public liveness check. Verifies the backend process is running without checking dependencies. + +Response `200`: + +```json +{ + "ok": true, + "service": "flockpal-backend", + "status": "live", + "uptimeSeconds": 120, + "checkedAt": "2026-06-06T00:00:00.000Z" +} +``` + +#### `GET /api/health/ready` + +Public readiness check. Verifies the backend can reach Postgres and Redis. + +Response `200` uses the same shape as `GET /api/health`; response `503` means at least one dependency failed. + ### Metrics #### `GET /api/metrics` diff --git a/frontend/nginx.conf b/frontend/nginx.conf index 07acaad..6498fc4 100644 --- a/frontend/nginx.conf +++ b/frontend/nginx.conf @@ -12,6 +12,12 @@ server { add_header X-Frame-Options "SAMEORIGIN" always; add_header Permissions-Policy "camera=(), microphone=(), geolocation=()" always; + location = /healthz { + access_log off; + add_header Content-Type text/plain; + return 200 "ok\n"; + } + location / { try_files $uri $uri/ /index.html; }