Add production health checks
Deploy / deploy-dev (push) Has been skipped
Deploy / deploy-prod (push) Successful in 1m46s

This commit is contained in:
blaisadmin
2026-06-05 22:28:27 -04:00
parent 5735bb7735
commit cc4a2382c6
7 changed files with 228 additions and 5 deletions
+17
View File
@@ -47,6 +47,23 @@ The default `docker-compose.yml` is development-only. It mounts source files, in
## Operations
### Health checks
Monitor these production checks:
- Frontend: `GET https://your-host/healthz`
- Verifies Nginx is serving the frontend container.
- Backend liveness: `GET https://your-host/api/health/live`
- Verifies the API process is running.
- Backend readiness: `GET https://your-host/api/health/ready`
- Verifies the API can reach Postgres and Redis. Returns `503` if either dependency is unavailable.
- Backend metrics: `GET https://your-host/api/metrics`
- Admin-authenticated process, request, and queue metrics.
- Postgres and Redis:
- Use the Docker health checks in `docker-compose.prod.yml`.
- Worker:
- Use the Docker health check in `docker-compose.prod.yml`; it validates worker dependencies. The worker does not expose HTTP.
### Backups
Create a compressed Postgres backup from the Docker Compose Postgres service:
+96 -3
View File
@@ -12,8 +12,9 @@ import nodemailer, { type SendMailOptions } from 'nodemailer';
import Stripe from 'stripe';
import { z } from 'zod';
import { db } from './db/client.js';
import { ensureSchema } from './db/schema.js';
import { adoptionReportQueueEvents, enqueueAdoptionReportJob } from './queues/adoptionReportQueue.js';
import { adoptionReportQueueEvents, enqueueAdoptionReportJob, getAdoptionReportQueueCounts } from './queues/adoptionReportQueue.js';
import { enqueueBirdMilestoneReminderJob, getBirdMilestoneReminderQueueCounts } from './queues/birdMilestoneReminderQueue.js';
import { enqueueMedicationReminderJob, getMedicationReminderQueueCounts } from './queues/medicationReminderQueue.js';
import {
@@ -2265,6 +2266,59 @@ const ensureBirdWritable = (bird: BirdRow, res: Response) => {
return false;
};
/** Outcome of probing a single backend dependency, as embedded in the readiness responses. */
type HealthCheckResult = {
// True when the probe finished without throwing or timing out.
ok: boolean;
// Elapsed time of the probe in milliseconds (measured with Date.now()).
latencyMs?: number;
// Failure message; the probes in this file only set it when `ok` is false.
error?: string;
};
/**
 * Awaits `operation`, rejecting with "Health check timed out" when it has not
 * settled within `timeoutMs` milliseconds (2 seconds by default).
 *
 * The timer is cleared as soon as the race settles, whichever side wins.
 * The underlying operation itself is not cancelled on timeout.
 */
const withHealthTimeout = async <T,>(operation: Promise<T>, timeoutMs = 2_000): Promise<T> => {
  let timer: NodeJS.Timeout | undefined;
  try {
    const deadline = new Promise<never>((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new Error('Health check timed out'));
      }, timeoutMs);
    });
    return await Promise.race([operation, deadline]);
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
};
/**
 * Probes Postgres by running `SELECT 1` under the health-check timeout.
 *
 * Never rejects: a failure or timeout is reported as `{ ok: false, error }`.
 * The elapsed time is included in both the success and the failure result.
 */
const checkPostgresHealth = async (): Promise<HealthCheckResult> => {
  const began = Date.now();
  const elapsedMs = () => Date.now() - began;

  try {
    await withHealthTimeout(db.query('SELECT 1'));
  } catch (error) {
    const latencyMs = elapsedMs();
    const message = error instanceof Error ? error.message : 'Postgres health check failed';
    return { ok: false, latencyMs, error: message };
  }

  return { ok: true, latencyMs: elapsedMs() };
};
/**
 * Probes Redis by requesting the bird-milestone reminder queue's job counts
 * under the health-check timeout (presumably the queue is Redis-backed; confirm
 * against the queue module).
 *
 * Never rejects: a failure or timeout is reported as `{ ok: false, error }`.
 * The elapsed time is included in both the success and the failure result.
 */
const checkRedisHealth = async (): Promise<HealthCheckResult> => {
  const began = Date.now();
  const elapsedMs = () => Date.now() - began;

  try {
    await withHealthTimeout(getBirdMilestoneReminderQueueCounts());
  } catch (error) {
    const latencyMs = elapsedMs();
    const message = error instanceof Error ? error.message : 'Redis health check failed';
    return { ok: false, latencyMs, error: message };
  }

  return { ok: true, latencyMs: elapsedMs() };
};
const writeAuditLog = async (
auth: AuthContext,
action: string,
@@ -2293,8 +2347,46 @@ const isBillingOnlyWorkspaceUpdate = (
payload: z.infer<typeof workspaceSchema>,
) => workspace.workspace_type === 'standard' && payload.workspaceType === 'standard' && payload.name === workspace.name;
app.get('/api/health', (_req: Request, res: Response) => {
res.json({ ok: true });
// Liveness probe: reports that the process is up and how long it has been running.
// It does not touch Postgres or Redis, so it stays green while dependencies are down.
app.get('/api/health/live', (_req: Request, res: Response) => {
  const uptimeSeconds = Math.round(process.uptime());
  const checkedAt = new Date().toISOString();

  res.json({
    ok: true,
    service: 'flockpal-backend',
    status: 'live',
    uptimeSeconds,
    checkedAt,
  });
});
/**
 * Shared handler for the readiness endpoints.
 *
 * Runs the Postgres and Redis probes in parallel and answers `200` with
 * `status: 'ready'` when both succeed, or `503` with `status: 'degraded'` when
 * either fails. The per-dependency results are always included in the body.
 *
 * Both probes catch their own errors and resolve to a result object, so this
 * handler has no rejection path of its own to forward to Express.
 */
const respondWithReadiness = async (_req: Request, res: Response): Promise<void> => {
  const [postgres, redis] = await Promise.all([checkPostgresHealth(), checkRedisHealth()]);
  const ok = postgres.ok && redis.ok;

  res.status(ok ? 200 : 503).json({
    ok,
    service: 'flockpal-backend',
    status: ok ? 'ready' : 'degraded',
    checkedAt: new Date().toISOString(),
    dependencies: {
      postgres,
      redis,
    },
  });
};

app.get('/api/health/ready', respondWithReadiness);

// `/api/health` serves the exact same readiness payload; registering the shared
// handler keeps the two endpoints from drifting apart.
app.get('/api/health', respondWithReadiness);
app.get('/api/metrics', requireAuth, requireAdmin, async (_req: Request, res: Response, next: NextFunction) => {
@@ -2326,6 +2418,7 @@ app.get('/api/metrics', requireAuth, requireAdmin, async (_req: Request, res: Re
queues: {
birdMilestoneReminders: birdMilestoneReminderQueueCounts,
medicationReminders: medicationReminderQueueCounts,
adoptionReports: await getAdoptionReportQueueCounts(),
},
});
} catch (error) {
+54
View File
@@ -0,0 +1,54 @@
import { db } from './db/client.js';
import { closeBirdMilestoneReminderQueue, getBirdMilestoneReminderQueueCounts } from './queues/birdMilestoneReminderQueue.js';
/**
 * Parses a millisecond timeout override.
 *
 * Falls back to `fallbackMs` when the value is missing, blank, non-numeric,
 * non-finite, or not strictly positive. Without this guard, `Number('')` is `0`
 * and `Number('abc')` is `NaN`; `setTimeout` treats both as "fire immediately",
 * which would make every probe report a timeout.
 *
 * @param raw - Raw environment value, if any.
 * @param fallbackMs - Timeout used when `raw` is unusable (default 5000).
 * @returns A finite, positive timeout in milliseconds.
 */
const parseHealthcheckTimeoutMs = (raw: string | undefined, fallbackMs = 5_000): number => {
  if (raw === undefined || raw.trim() === '') {
    return fallbackMs;
  }
  const parsed = Number(raw);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return fallbackMs;
  }
  // setTimeout delays above 2^31 - 1 ms overflow and fire immediately, so clamp.
  return Math.min(parsed, 2_147_483_647);
};

// Per-probe timeout in milliseconds; override with HEALTHCHECK_TIMEOUT_MS.
const timeoutMs = parseHealthcheckTimeoutMs(process.env.HEALTHCHECK_TIMEOUT_MS);
/**
 * Awaits `operation`, rejecting with "<label> timed out" when it has not settled
 * within the module-level `timeoutMs`.
 *
 * The timer is cleared once the race settles so it cannot keep the one-shot
 * health-check process alive. The underlying operation is not cancelled.
 */
const withTimeout = async <T>(operation: Promise<T>, label: string): Promise<T> => {
  let timer: NodeJS.Timeout | undefined;
  try {
    const deadline = new Promise<never>((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`${label} timed out`));
      }, timeoutMs);
    });
    return await Promise.race([operation, deadline]);
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
};
/**
 * Requests `path` from the local API (127.0.0.1 on `PORT`, default 5000) and
 * throws unless the response status is 2xx.
 *
 * The request is bound to an AbortController that is always aborted on the way
 * out. On timeout this cancels the in-flight request; on completion it releases
 * the unread response body. Either way no open socket is left behind to keep
 * this one-shot process alive past its deadline.
 *
 * @param path - Absolute URL path to probe, e.g. `/api/health/ready`.
 * @throws Error when the request fails, times out, or returns a non-2xx status.
 */
const checkHttp = async (path: string) => {
  const port = process.env.PORT ?? '5000';
  const controller = new AbortController();

  try {
    const response = await withTimeout(
      fetch(`http://127.0.0.1:${port}${path}`, { signal: controller.signal }),
      path,
    );

    if (!response.ok) {
      throw new Error(`${path} returned ${response.status}`);
    }
  } finally {
    // Safe after success too: aborting merely discards the body we never read.
    controller.abort();
  }
};
/**
 * Verifies that the worker's dependencies respond: a `SELECT 1` against
 * Postgres and a job-count request on the bird-milestone reminder queue
 * (labelled 'redis' in timeout errors).
 *
 * The two probes are independent, so they run in parallel. This bounds the
 * worst case to a single `timeoutMs` instead of two back to back, which with
 * the 5s default would consume the whole 10s Docker healthcheck timeout.
 *
 * @throws Error from the first probe to fail or time out.
 */
const checkWorkerDependencies = async () => {
  await Promise.all([
    withTimeout(db.query('SELECT 1'), 'postgres'),
    withTimeout(getBirdMilestoneReminderQueueCounts(), 'redis'),
  ]);
};
// Probe to run, taken from the first CLI argument; defaults to the API readiness check.
const mode = process.argv[2] ?? 'api-ready';

// Maps each supported mode to the probe it runs.
const probesByMode = new Map<string, () => Promise<void>>([
  ['api-live', () => checkHttp('/api/health/live')],
  ['api-ready', () => checkHttp('/api/health/ready')],
  ['worker', () => checkWorkerDependencies()],
]);

try {
  const runProbe = probesByMode.get(mode);
  if (runProbe === undefined) {
    throw new Error(`Unknown healthcheck mode: ${mode}`);
  }
  await runProbe();
} catch (error) {
  // Any failure is logged and turned into a non-zero exit code.
  const detail = error instanceof Error ? error.message : error;
  console.error(detail);
  process.exitCode = 1;
} finally {
  // Close the queue and database handles opened by the imports, whatever the outcome.
  await Promise.allSettled([closeBirdMilestoneReminderQueue(), db.close()]);
}
@@ -40,3 +40,5 @@ export const closeAdoptionReportQueue = async () => {
await adoptionReportQueue.close();
await adoptionReportQueueEvents.close();
};
/** Returns the adoption report queue's job counts for the waiting, active, delayed, completed and failed states. */
export const getAdoptionReportQueueCounts = () => {
  return adoptionReportQueue.getJobCounts('waiting', 'active', 'delayed', 'completed', 'failed');
};
+18
View File
@@ -96,6 +96,12 @@ services:
condition: service_healthy
redis:
condition: service_healthy
healthcheck:
test: ["CMD", "node", "dist/healthcheck.js", "api-ready"]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
labels:
- traefik.enable=true
- traefik.docker.network=traefik
@@ -154,6 +160,12 @@ services:
condition: service_healthy
redis:
condition: service_healthy
healthcheck:
test: ["CMD", "node", "dist/healthcheck.js", "worker"]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
restart: unless-stopped
frontend:
@@ -165,6 +177,12 @@ services:
container_name: flockpal-frontend
depends_on:
- backend
healthcheck:
test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1/healthz"]
interval: 30s
timeout: 5s
retries: 3
start_period: 10s
labels:
- traefik.enable=true
- traefik.docker.network=traefik
+35 -2
View File
@@ -319,14 +319,47 @@ Validation failures return `400` with this shape:
#### `GET /api/health`
Public health check.
Public health check, equivalent to `GET /api/health/ready`. Verifies that the backend can reach Postgres and Redis.
Response `200`:
```json
{ "ok": true }
{
"ok": true,
"service": "flockpal-backend",
"status": "ready",
"checkedAt": "2026-06-06T00:00:00.000Z",
"dependencies": {
"postgres": { "ok": true, "latencyMs": 3 },
"redis": { "ok": true, "latencyMs": 4 }
}
}
```
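Response `503` when Postgres or Redis is unavailable. The body keeps the same shape, with `"ok": false`, `"status": "degraded"`, and an `error` message on each failed dependency.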
#### `GET /api/health/live`
Public liveness check. Verifies the backend process is running without checking dependencies.
Response `200`:
```json
{
"ok": true,
"service": "flockpal-backend",
"status": "live",
"uptimeSeconds": 120,
"checkedAt": "2026-06-06T00:00:00.000Z"
}
```
#### `GET /api/health/ready`
Public readiness check. Verifies the backend can reach Postgres and Redis.
Response `200` uses the same shape as `GET /api/health`; response `503` means at least one dependency failed.
### Metrics
#### `GET /api/metrics`
+6
View File
@@ -12,6 +12,12 @@ server {
add_header X-Frame-Options "SAMEORIGIN" always;
add_header Permissions-Policy "camera=(), microphone=(), geolocation=()" always;
# Health probe for the frontend container: answers 200 "ok" straight from
# Nginx without touching the filesystem, and stays out of the access log.
location = /healthz {
    access_log off;
    # Use default_type, not add_header, to set the Content-Type of the `return`
    # body. add_header would emit a second Content-Type header alongside the
    # default one. Declaring any add_header at this level also stops the
    # server-level add_header directives (X-Frame-Options, Permissions-Policy,
    # ...) from being inherited here.
    default_type text/plain;
    return 200 "ok\n";
}
location / {
try_files $uri $uri/ /index.html;
}