Add production health checks
This commit is contained in:
@@ -47,6 +47,23 @@ The default `docker-compose.yml` is development-only. It mounts source files, in
|
|||||||
|
|
||||||
## Operations
|
## Operations
|
||||||
|
|
||||||
|
### Health checks
|
||||||
|
|
||||||
|
Monitor these production checks:
|
||||||
|
|
||||||
|
- Frontend: `GET https://your-host/healthz`
|
||||||
|
- Verifies Nginx is serving the frontend container.
|
||||||
|
- Backend liveness: `GET https://your-host/api/health/live`
|
||||||
|
- Verifies the API process is running.
|
||||||
|
- Backend readiness: `GET https://your-host/api/health/ready`
|
||||||
|
- Verifies the API can reach Postgres and Redis. Returns `503` if either dependency is unavailable.
|
||||||
|
- Backend metrics: `GET https://your-host/api/metrics`
|
||||||
|
- Admin-authenticated process, request, and queue metrics.
|
||||||
|
- Postgres and Redis:
|
||||||
|
- Use the Docker health checks in `docker-compose.prod.yml`.
|
||||||
|
- Worker:
|
||||||
|
- Use the Docker health check in `docker-compose.prod.yml`; it validates worker dependencies. The worker does not expose HTTP.
|
||||||
|
|
||||||
### Backups
|
### Backups
|
||||||
|
|
||||||
Create a compressed Postgres backup from the Docker Compose Postgres service:
|
Create a compressed Postgres backup from the Docker Compose Postgres service:
|
||||||
|
|||||||
+96
-3
@@ -12,8 +12,9 @@ import nodemailer, { type SendMailOptions } from 'nodemailer';
|
|||||||
import Stripe from 'stripe';
|
import Stripe from 'stripe';
|
||||||
import { z } from 'zod';
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
import { db } from './db/client.js';
|
||||||
import { ensureSchema } from './db/schema.js';
|
import { ensureSchema } from './db/schema.js';
|
||||||
import { adoptionReportQueueEvents, enqueueAdoptionReportJob } from './queues/adoptionReportQueue.js';
|
import { adoptionReportQueueEvents, enqueueAdoptionReportJob, getAdoptionReportQueueCounts } from './queues/adoptionReportQueue.js';
|
||||||
import { enqueueBirdMilestoneReminderJob, getBirdMilestoneReminderQueueCounts } from './queues/birdMilestoneReminderQueue.js';
|
import { enqueueBirdMilestoneReminderJob, getBirdMilestoneReminderQueueCounts } from './queues/birdMilestoneReminderQueue.js';
|
||||||
import { enqueueMedicationReminderJob, getMedicationReminderQueueCounts } from './queues/medicationReminderQueue.js';
|
import { enqueueMedicationReminderJob, getMedicationReminderQueueCounts } from './queues/medicationReminderQueue.js';
|
||||||
import {
|
import {
|
||||||
@@ -2265,6 +2266,59 @@ const ensureBirdWritable = (bird: BirdRow, res: Response) => {
|
|||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Outcome of probing a single backend dependency.
 */
type HealthCheckResult = {
  /** `true` when the probe completed successfully. */
  ok: boolean;
  /** Elapsed time of the probe in milliseconds, when it was measured. */
  latencyMs?: number;
  /** Human-readable failure description; absent on success. */
  error?: string;
};
|
||||||
|
|
||||||
|
/**
 * Settles with `operation`, or rejects with "Health check timed out" if it has
 * not settled within `timeoutMs` milliseconds.
 *
 * The operation itself is not cancelled; only the wait is bounded. The timer
 * is always cleared once the race settles.
 *
 * @param operation - Promise for the probe being awaited.
 * @param timeoutMs - Maximum time to wait, in milliseconds (default 2000).
 * @returns The value `operation` resolves with.
 * @throws The operation's own rejection, or an `Error` when the deadline elapses first.
 */
const withHealthTimeout = async <T,>(operation: Promise<T>, timeoutMs = 2_000): Promise<T> => {
  let timer: ReturnType<typeof setTimeout> | undefined;

  // The executor runs synchronously, so `timer` is set before the race starts.
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error('Health check timed out')), timeoutMs);
  });

  try {
    return await Promise.race([operation, deadline]);
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
};
|
||||||
|
|
||||||
|
/**
 * Probes Postgres with a trivial `SELECT 1`, bounded by `withHealthTimeout`.
 *
 * Failures are reported through the returned result rather than thrown, so
 * callers can aggregate several dependency states.
 *
 * @returns `ok` and the elapsed time in milliseconds; on failure also a fixed,
 *   non-sensitive `error` message.
 */
const checkPostgresHealth = async (): Promise<HealthCheckResult> => {
  const startedAt = Date.now();

  try {
    await withHealthTimeout(db.query('SELECT 1'));
    return { ok: true, latencyMs: Date.now() - startedAt };
  } catch (error) {
    // This result is serialized into the health responses, which are served
    // without authentication. Driver errors can carry hosts, ports, or role
    // names, so keep the detail in the server log and expose a fixed message.
    console.error('Postgres health check failed:', error);

    return {
      ok: false,
      latencyMs: Date.now() - startedAt,
      error: 'Postgres health check failed',
    };
  }
};
|
||||||
|
|
||||||
|
/**
 * Probes Redis by requesting the bird-milestone reminder queue's job counts,
 * bounded by `withHealthTimeout`.
 * NOTE(review): assumes that queue is Redis-backed — confirm against the queue module.
 *
 * Failures are reported through the returned result rather than thrown, so
 * callers can aggregate several dependency states.
 *
 * @returns `ok` and the elapsed time in milliseconds; on failure also a fixed,
 *   non-sensitive `error` message.
 */
const checkRedisHealth = async (): Promise<HealthCheckResult> => {
  const startedAt = Date.now();

  try {
    await withHealthTimeout(getBirdMilestoneReminderQueueCounts());
    return { ok: true, latencyMs: Date.now() - startedAt };
  } catch (error) {
    // This result is serialized into the health responses, which are served
    // without authentication. Client errors can carry hosts and ports, so keep
    // the detail in the server log and expose a fixed message.
    console.error('Redis health check failed:', error);

    return {
      ok: false,
      latencyMs: Date.now() - startedAt,
      error: 'Redis health check failed',
    };
  }
};
|
||||||
|
|
||||||
const writeAuditLog = async (
|
const writeAuditLog = async (
|
||||||
auth: AuthContext,
|
auth: AuthContext,
|
||||||
action: string,
|
action: string,
|
||||||
@@ -2293,8 +2347,46 @@ const isBillingOnlyWorkspaceUpdate = (
|
|||||||
payload: z.infer<typeof workspaceSchema>,
|
payload: z.infer<typeof workspaceSchema>,
|
||||||
) => workspace.workspace_type === 'standard' && payload.workspaceType === 'standard' && payload.name === workspace.name;
|
) => workspace.workspace_type === 'standard' && payload.workspaceType === 'standard' && payload.name === workspace.name;
|
||||||
|
|
||||||
/**
 * Liveness probe: answers from the running process without touching any
 * dependency, so it stays green while Postgres or Redis is down.
 */
app.get('/api/health/live', (_req: Request, res: Response) => {
  res.json({
    ok: true,
    service: 'flockpal-backend',
    status: 'live',
    uptimeSeconds: Math.round(process.uptime()),
    checkedAt: new Date().toISOString(),
  });
});

/**
 * Readiness handler shared by `/api/health/ready` and `/api/health`.
 *
 * Runs the Postgres and Redis probes concurrently and responds `200` when both
 * pass, `503` otherwise. The per-dependency results are always included.
 */
const handleReadinessCheck = async (_req: Request, res: Response) => {
  const [postgres, redis] = await Promise.all([checkPostgresHealth(), checkRedisHealth()]);
  const ok = postgres.ok && redis.ok;

  res.status(ok ? 200 : 503).json({
    ok,
    service: 'flockpal-backend',
    status: ok ? 'ready' : 'degraded',
    checkedAt: new Date().toISOString(),
    dependencies: {
      postgres,
      redis,
    },
  });
};

app.get('/api/health/ready', handleReadinessCheck);

// `/api/health` answers exactly like the readiness route; registering the same
// handler keeps the two paths from drifting apart.
app.get('/api/health', handleReadinessCheck);
|
||||||
|
|
||||||
app.get('/api/metrics', requireAuth, requireAdmin, async (_req: Request, res: Response, next: NextFunction) => {
|
app.get('/api/metrics', requireAuth, requireAdmin, async (_req: Request, res: Response, next: NextFunction) => {
|
||||||
@@ -2326,6 +2418,7 @@ app.get('/api/metrics', requireAuth, requireAdmin, async (_req: Request, res: Re
|
|||||||
queues: {
|
queues: {
|
||||||
birdMilestoneReminders: birdMilestoneReminderQueueCounts,
|
birdMilestoneReminders: birdMilestoneReminderQueueCounts,
|
||||||
medicationReminders: medicationReminderQueueCounts,
|
medicationReminders: medicationReminderQueueCounts,
|
||||||
|
adoptionReports: await getAdoptionReportQueueCounts(),
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
import { db } from './db/client.js';
|
||||||
|
import { closeBirdMilestoneReminderQueue, getBirdMilestoneReminderQueueCounts } from './queues/birdMilestoneReminderQueue.js';
|
||||||
|
|
||||||
|
/**
 * Parses a millisecond timeout supplied as text (for example, from the environment).
 *
 * Unset, empty, non-numeric, non-positive, or out-of-range values fall back to
 * `fallbackMs`. Without this guard such values reach `setTimeout`, which
 * coerces them to a 1 ms delay and makes every probe time out immediately.
 *
 * @param raw - Raw value, or `undefined` when the variable is not set.
 * @param fallbackMs - Timeout used when `raw` is not a usable delay (default 5000).
 * @returns A finite timeout in milliseconds, at most the largest delay `setTimeout` accepts.
 */
const parseTimeoutMs = (raw: string | undefined, fallbackMs = 5_000): number => {
  const maxTimerDelayMs = 2_147_483_647; // larger delays are also coerced to 1 ms
  const parsed = Number(raw); // Number(undefined) is NaN, Number('') is 0

  return Number.isFinite(parsed) && parsed > 0 && parsed <= maxTimerDelayMs ? parsed : fallbackMs;
};

/** Per-probe timeout in milliseconds, read from `HEALTHCHECK_TIMEOUT_MS` (default 5000). */
const timeoutMs = parseTimeoutMs(process.env.HEALTHCHECK_TIMEOUT_MS);
|
||||||
|
|
||||||
|
/**
 * Settles with `operation`, or rejects with "<label> timed out" if it has not
 * settled within the module-level `timeoutMs`.
 *
 * The operation itself is not cancelled; only the wait is bounded. The timer
 * is always cleared once the race settles.
 *
 * @param operation - Promise for the probe being awaited.
 * @param label - Name of the probe, used as the prefix of the timeout error message.
 * @returns The value `operation` resolves with.
 * @throws The operation's own rejection, or an `Error` when the deadline elapses first.
 */
const withTimeout = async <T>(operation: Promise<T>, label: string): Promise<T> => {
  let timer: ReturnType<typeof setTimeout> | undefined;

  // The executor runs synchronously, so `timer` is set before the race starts.
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error(`${label} timed out`)), timeoutMs);
  });

  try {
    return await Promise.race([operation, deadline]);
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
};
|
||||||
|
|
||||||
|
/**
 * Requests `path` from the API on the loopback interface and fails unless the
 * response status is 2xx.
 *
 * @param path - Absolute request path, e.g. `/api/health/ready`. Also used as
 *   the label in the timeout error message.
 * @throws When the request times out, fails at the network level, or returns a non-2xx status.
 */
const checkHttp = async (path: string) => {
  const port = process.env.PORT ?? '5000';
  const controller = new AbortController();

  try {
    const response = await withTimeout(
      fetch(`http://127.0.0.1:${port}${path}`, { signal: controller.signal }),
      path,
    );

    if (!response.ok) {
      throw new Error(`${path} returned ${response.status}`);
    }
  } finally {
    // `withTimeout` only stops waiting; it does not cancel the request. Abort
    // here so a stalled request or an unread response body cannot keep this
    // short-lived process alive after the check has been decided.
    controller.abort();
  }
};
|
||||||
|
|
||||||
|
/**
 * Verifies the worker's dependencies one after another: first a `SELECT 1`
 * against the database (labelled "postgres"), then the bird-milestone reminder
 * queue's job counts (labelled "redis"). The second probe is only started
 * after the first succeeds.
 *
 * @throws The first probe failure or timeout encountered.
 */
const checkWorkerDependencies = async () => {
  // Thunks keep each probe from starting until its turn.
  const probes: Array<[label: string, start: () => Promise<unknown>]> = [
    ['postgres', () => db.query('SELECT 1')],
    ['redis', () => getBirdMilestoneReminderQueueCounts()],
  ];

  for (const [label, start] of probes) {
    await withTimeout(start(), label);
  }
};
|
||||||
|
|
||||||
|
// Entry point. The first CLI argument selects the check to run and defaults to
// `api-ready`. Any failure is printed to stderr and reported through a
// non-zero exit code.
const mode = process.argv[2] ?? 'api-ready';

try {
  switch (mode) {
    case 'api-live':
      await checkHttp('/api/health/live');
      break;
    case 'api-ready':
      await checkHttp('/api/health/ready');
      break;
    case 'worker':
      await checkWorkerDependencies();
      break;
    default:
      throw new Error(`Unknown healthcheck mode: ${mode}`);
  }
} catch (error) {
  const detail = error instanceof Error ? error.message : error;
  console.error(detail);
  // Set the exit code instead of exiting so the cleanup below still runs.
  process.exitCode = 1;
} finally {
  // Close both clients regardless of the outcome; `allSettled` keeps a failing
  // close from masking the check result.
  await Promise.allSettled([closeBirdMilestoneReminderQueue(), db.close()]);
}
|
||||||
@@ -40,3 +40,5 @@ export const closeAdoptionReportQueue = async () => {
|
|||||||
await adoptionReportQueue.close();
|
await adoptionReportQueue.close();
|
||||||
await adoptionReportQueueEvents.close();
|
await adoptionReportQueueEvents.close();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Fetches the adoption-report queue's job counts for the `waiting`, `active`,
 * `delayed`, `completed`, and `failed` states.
 */
export const getAdoptionReportQueueCounts = () => {
  return adoptionReportQueue.getJobCounts('waiting', 'active', 'delayed', 'completed', 'failed');
};
|
||||||
|
|||||||
@@ -96,6 +96,12 @@ services:
|
|||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "node", "dist/healthcheck.js", "api-ready"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 30s
|
||||||
labels:
|
labels:
|
||||||
- traefik.enable=true
|
- traefik.enable=true
|
||||||
- traefik.docker.network=traefik
|
- traefik.docker.network=traefik
|
||||||
@@ -154,6 +160,12 @@ services:
|
|||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "node", "dist/healthcheck.js", "worker"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 30s
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
frontend:
|
frontend:
|
||||||
@@ -165,6 +177,12 @@ services:
|
|||||||
container_name: flockpal-frontend
|
container_name: flockpal-frontend
|
||||||
depends_on:
|
depends_on:
|
||||||
- backend
|
- backend
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1/healthz"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
start_period: 10s
|
||||||
labels:
|
labels:
|
||||||
- traefik.enable=true
|
- traefik.enable=true
|
||||||
- traefik.docker.network=traefik
|
- traefik.docker.network=traefik
|
||||||
|
|||||||
+35
-2
@@ -319,14 +319,47 @@ Validation failures return `400` with this shape:
|
|||||||
|
|
||||||
#### `GET /api/health`
|
#### `GET /api/health`
|
||||||
|
|
||||||
Public health check.
|
Public readiness-compatible health check. Verifies backend dependencies.
|
||||||
|
|
||||||
Response `200`:
|
Response `200`:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{ "ok": true }
|
{
|
||||||
|
"ok": true,
|
||||||
|
"service": "flockpal-backend",
|
||||||
|
"status": "ready",
|
||||||
|
"checkedAt": "2026-06-06T00:00:00.000Z",
|
||||||
|
"dependencies": {
|
||||||
|
"postgres": { "ok": true, "latencyMs": 3 },
|
||||||
|
"redis": { "ok": true, "latencyMs": 4 }
|
||||||
|
}
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
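Response `503` when Postgres or Redis is unavailable. The body keeps the same shape, with `"ok": false`, `"status": "degraded"`, and an `error` string on each failed dependency.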
|
||||||
|
|
||||||
|
#### `GET /api/health/live`
|
||||||
|
|
||||||
|
Public liveness check. Verifies the backend process is running without checking dependencies.
|
||||||
|
|
||||||
|
Response `200`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"ok": true,
|
||||||
|
"service": "flockpal-backend",
|
||||||
|
"status": "live",
|
||||||
|
"uptimeSeconds": 120,
|
||||||
|
"checkedAt": "2026-06-06T00:00:00.000Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `GET /api/health/ready`
|
||||||
|
|
||||||
|
Public readiness check. Verifies the backend can reach Postgres and Redis.
|
||||||
|
|
||||||
|
Response `200` uses the same shape as `GET /api/health`; response `503` means at least one dependency failed.
|
||||||
|
|
||||||
### Metrics
|
### Metrics
|
||||||
|
|
||||||
#### `GET /api/metrics`
|
#### `GET /api/metrics`
|
||||||
|
|||||||
@@ -12,6 +12,12 @@ server {
|
|||||||
add_header X-Frame-Options "SAMEORIGIN" always;
|
add_header X-Frame-Options "SAMEORIGIN" always;
|
||||||
add_header Permissions-Policy "camera=(), microphone=(), geolocation=()" always;
|
add_header Permissions-Policy "camera=(), microphone=(), geolocation=()" always;
|
||||||
|
|
||||||
|
# Static health endpoint: answered by Nginx itself, without touching the
# filesystem or the SPA fallback below.
location = /healthz {
    # Keep probe traffic out of the access log.
    access_log off;
    # Set the response type with default_type. `add_header Content-Type ...`
    # would append a second Content-Type header next to the one Nginx already
    # derives for the response.
    default_type text/plain;
    return 200 "ok\n";
}
|
||||||
|
|
||||||
location / {
|
location / {
|
||||||
try_files $uri $uri/ /index.html;
|
try_files $uri $uri/ /index.html;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user