diff --git a/apps/bff/src/infra/cache/README.md b/apps/bff/src/infra/cache/README.md index eb7e3626..8936038e 100644 --- a/apps/bff/src/infra/cache/README.md +++ b/apps/bff/src/infra/cache/README.md @@ -14,8 +14,8 @@ Redis-backed caching system with CDC (Change Data Capture) event-driven invalida │ ┌───────────────────────────▼─────────────────────────────────┐ │ Domain-Specific Cache Services │ -│ - OrdersCacheService (CDC-driven, no TTL) │ -│ - CatalogCacheService (CDC-driven, no TTL) │ +│ - OrdersCacheService (CDC-driven) │ +│ - ServicesCacheService (CDC-driven + safety TTL) │ │ - WhmcsCacheService (TTL-based) │ │ │ │ Features: │ @@ -61,7 +61,13 @@ Redis-backed caching system with CDC (Change Data Capture) event-driven invalida ### 1. CDC-Driven (Orders, Catalog) -**No TTL** - Cache persists indefinitely until CDC event triggers invalidation. +**Event-driven invalidation + safety TTL** - Cache is invalidated on CDC events, and also expires after a long TTL as a safety net. + +Why: CDC is the primary freshness mechanism, but a safety TTL helps self-heal if events are missed (deploy downtime, subscriber issues, replay gaps). 
+ +Config: + +- `SERVICES_CACHE_SAFETY_TTL_SECONDS` (default: `43200` seconds, i.e. 12 hours; set to `0` to disable) **Pros:** @@ -74,7 +80,7 @@ Redis-backed caching system with CDC (Change Data Capture) event-driven invalida ```typescript @Injectable() export class OrdersCacheService { - // No TTL = CDC-only invalidation + // CDC invalidation + safety TTL (service-specific) async getOrderSummaries( sfAccountId: string, fetcher: () => Promise @@ -216,7 +222,7 @@ async getMyData(id: string, fetcher: () => Promise): Promise { const fetchPromise = (async () => { try { const fresh = await fetcher(); - await this.cache.set(key, fresh); // No TTL = CDC-driven + await this.cache.set(key, fresh); // CDC-driven (TTL varies by domain) return fresh; } finally { this.inflightRequests.delete(key); diff --git a/apps/bff/src/modules/services/services/services-cache.service.ts b/apps/bff/src/modules/services/services/services-cache.service.ts index 4998560d..9102de9a 100644 --- a/apps/bff/src/modules/services/services/services-cache.service.ts +++ b/apps/bff/src/modules/services/services/services-cache.service.ts @@ -31,15 +31,16 @@ interface LegacyCatalogCachePayload { * product dependency tracking for granular invalidation. 
* * Features: - * - CDC-driven invalidation: No TTL, cache persists until CDC event + * - Event-driven invalidation: CDC / Platform Events invalidate caches on change + * - Safety TTL: long TTL to self-heal if events are missed * - Product dependency tracking: Granular invalidation by product IDs * - Request coalescing: Prevents thundering herd on cache miss * - Metrics tracking: Monitors hits, misses, and invalidations * * Cache buckets: - * - catalog: Product catalog data (CDC-driven) - * - static: Static reference data (CDC-driven) - * - eligibility: Account eligibility data (CDC-driven) + * - catalog: Product catalog data (event-driven + safety TTL) + * - static: Static reference data (event-driven + safety TTL) + * - eligibility: Account eligibility data (event-driven + safety TTL) * - volatile: Frequently changing data (60s TTL) */ @Injectable() @@ -263,8 +264,8 @@ export class ServicesCacheService { // Store and link dependencies separately if (dependencies) { - await this.storeDependencies(key, dependencies); - await this.linkDependencies(key, dependencies); + await this.storeDependencies(key, dependencies, ttlSeconds); + await this.linkDependencies(key, dependencies, ttlSeconds); } return fresh; @@ -293,8 +294,8 @@ export class ServicesCacheService { } if (cached.dependencies) { - await this.storeDependencies(key, cached.dependencies); - await this.linkDependencies(key, cached.dependencies); + await this.storeDependencies(key, cached.dependencies, ttlSeconds); + await this.linkDependencies(key, cached.dependencies, ttlSeconds); } return normalizedValue; @@ -356,11 +357,19 @@ export class ServicesCacheService { /** * Store dependencies metadata for a cache key */ - private async storeDependencies(key: string, dependencies: CacheDependencies): Promise { + private async storeDependencies( + key: string, + dependencies: CacheDependencies, + ttlSeconds: number | null + ): Promise { const normalized = this.normalizeDependencies(dependencies); if (normalized) { 
const metaKey = this.buildDependencyMetaKey(key); - await this.cache.set(metaKey, normalized); + if (ttlSeconds === null) { + await this.cache.set(metaKey, normalized); + } else { + await this.cache.set(metaKey, normalized, ttlSeconds); + } } } @@ -387,7 +396,11 @@ export class ServicesCacheService { return { productIds: Array.from(new Set(productIds)) }; } - private async linkDependencies(key: string, dependencies: CacheDependencies): Promise { + private async linkDependencies( + key: string, + dependencies: CacheDependencies, + ttlSeconds: number | null + ): Promise { const normalized = this.normalizeDependencies(dependencies); if (!normalized) { return; @@ -400,7 +413,11 @@ export class ServicesCacheService { if (!existing.includes(key)) { existing.push(key); } - await this.cache.set(indexKey, { keys: existing }); + if (ttlSeconds === null) { + await this.cache.set(indexKey, { keys: existing }); + } else { + await this.cache.set(indexKey, { keys: existing }, ttlSeconds); + } } } } diff --git a/docs/how-it-works/system-overview.md b/docs/how-it-works/system-overview.md index 5df1b15e..2b221b66 100644 --- a/docs/how-it-works/system-overview.md +++ b/docs/how-it-works/system-overview.md @@ -29,7 +29,7 @@ Purpose: explain what the portal does, which systems own which data, and how fre ## Caching & Freshness (Redis) -- Services catalog: event-driven (Salesforce CDC), no TTL; "volatile" bits use 60s TTL; eligibility per account is cached without TTL and invalidated on change. +- Services catalog: event-driven (Salesforce CDC) with a 12h safety TTL; "volatile" bits use 60s TTL; eligibility per account is event-driven with the same 12h safety TTL. - Orders: event-driven (Salesforce CDC), no TTL; invalidated when Salesforce emits order/order-item changes or when we create/provision an order. - Invoices: list cached 90s; invoice detail cached 5m; invalidated by WHMCS webhooks and by write operations. 
- Subscriptions/services: list cached 5m; single subscription cached 10m; invalidated on WHMCS cache busts (webhooks or profile updates). @@ -62,7 +62,7 @@ There are **two independent caching layers** involved: - **Redis (server-side) catalog cache**: - Catalog reads are cached in Redis via `ServicesCacheService`. - - For catalog data (plans/addons/etc) the TTL is intentionally **null** (no TTL): values persist until explicitly invalidated. + - Catalog + eligibility data are primarily invalidated by Salesforce events, but we also apply a **12-hour safety TTL** by default (configurable via `SERVICES_CACHE_SAFETY_TTL_SECONDS`; `0` disables it) to self-heal if events are missed. - Invalidation is driven by Salesforce **CDC** events (Product2 / PricebookEntry) and an account **Platform Event** for eligibility updates. - Result: even if the public catalog is requested millions of times, the BFF typically serves from Redis and only re-queries Salesforce when a relevant Salesforce change event arrives (or on cold start / cache miss). @@ -84,3 +84,9 @@ There are **two independent caching layers** involved: - **CDC subscription health / fallback behavior**: - If Salesforce CDC subscriptions are disabled or unhealthy, invalidations may not arrive and Redis caches can become stale until manually cleared. - Monitor the CDC subscriber and cache health metrics (`GET /api/health/services/cache`). + +### Future work (monitoring + resilience) + +- **CDC subscriber monitoring**: alert on disconnects and sustained lack of events (time since last processed event). +- **Replay cursor persistence**: store/restore a replay position across restarts to reduce missed-event risk. +- **Operational runbook**: document the “flush services caches” procedure for incidents where events were missed for an extended period.