Compare commits

...

2 Commits

Author SHA1 Message Date
JamesFlare1212
5fb60b069f fix(cache): preserve local data on remote 5xx errors
Only update cache when remote returns HTTP 200. On 5xx errors or timeouts,
preserve existing local cache instead of overwriting with empty/error data.
2026-04-08 12:07:51 -04:00
JamesFlare1212
fb68c1ad5d refactor(scan): remove multi-thread scan logic, use sequential processing 2026-04-08 12:04:27 -04:00
5 changed files with 77 additions and 423 deletions

View File

@@ -45,6 +45,7 @@ async function getCompleteCookies(userName: string, userPwd: string): Promise<st
/** /**
* Get activity details from API * Get activity details from API
* Only returns data on HTTP 200. Returns null on any error (5xx, timeout, etc.)
*/ */
async function getActivityDetailsRaw( async function getActivityDetailsRaw(
activityId: string, activityId: string,
@@ -73,6 +74,16 @@ async function getActivityDetailsRaw(
// Add additional timeout safety // Add additional timeout safety
maxRedirects: 5 maxRedirects: 5
}); });
// CRITICAL: Only accept HTTP 200. Reject all other status codes including 5xx
if (response.status !== 200) {
logger.error(`Non-200 status ${response.status} for activity ${activityId}. NOT updating cache to preserve local data.`);
if (attempt === maxRetries - 1) {
logger.error(`All ${maxRetries} retries failed with non-200 status for activity ${activityId}.`);
}
return null;
}
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`); logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`);
const outerData = JSON.parse(response.data); const outerData = JSON.parse(response.data);
if (outerData && typeof outerData.d === 'string') { if (outerData && typeof outerData.d === 'string') {
@@ -88,7 +99,7 @@ async function getActivityDetailsRaw(
} catch (error: any) { } catch (error: any) {
// Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors // Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors
// 404 (Not Found) is valid - activity doesn't exist // 404 (Not Found) is valid - activity doesn't exist
// Other 4xx errors should not trigger re-authentication // Other 4xx/5xx errors should not trigger re-authentication
if (error.response && (error.response.status === 401 || error.response.status === 403)) { if (error.response && (error.response.status === 401 || error.response.status === 403)) {
logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`); logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`);
throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status); throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status);
@@ -97,6 +108,10 @@ async function getActivityDetailsRaw(
if (error.response) { if (error.response) {
logger.error(`Status: ${error.response.status}, Data (getActivityDetailsRaw): ${ String(error.response.data).slice(0,100)}...`); logger.error(`Status: ${error.response.status}, Data (getActivityDetailsRaw): ${ String(error.response.data).slice(0,100)}...`);
// CRITICAL: 5xx errors should NOT update cache
if (error.response.status >= 500 && error.response.status < 600) {
logger.error(`Server error ${error.response.status} - preserving local cache, not updating.`);
}
} }
if (attempt === maxRetries - 1) { if (attempt === maxRetries - 1) {
logger.error(`All ${maxRetries} retries failed for activity ${activityId}.`); logger.error(`All ${maxRetries} retries failed for activity ${activityId}.`);

View File

@@ -21,7 +21,6 @@ MAX_ACTIVITY_ID_SCAN=8000
# Maximum concurrent API calls during crawling (default: 8) # Maximum concurrent API calls during crawling (default: 8)
# Higher values = faster crawling but more server load # Higher values = faster crawling but more server load
# Set to 1 for sequential processing (slow but safe)
CONCURRENT_API_CALLS=8 CONCURRENT_API_CALLS=8
# Request timeout in milliseconds (default: 25000 = 25 seconds) # Request timeout in milliseconds (default: 25000 = 25 seconds)

View File

@@ -14,7 +14,7 @@ import {
import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service'; import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service';
import { extractBase64Image } from '../utils/image-processor'; import { extractBase64Image } from '../utils/image-processor';
import { logger } from '../utils/logger'; import { logger } from '../utils/logger';
import { BatchProcessor, executeWithConcurrencyAndProgress } from '../utils/semaphore';
import type { ActivityData } from '../models/activity'; import type { ActivityData } from '../models/activity';
@@ -61,8 +61,7 @@ function extractObjectKeyFromUrl(url: string): string | null {
} }
} }
// Crawler concurrency configuration // Crawler configuration
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10);
const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10); const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10);
const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10); const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10);
const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10); const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10);
@@ -73,9 +72,10 @@ let skippedCount = 0;
/** /**
* Process and cache a single activity * Process and cache a single activity
* @param activityId - The activity ID to process * @param activityId - The activity ID to process
* @param forceUpdate - If true, update cache even on fetch failure (default: false)
* @returns The processed activity data * @returns The processed activity data
*/ */
async function processAndCacheActivity(activityId: string): Promise<ActivityData> { async function processAndCacheActivity(activityId: string, forceUpdate: boolean = false): Promise<ActivityData> {
logger.debug(`Processing activity ID: ${activityId}`); logger.debug(`Processing activity ID: ${activityId}`);
try { try {
if (!USERNAME || !PASSWORD) { if (!USERNAME || !PASSWORD) {
@@ -93,11 +93,21 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
let structuredActivity: ActivityData; let structuredActivity: ActivityData;
if (!activityJson) { if (!activityJson) {
logger.info(`No data found for activity ID ${activityId} from engage API. Caching as empty.`); // CRITICAL: Only cache empty data if forceUpdate is true
// This prevents 5xx errors from overwriting valid local data
if (forceUpdate) {
logger.info(`No data found for activity ID ${activityId} from engage API. Force updating cache.`);
structuredActivity = { structuredActivity = {
lastCheck: new Date().toISOString(), lastCheck: new Date().toISOString(),
source: 'api-fetch-empty' source: 'api-fetch-empty'
}; };
await setActivityData(activityId, structuredActivity);
return structuredActivity;
} else {
logger.warn(`No data for activity ${activityId}. Preserving existing cache - NOT updating.`);
const existingData = await getActivityData(activityId);
return existingData || { lastCheck: new Date().toISOString(), source: 'cache-preserved' };
}
} else { } else {
structuredActivity = await structActivityData(activityJson); structuredActivity = await structActivityData(activityJson);
if (structuredActivity && structuredActivity.photo && if (structuredActivity && structuredActivity.photo &&
@@ -125,12 +135,19 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
return structuredActivity; return structuredActivity;
} catch (error) { } catch (error) {
logger.error(`Error processing activity ID ${activityId}:`, error); logger.error(`Error processing activity ID ${activityId}:`, error);
// CRITICAL: On error, preserve existing cache instead of overwriting with error data
if (forceUpdate) {
const errorData: ActivityData = { const errorData: ActivityData = {
lastCheck: new Date().toISOString(), lastCheck: new Date().toISOString(),
error: "Failed to fetch or process" error: "Failed to fetch or process"
}; };
await setActivityData(activityId, errorData); await setActivityData(activityId, errorData);
return errorData; return errorData;
} else {
logger.warn(`Error fetching activity ${activityId}. Preserving existing cache.`);
const existingData = await getActivityData(activityId);
return existingData || { lastCheck: new Date().toISOString(), error: (error as Error).message };
}
} }
} }
@@ -155,11 +172,10 @@ async function processSingleActivity(activityId: string): Promise<void> {
/** /**
* Initialize the club cache by scanning through all activity IDs * Initialize the club cache by scanning through all activity IDs
* Processed concurrently with controlled parallelism * Processed sequentially
*/ */
export async function initializeClubCache(): Promise<void> { export async function initializeClubCache(): Promise<void> {
logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`); logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`);
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1; const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1;
let successCount = 0; let successCount = 0;
@@ -172,30 +188,23 @@ export async function initializeClubCache(): Promise<void> {
(_, i) => String(MIN_ACTIVITY_ID_SCAN + i) (_, i) => String(MIN_ACTIVITY_ID_SCAN + i)
); );
// Create batch processor with concurrency control // Process all activities sequentially
const processor = new BatchProcessor( for (let i = 0; i < activityIds.length; i++) {
async (activityId: string) => { const activityId = activityIds[i]!;
try {
await processSingleActivity(activityId); await processSingleActivity(activityId);
return activityId; successCount++;
}, } catch (error) {
CONCURRENT_API_CALLS,
{
onError: (error, activityId) => {
errorCount++; errorCount++;
logger.error(`Error processing activity ID ${activityId}:`, error); logger.error(`Error processing activity ID ${activityId}:`, error);
},
onProgress: (completed, total) => {
if (completed % 100 === 0 || completed === total) {
const mem = process.memoryUsage();
logger.info(`Progress: ${completed}/${total} (${Math.round(completed/total*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB | Concurrent: ${CONCURRENT_API_CALLS}`);
} }
}
}
);
// Process all activities concurrently // Log progress every 100 activities or at completion
const results = await processor.process(activityIds); if ((i + 1) % 100 === 0 || i === activityIds.length - 1) {
successCount = results.length; const mem = process.memoryUsage();
logger.info(`Progress: ${i + 1}/${totalIds} (${Math.round((i + 1) / totalIds * 100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed / 1024 / 1024)}MB`);
}
}
logger.info(`Initial club cache population finished.`); logger.info(`Initial club cache population finished.`);
logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`); logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
@@ -203,11 +212,10 @@ export async function initializeClubCache(): Promise<void> {
/** /**
* Update stale clubs in the cache * Update stale clubs in the cache
* Processed concurrently with controlled parallelism * Processed sequentially
*/ */
export async function updateStaleClubs(): Promise<void> { export async function updateStaleClubs(): Promise<void> {
logger.info('Starting stale club check...'); logger.info('Starting stale club check...');
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
const now = Date.now(); const now = Date.now();
const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000; const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000;
const activityKeys = await getAllActivityKeys(); const activityKeys = await getAllActivityKeys();
@@ -237,28 +245,21 @@ export async function updateStaleClubs(): Promise<void> {
logger.info(`Found ${staleActivityIds.length} stale activities to update.`); logger.info(`Found ${staleActivityIds.length} stale activities to update.`);
// Create batch processor for concurrent updates // Process stale activities sequentially
const processor = new BatchProcessor( for (let i = 0; i < staleActivityIds.length; i++) {
async (activityId: string) => { const activityId = staleActivityIds[i]!;
try {
logger.debug(`Updating stale activity ${activityId}`); logger.debug(`Updating stale activity ${activityId}`);
await processAndCacheActivity(activityId); await processAndCacheActivity(activityId);
return activityId; } catch (error) {
},
CONCURRENT_API_CALLS,
{
onError: (error, activityId) => {
logger.error(`Error updating stale activity ${activityId}:`, error); logger.error(`Error updating stale activity ${activityId}:`, error);
},
onProgress: (completed, total) => {
if (completed % 10 === 0 || completed === total) {
logger.info(`Update progress: ${completed}/${total} (${Math.round(completed/total*100)}%)`);
} }
}
}
);
// Process stale activities concurrently // Log progress every 10 activities or at completion
await processor.process(staleActivityIds); if ((i + 1) % 10 === 0 || i === staleActivityIds.length - 1) {
logger.info(`Update progress: ${i + 1}/${staleActivityIds.length} (${Math.round((i + 1) / staleActivityIds.length * 100)}%)`);
}
}
logger.info('Stale club check finished.'); logger.info('Stale club check finished.');
} }

View File

@@ -1,141 +0,0 @@
// test/test-concurrency.ts
/**
* Test script for concurrency features
* Run with: bun run test/test-concurrency.ts
*/
import { Semaphore, executeWithConcurrency, BatchProcessor } from '../utils/semaphore';
// Simulate an API call: resolves with a canned payload after `delay` ms.
function simulateApiCall(id: number, delay: number = 100): Promise<{ id: number; result: string }> {
  return new Promise((resolve) => {
    setTimeout(() => resolve({ id, result: `Result for ${id}` }), delay);
  });
}
/**
 * Test 1: basic Semaphore behavior.
 * Launches 10 tasks, each holding a permit across a 200ms simulated call;
 * a capacity of 3 should bound the observed concurrency, so total wall time
 * lands near ceil(10/3) * 200ms rather than the 2000ms sequential cost.
 */
async function testSemaphore(): Promise<void> {
  console.log('\n=== Test 1: Basic Semaphore ===');
  const semaphore = new Semaphore(3);
  const start = Date.now();
  const promises = [];
  for (let i = 1; i <= 10; i++) {
    // Capture the loop value so each IIFE logs its own id.
    const id = i;
    promises.push(
      (async () => {
        await semaphore.acquire();
        console.log(`[${id}] Acquired permit (available: ${semaphore.getAvailablePermits()})`);
        await simulateApiCall(id, 200);
        console.log(`[${id}] Releasing permit`);
        semaphore.release();
      })()
    );
  }
  await Promise.all(promises);
  const duration = Date.now() - start;
  console.log(`\n✓ Completed 10 tasks with max 3 concurrent in ${duration}ms`);
  console.log(` (Sequential would take ~2000ms, parallel should be ~700-800ms)`);
}
/**
 * Test 2: executeWithConcurrency helper.
 * Runs 10 simulated 100ms calls with a limit of 5 and prints the ordered
 * result ids (results are returned in input order, not completion order).
 */
async function testExecuteWithConcurrency(): Promise<void> {
  console.log('\n=== Test 2: executeWithConcurrency ===');
  const tasks = Array.from({ length: 10 }, (_, i) => () => simulateApiCall(i + 1, 100));
  const start = Date.now();
  const results = await executeWithConcurrency(tasks, 5);
  const duration = Date.now() - start;
  console.log(`✓ Completed ${results.length} tasks with max 5 concurrent in ${duration}ms`);
  console.log(` Results: ${results.map(r => r.id).join(', ')}`);
}
/**
 * Test 3: BatchProcessor over 20 items with concurrency 4.
 * Exercises the onProgress callback (logged every 5 completions and at the
 * end) and checks total wall time is roughly (20 / 4) * 50ms.
 */
async function testBatchProcessor(): Promise<void> {
  console.log('\n=== Test 3: BatchProcessor ===');
  const items = Array.from({ length: 20 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` }));
  // Incremented per processed item; retained as a side-effect counter even
  // though this test only reports results.length.
  let processedCount = 0;
  const processor = new BatchProcessor(
    async (item: { id: number; name: string }) => {
      await simulateApiCall(item.id, 50);
      processedCount++;
      return { ...item, processed: true };
    },
    4, // concurrency
    {
      onProgress: (completed, total) => {
        if (completed % 5 === 0 || completed === total) {
          console.log(` Progress: ${completed}/${total} (${Math.round(completed / total * 100)}%)`);
        }
      },
      onError: (error, item) => {
        console.error(` Error processing item ${item.id}:`, error.message);
      }
    }
  );
  const start = Date.now();
  const results = await processor.process(items);
  const duration = Date.now() - start;
  console.log(`✓ Processed ${results.length} items with max 4 concurrent in ${duration}ms`);
  console.log(` Expected ~500ms (20 items / 4 concurrent * 50ms each)`);
}
/**
 * Test 4: BatchProcessor error handling.
 * Even-numbered items deliberately throw; the onError callback should
 * absorb them so process() still resolves with only the odd (successful)
 * results instead of rejecting.
 */
async function testErrorHandling(): Promise<void> {
  console.log('\n=== Test 4: Error Handling ===');
  const items = [1, 2, 3, 4, 5];
  let errorCount = 0;
  const processor = new BatchProcessor(
    async (id: number) => {
      if (id % 2 === 0) {
        throw new Error(`Simulated error for ${id}`);
      }
      return { id, success: true };
    },
    3,
    {
      onError: (error, item) => {
        errorCount++;
        console.log(` Caught error for item ${item}: ${error.message}`);
      }
    }
  );
  const results = await processor.process(items);
  console.log(`✓ Completed: ${results.length} success, ${errorCount} errors (errors handled gracefully)`);
}
/**
 * Entry point: runs the four concurrency test scenarios in sequence and
 * prints configuration hints. Exits with code 1 if any scenario throws.
 */
async function main(): Promise<void> {
  console.log('╔═══════════════════════════════════════════════════╗');
  console.log('║ Concurrency Module Test Suite ║');
  console.log('╚═══════════════════════════════════════════════════╝');
  try {
    await testSemaphore();
    await testExecuteWithConcurrency();
    await testBatchProcessor();
    await testErrorHandling();
    console.log('\n╔═══════════════════════════════════════════════════╗');
    console.log('║ All tests passed! ✓ ║');
    console.log('╚═══════════════════════════════════════════════════╝');
    console.log('\n📝 Configuration:');
    console.log(' - Set CONCURRENT_API_CALLS in .env to control parallelism');
    console.log(' - Current default: 8 concurrent requests');
    console.log(' - Example: CONCURRENT_API_CALLS=16 for faster crawling');
    console.log('');
  } catch (error) {
    // Non-zero exit so CI/scripts can detect a failed suite.
    console.error('\n❌ Test failed:', error);
    process.exit(1);
  }
}
main();

View File

@@ -1,220 +0,0 @@
// utils/semaphore.ts
/**
 * Semaphore implementation for controlling concurrent operations.
 * At most `capacity` callers may hold a permit at once; further acquirers
 * wait (FIFO) until a permit is released.
 * Based on patterns from civitai/civitai and p-queue.
 */
export class Semaphore {
  private readonly capacity: number;
  private permits: number;
  // FIFO queue of resolvers for callers waiting on acquire().
  private queue: Array<() => void> = [];

  /**
   * @param capacity Maximum number of concurrent permit holders.
   * @throws Error if capacity is less than 1.
   */
  constructor(capacity: number) {
    if (capacity < 1) {
      throw new Error('Semaphore capacity must be at least 1');
    }
    this.capacity = capacity;
    this.permits = capacity;
  }

  /**
   * Acquire a permit. If none available, waits until one is released.
   */
  async acquire(): Promise<void> {
    return new Promise<void>((resolve) => {
      if (this.permits > 0) {
        this.permits--;
        resolve();
      } else {
        // No free permit: park the resolver until release() hands one over.
        this.queue.push(() => {
          resolve();
        });
      }
    });
  }

  /**
   * Release a permit. If a task is waiting, the permit is handed directly
   * to it (the permit count stays at 0); otherwise the permit is returned
   * to the pool.
   * @throws Error on an unbalanced release (more release() than acquire()
   *         calls). Previously this silently grew `permits` past `capacity`,
   *         quietly raising the effective concurrency limit.
   */
  release(): void {
    if (this.queue.length > 0) {
      const next = this.queue.shift();
      if (next) {
        next();
      }
    } else {
      // FIX: guard against over-release inflating the permit pool.
      if (this.permits >= this.capacity) {
        throw new Error('Semaphore released more times than acquired');
      }
      this.permits++;
    }
  }

  /**
   * Get current available permits.
   */
  getAvailablePermits(): number {
    return this.permits;
  }

  /**
   * Get total capacity.
   */
  getCapacity(): number {
    return this.capacity;
  }

  /**
   * Get number of waiting tasks.
   */
  getWaitingCount(): number {
    return this.queue.length;
  }
}
/**
* Execute async tasks with concurrency limit
* @param tasks Array of async task functions
* @param concurrency Maximum concurrent tasks
* @returns Promise that resolves with all results when complete
*/
export async function executeWithConcurrency<T>(
tasks: Array<() => Promise<T>>,
concurrency: number
): Promise<T[]> {
const semaphore = new Semaphore(concurrency);
const results: T[] = new Array(tasks.length);
const promises = tasks.map(async (task, index) => {
await semaphore.acquire();
try {
results[index] = await task();
return results[index];
} finally {
semaphore.release();
}
});
return Promise.all(promises);
}
/**
* Execute async tasks with concurrency limit and progress callback
* @param tasks Array of async task functions
* @param concurrency Maximum concurrent tasks
* @param onProgress Callback with (completed, total, result) after each task
* @returns Promise that resolves with all results when complete
*/
export async function executeWithConcurrencyAndProgress<T>(
tasks: Array<() => Promise<T>>,
concurrency: number,
onProgress?: (completed: number, total: number, result: T, error?: Error) => void
): Promise<T[]> {
const semaphore = new Semaphore(concurrency);
const results: T[] = new Array(tasks.length);
let completed = 0;
const total = tasks.length;
const promises = tasks.map(async (task, index) => {
await semaphore.acquire();
try {
results[index] = await task();
completed++;
onProgress?.(completed, total, results[index]!);
return results[index];
} catch (error) {
completed++;
onProgress?.(completed, total, undefined as T, error as Error);
throw error;
} finally {
semaphore.release();
}
});
return Promise.all(promises);
}
/**
* Batch processor with concurrency control
* Useful for processing large arrays in chunks with controlled concurrency
*/
export class BatchProcessor<T, R> {
private semaphore: Semaphore;
private processor: (item: T, index: number) => Promise<R>;
private onError?: (error: Error, item: T, index: number) => void;
private onProgress?: (completed: number, total: number) => void;
constructor(
processor: (item: T, index: number) => Promise<R>,
concurrency: number,
options?: {
onError?: (error: Error, item: T, index: number) => void;
onProgress?: (completed: number, total: number) => void;
}
) {
this.processor = processor;
this.semaphore = new Semaphore(concurrency);
this.onError = options?.onError;
this.onProgress = options?.onProgress;
}
/**
* Process an array of items with concurrency control
* Only returns successful results, errors are handled by onError callback
*/
async process(items: T[]): Promise<Awaited<R>[]> {
const results: (Awaited<R> | undefined)[] = new Array(items.length);
let completed = 0;
const total = items.length;
const promises = items.map(async (item, index) => {
await this.semaphore.acquire();
try {
const result = await this.processor(item, index);
completed++;
this.onProgress?.(completed, total);
return result;
} catch (error) {
completed++;
this.onProgress?.(completed, total);
this.onError?.(error as Error, item, index);
return undefined;
} finally {
this.semaphore.release();
}
});
const allResults = await Promise.all(promises);
return allResults.filter((r): r is Awaited<R> => r !== undefined);
}
/**
* Process an array and return both results and errors
*/
async processWithErrors(items: T[]): Promise<{
results: R[];
errors: Array<{ error: Error; item: T; index: number }>;
}> {
const results: R[] = [];
const errors: Array<{ error: Error; item: T; index: number }> = [];
let completed = 0;
const total = items.length;
const promises = items.map(async (item, index) => {
await this.semaphore.acquire();
try {
const result = await this.processor(item, index);
results.push(result);
completed++;
this.onProgress?.(completed, total);
} catch (error) {
completed++;
this.onProgress?.(completed, total);
errors.push({ error: error as Error, item, index });
} finally {
this.semaphore.release();
}
});
await Promise.all(promises);
return { results, errors };
}
}