Compare commits


21 Commits

Author SHA1 Message Date
JamesFlare1212
f21a400c82 fix(auth): prevent cookie loss during remote server timeout storms
Server timeouts caused orphaned fetchActivityData calls to fire clearCookieCache()
asynchronously, destroying cookies for all concurrent callers. Three fixes:

1. Replace Promise.race timeout with AbortController to properly cancel
   orphaned fetches and prevent delayed clearCookieCache() calls
2. Add cookie backup/restore — backupCookies() before clearCookieCache(),
   restoreCookieBackup() if re-login fails, so cookies are never lost
3. Add 15s auth failure throttle to block thundering herd re-logins when
   server slowdowns generate many 500 errors simultaneously
2026-04-23 03:06:15 -04:00
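The Promise.race-vs-AbortController distinction in fix 1 can be sketched in a few lines. This is an illustrative helper, not the repo's actual code; axios accepts the same `signal` option on its request config:

```typescript
// Minimal sketch: Promise.race only abandons the *result* of a slow request.
// The request itself keeps running, and its error path can still call
// clearCookieCache() later. Aborting via AbortController cancels the request,
// so no orphaned cleanup fires. The helper name is illustrative.
function fetchWithTimeout<T>(
  doFetch: (signal: AbortSignal) => Promise<T>,
  timeoutMs: number
): Promise<T> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  // Pass the signal down to axios/fetch; clear the timer whether we win or lose.
  return doFetch(controller.signal).finally(() => clearTimeout(timer));
}
```

With axios, `doFetch` would be something like `signal => axios.post(url, payload, { signal })`.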
JamesFlare1212
73e953f579 fix(auth): distinguish 500 (cookie expired) from other 5xx (real outage)
KEY INSIGHT:
- 500 = cookie expiration (early signal, re-login immediately)
- 502/503/504 = real server outage (bad gateway, service unavailable, gateway timeout)

BEHAVIOR:
- On 500: throw AuthenticationError → immediate re-login
- On 502/503/504: preserve cache, don't re-login (server is down)
- On 401/403: throw AuthenticationError → re-login

This prevents unnecessary re-login attempts during actual server outages
while still handling cookie expiration immediately.
2026-04-11 11:43:35 -04:00
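The status rule above boils down to a small classifier. This sketch uses our own names (`authActionFor` is not an identifier from the codebase):

```typescript
type AuthAction = "relogin" | "preserve-cache" | "none";

// 500 (and 401/403) mean the cookie is bad: re-login immediately.
// 502/503/504 mean the server itself is down: keep the cache, don't re-login.
// Everything else (200 OK, 404 activity-not-found, ...) needs no auth action.
function authActionFor(status: number): AuthAction {
  if (status === 500 || status === 401 || status === 403) return "relogin";
  if (status === 502 || status === 503 || status === 504) return "preserve-cache";
  return "none";
}
```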
JamesFlare1212
71116f9f6e fix(auth): treat 5xx as cookie expiration and re-login immediately
KEY DISCOVERY: 5xx errors are early signs of cookie expiration.
The backend returns 500 when cookie is expired but session not yet invalidated.
It takes several hours before it returns 401/403.

CHANGES:
1. On 5xx: throw AuthenticationError to trigger immediate re-login
2. Removed cookie validation logic (no longer needed)
3. Cache still preserved during re-login process
4. Re-login happens within same request, not on next request

This fixes the issue where expired cookies would cause 5xx errors
for hours before any re-login attempt was made.
2026-04-11 10:55:31 -04:00
JamesFlare1212
13eccdd3cc fix(validation): use activity 3350 and detect server outage
Cookie validation improvements:
1. Validate with activity ID 3350 (more reliable test endpoint)
2. Distinguish 5xx (outage) from 4xx (invalid cookie) during validation
3. On 5xx during validation: preserve cookie, don't re-login (server outage)
4. On 401/403 during validation: clear cookie and re-login
5. On network error during validation: preserve cookie (treat as server issue)

This prevents unnecessary re-logins during server outages.
2026-04-10 23:41:51 -04:00
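A hedged sketch of the validation policy listed above: fail fast on 401/403, retry and ultimately preserve the cookie on 5xx or network errors. The `probe` callback and backoff constants are illustrative, not taken from the codebase:

```typescript
// probe() performs one validation request (e.g. fetch activity 3350) and
// returns the HTTP status; it throws on network errors or timeouts.
async function validateCookieWithRetry(
  probe: () => Promise<number>,
  maxRetries = 3
): Promise<boolean> {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      const status = await probe();
      if (status === 401 || status === 403) return false; // cookie invalid: re-login
      if (status < 500) return true;                      // 2xx (or other 4xx): cookie fine
      // 5xx: server outage -- fall through and retry
    } catch {
      // network error: treat like a server issue and retry
    }
    if (attempt < maxRetries) {
      await new Promise(resolve => setTimeout(resolve, 1000 * attempt)); // linear backoff
    }
  }
  return true; // server-side trouble: preserve the cookie, don't re-login
}
```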
JamesFlare1212
c447dc51ee fix(cache): prevent data loss on 5xx and validate cookies
Critical fixes:
1. getActivityDetailsRaw never throws on 5xx - returns null immediately
2. Cache-manager preserves existing data when fetch returns null
3. After 5xx error, validate cookie on next request (backend may invalidate sessions)
4. Cookie validation: fetch activity ID 1 to test, re-login if fails

This prevents local cache corruption during server outages.
2026-04-10 23:41:18 -04:00
JamesFlare1212
5fb60b069f fix(cache): preserve local data on remote 5xx errors
Only update cache when remote returns HTTP 200. On 5xx errors or timeouts,
preserve existing local cache instead of overwriting with empty/error data.
2026-04-08 12:07:51 -04:00
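The cache rule reduces to one guard: only overwrite when the fetch actually succeeded. A minimal sketch with an in-memory Map standing in for Redis (names are ours):

```typescript
// fetchRemote resolves to the payload on HTTP 200 and to null on 5xx/timeout
// (mirroring getActivityDetailsRaw's contract). On null, the existing cache
// entry is returned untouched instead of being overwritten.
async function updateCacheEntry(
  fetchRemote: () => Promise<string | null>,
  cache: Map<string, string>,
  key: string
): Promise<string | null> {
  const fresh = await fetchRemote().catch(() => null);
  if (fresh !== null) cache.set(key, fresh); // success: update
  return cache.get(key) ?? null;             // failure: preserve
}
```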
JamesFlare1212
fb68c1ad5d refactor(scan): remove multi-thread scan logic, use sequential processing 2026-04-08 12:04:27 -04:00
JamesFlare1212
78c050a6fa refactor(s3): remove automatic image deletion, users manage S3 files 2026-04-08 10:29:27 -04:00
JamesFlare1212
1e234624fb fix(s3): URL mismatch 2026-04-08 00:00:44 -04:00
JamesFlare1212
6c58eacc8f fix(s3): race condition and mismatched URL in Redis 2026-04-07 23:21:54 -04:00
JamesFlare1212
bbbd59be94 fix(s3): update logic that cleans all files in S3 2026-04-07 22:45:10 -04:00
JamesFlare1212
ea9e9ec121 remove(proxy): remove warp-proxy 2026-04-07 18:19:13 -04:00
JamesFlare1212
0a133159e8 refactor(scan): implement multi-threaded concurrent crawling
- Add Semaphore class to control concurrency
- Add BatchProcessor with progress callbacks
- Refactor initializeClubCache and updateStaleClubs to run concurrently
- Fix cookie 4xx check logic (only 401/403 trigger re-login)
- Add environment variables: CONCURRENT_API_CALLS, etc.
- Add concurrency test script test-concurrency.ts

Performance: sequential processing replaced with configurable concurrency (default: 8 workers)
Bug fix: 404 errors are no longer misclassified as authentication failures
2026-04-07 18:18:18 -04:00
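A counting semaphore of the kind this commit describes can be sketched in a few lines. The real implementation lives in utils/semaphore.ts and is not shown here; this is an assumed shape:

```typescript
// Counting semaphore: at most `permits` tasks run at once; the rest queue up.
class Semaphore {
  private waiters: (() => void)[] = [];
  constructor(private permits: number) {}

  async acquire(): Promise<void> {
    if (this.permits > 0) {
      this.permits--;
      return;
    }
    await new Promise<void>(resolve => this.waiters.push(resolve));
  }

  release(): void {
    const next = this.waiters.shift();
    if (next) next();    // hand the permit directly to a waiting task
    else this.permits++;
  }

  // Convenience wrapper: acquire, run, always release.
  async run<T>(task: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await task();
    } finally {
      this.release();
    }
  }
}
```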
JamesFlare1212
fc98dbbbae fix(scan): remove p-limit 2026-04-07 08:38:15 -04:00
JamesFlare1212
af493446ac fix(redis): remove max mem size 2026-04-07 08:32:13 -04:00
JamesFlare1212
821df1c51f fix(scan): prevent exponential slowdown from event loop blocking
- Reduce default CONCURRENT_API_CALLS from 10 to 5 (Sharp AVIF is CPU-intensive)
- Create fresh p-limit instance per batch instead of module singleton
- Add garbage collection hint between batches
- Fix skippedCount tracking (was never incremented)
- Increase batch delay from 100ms to 500ms for event loop drainage
2026-04-07 07:35:48 -04:00
JamesFlare1212
573a9b3f4c fix(scan): batch processing and timeout reduction to prevent stall at 20%
- Process activities in batches of 100 instead of 5001 promises upfront
- Clear promise array after each batch to free memory (85MB→15MB peak)
- Reduce API timeout from 20s to 10s and retries from 3 to 2
- Total time per failed request: 63s→23s (63% faster failure)
- Expected total scan time: 8.5h→1.5h (82% faster)
2026-04-07 07:19:46 -04:00
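The batching change above, sketched with an illustrative helper (batch size and names are ours):

```typescript
// Process items in fixed-size batches instead of creating thousands of
// promises upfront; each batch's promises become collectable once its loop
// iteration finishes, which is where the 85MB -> 15MB peak reduction came from.
async function processInBatches<T>(
  items: T[],
  batchSize: number,
  worker: (item: T) => Promise<void>
): Promise<void> {
  for (let i = 0; i < items.length; i += batchSize) {
    const batch = items.slice(i, i + batchSize);
    await Promise.allSettled(batch.map(worker)); // one failure doesn't sink the batch
  }
}
```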
JamesFlare1212
b426861b56 add(docker): extra hosts 2026-04-07 00:12:58 -04:00
JamesFlare1212
6fa6d83e91 clean up 2026-04-07 00:09:38 -04:00
JamesFlare1212
92b12a6a85 fix(scan): prevent progressive slowdown with mutex, batching, and connection pooling
- Add mutex to cron jobs to prevent overlapping runs
- Replace Promise.all with batched processing (50/batch) in updateStaleClubs
- Configure HTTP connection pooling with keep-alive (maxSockets: 50)
- Add memory monitoring to scan progress logs
- Reduce CONCURRENT_API_CALLS from 8 to 5 to reduce Sharp memory pressure
2026-04-07 00:00:56 -04:00
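The cron mutex here is just a boolean guard; because Node's event loop is single-threaded, no real lock is needed. A sketch (the wrapper name is ours):

```typescript
// Wrap an async job so that a new tick is skipped while the previous run
// is still in flight -- this is the "mutex" preventing overlapping cron runs.
function nonOverlapping(job: () => Promise<void>): () => void {
  let running = false;
  return () => {
    if (running) return;   // previous run still in flight: skip this tick
    running = true;
    job().finally(() => {
      running = false;
    });
  };
}
```

Used as `setInterval(nonOverlapping(updateStaleClubs), intervalMs)`.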
JamesFlare1212
eca0f1aec3 clean up 2026-04-06 23:11:13 -04:00
12 changed files with 744 additions and 380 deletions


@@ -11,7 +11,6 @@
"crypto": "^1.0.1",
"dotenv": "^16.5.0",
"express": "^5.1.0",
"p-limit": "^6.2.0",
"pangu": "^4.0.7",
"sharp": "^0.34.1",
"uuid": "^11.1.0",
@@ -202,8 +201,6 @@
"once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
"p-limit": ["p-limit@6.2.0", "", { "dependencies": { "yocto-queue": "^1.1.1" } }, "sha512-kuUqqHNUqoIWp/c467RI4X6mmyuojY5jGutNU0wVTmEOOfcuwLqyMVoAi9MKi2Ak+5i9+nhmrK4ufZE8069kHA=="],
"pangu": ["pangu@4.0.7", "", { "bin": { "pangu": "./dist/node/cli.js" } }, "sha512-weZKJIwwy5gjt4STGVUH9bix3BGk7wZ2ahtIypwe3e/mllsrIZIvtfLx1dPX56GcpZFOCFKmeqI1qVuB9enRzA=="],
"parseurl": ["parseurl@1.3.3", "", {}, "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="],
@@ -272,8 +269,6 @@
"wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="],
"yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="],
"form-data/mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="],
"form-data/mime-types/mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="],


@@ -1,18 +1,4 @@
services:
warp-proxy:
image: ghcr.io/mon-ius/docker-warp-socks:v5
container_name: dsas-cca-warp-proxy
restart: unless-stopped
ports:
- "9091:9091"
networks:
- cca_network
logging:
driver: "json-file"
options:
max-size: "15m"
max-file: "3"
app:
build:
context: .
@@ -28,23 +14,10 @@ services:
environment:
- NODE_ENV=production
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
# Proxy configuration (only active when USE_PROXY=true)
- USE_PROXY=${USE_PROXY:-false}
- HTTP_PROXY=${HTTP_PROXY:-}
- HTTPS_PROXY=${HTTPS_PROXY:-}
- ALL_PROXY=${ALL_PROXY:-}
restart: unless-stopped
depends_on:
redis:
condition: service_healthy
warp-proxy:
condition: service_started
volumes:
- ./services/cookies.json:/usr/src/app/services/cookies.json
networks:
- cca_network
extra_hosts:
- "host.docker.internal:host-gateway"
logging:
driver: "json-file"
options:
@@ -54,13 +27,10 @@ services:
redis:
image: "redis:8.0-alpine"
container_name: dsas-cca-redis
# Add memory limits to prevent OOM
command: redis-server --requirepass "dsas-cca" --maxmemory 512mb --maxmemory-policy allkeys-lru
command: redis-server --requirepass "dsas-cca"
volumes:
- ./redis_data:/data
restart: unless-stopped
networks:
- cca_network
healthcheck:
test: ["CMD", "redis-cli", "-a", "dsas-cca", "ping"]
interval: 10s
@@ -71,7 +41,3 @@ services:
options:
max-size: "15m"
max-file: "3"
networks:
cca_network:
driver: bridge


@@ -1,13 +1,15 @@
// engage-api/get-activity.ts
import axios from 'axios';
import axios, { type AxiosRequestConfig } from 'axios';
import { logger } from '../utils/logger';
import {
loginWithPlaywright,
ensureSingleLogin,
loadCachedCookies,
saveCookiesToCache,
clearCookieCache,
getCachedCookieString
getCachedCookieString,
backupCookies,
restoreCookieBackup,
tryAcquireAuthLock,
releaseAuthCooldown
} from '../services/playwright-auth';
// Define interfaces for our data structures
@@ -28,68 +30,6 @@ class AuthenticationError extends Error {
}
}
/**
* Test cookie validity by calling API
*/
async function testCookieValidityWithApi(cookieString: string): Promise<boolean> {
if (!cookieString) return false;
logger.debug('Testing cookie validity via API...');
const MAX_RETRIES = 3;
let attempt = 0;
while (attempt < MAX_RETRIES) {
try {
attempt++;
const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
const headers = {
'Content-Type': 'application/json; charset=UTF-8',
'Cookie': cookieString,
'User-Agent': 'Mozilla/5.0 (Bun DSAS-CCA get-activity Module)',
};
const payload = {
"activityID": "3350"
};
logger.debug(`Attempt ${attempt}/${MAX_RETRIES}`);
const response = await axios.post(url, payload, {
headers,
timeout: 20000
});
// Check for 4xx errors (auth failures)
if (response.status >= 400 && response.status < 500) {
logger.warn(`Cookie test returned ${response.status}, likely invalid`);
return false;
}
logger.debug('Cookie test successful (API responded 2xx). Cookie is valid.');
return true;
} catch (error: any) {
logger.warn(`Cookie validity test failed (attempt ${attempt}/${MAX_RETRIES}).`);
if (error.response) {
// 4xx = auth failure (immediate fail)
if (error.response.status >= 400 && error.response.status < 500) {
logger.warn(`Cookie test API response status: ${error.response.status} (auth error)`);
return false;
}
// 5xx = server error (retry with delay)
logger.warn(`Cookie test API response status: ${error.response.status} (server error, retrying...)`);
} else {
// No response (000 status, network error, timeout)
logger.warn(`Network/timeout error: ${error.message} (retrying...)`);
}
if (attempt < MAX_RETRIES) {
await new Promise(resolve => setTimeout(resolve, 1000 * (attempt + 1)));
}
}
}
logger.warn('Max retries reached. Cookie is likely invalid or expired.');
return false;
}
/**
* Get complete cookies using Playwright with single login lock
*/
@@ -108,12 +48,14 @@ async function getCompleteCookies(userName: string, userPwd: string): Promise<st
/**
* Get activity details from API
* Only returns data on HTTP 200. Returns null on any error (5xx, timeout, etc.)
*/
async function getActivityDetailsRaw(
activityId: string,
cookies: string,
maxRetries: number = 3,
timeoutMilliseconds: number = 20000
timeoutMilliseconds: number = 10000,
signal?: AbortSignal
): Promise<string | null> {
const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
const headers = {
@@ -127,12 +69,41 @@ async function getActivityDetailsRaw(
};
for (let attempt = 0; attempt < maxRetries; attempt++) {
if (signal?.aborted) {
logger.debug(`Activity ${activityId} aborted before attempt ${attempt + 1}`);
return null;
}
try {
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Sending POST request to ${url}`);
const response = await axios.post(url, payload, {
headers,
timeout: timeoutMilliseconds,
responseType: 'text'
responseType: 'text',
signal,
maxRedirects: 5
});
// CRITICAL: Only accept HTTP 200. Reject all other status codes including 5xx
if (response.status !== 200) {
logger.error(`Non-200 status ${response.status} for activity ${activityId}. NOT updating cache to preserve local data.`);
// IMPORTANT: Only 500 is cookie expiration. Other 5xx (502/503/504) are real server outages.
// The backend returns 500 when cookie is expired but session not yet invalidated.
// It takes several hours before it returns 401/403.
// 502/503/504 are real server errors (bad gateway, service unavailable, gateway timeout)
if (response.status === 500) {
logger.warn(`Server error 500 - this is cookie expiration. Throwing AuthenticationError to trigger immediate re-login.`);
throw new AuthenticationError(`Received 500 for activity ${activityId} - expired cookie`, 500);
} else if (response.status >= 500 && response.status < 600) {
// Real server outage (502/503/504), preserve cache and don't re-login
logger.error(`Real server outage ${response.status} - preserving local cache, not re-login.`);
}
// Return null immediately on non-200 errors
return null;
}
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`);
const outerData = JSON.parse(response.data);
if (outerData && typeof outerData.d === 'string') {
const innerData = JSON.parse(outerData.d);
@@ -145,8 +116,11 @@ async function getActivityDetailsRaw(
logger.error(`Unexpected API response structure for activity ${activityId}.`);
}
} catch (error: any) {
// Check if response status is in 4xx range (400-499) to trigger auth error
if (error.response && error.response.status >= 400 && error.response.status < 500) {
// Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors
// 404 (Not Found) is valid - activity doesn't exist
// Other 4xx/5xx errors should not trigger re-authentication
if (error.response && (error.response.status === 401 || error.response.status === 403)) {
logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`);
throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status);
}
@@ -154,10 +128,21 @@ async function getActivityDetailsRaw(
if (error.response) {
logger.error(`Status: ${error.response.status}, Data (getActivityDetailsRaw): ${ String(error.response.data).slice(0,100)}...`);
// IMPORTANT: Only 500 is cookie expiration. Other 5xx (502/503/504) are real server outages.
// The backend returns 500 when cookie is expired but session not yet invalidated.
// 502/503/504 are real server errors (bad gateway, service unavailable, gateway timeout)
if (error.response.status === 500) {
logger.warn(`Server error 500 - this is cookie expiration. Throwing AuthenticationError to trigger immediate re-login.`);
throw new AuthenticationError(`Received 500 for activity ${activityId} - expired cookie`, 500);
} else if (error.response.status >= 500 && error.response.status < 600) {
// Real server outage (502/503/504), preserve cache and don't re-login
logger.error(`Real server outage ${error.response.status} - preserving local cache, not re-login.`);
}
}
if (attempt === maxRetries - 1) {
logger.error(`All ${maxRetries} retries failed for activity ${activityId}.`);
throw error;
// Don't throw on network/timeout errors, just return null to preserve cache
return null;
}
await new Promise(resolve => setTimeout(resolve, 1000 * (attempt + 1)));
}
@@ -177,7 +162,8 @@ export async function fetchActivityData(
activityId: string,
userName: string,
userPwd: string,
forceLogin: boolean = false
forceLogin: boolean = false,
signal?: AbortSignal
): Promise<any | null> {
let currentCookie = forceLogin ? null : await getCachedCookieString();
@@ -187,17 +173,10 @@ export async function fetchActivityData(
currentCookie = null;
}
// Optimization: Skip pre-validation, directly request data
// Only validate/re-login when we get 4xx error (fail-fast strategy)
if (!currentCookie) {
logger.info('No cached cookie found. Attempting login...');
try {
currentCookie = await getCompleteCookies(userName, userPwd);
const cookies = await loadCachedCookies();
if (cookies) {
await saveCookiesToCache(cookies);
}
} catch (loginError) {
logger.error(`Login process failed: ${(loginError as Error).message}`);
return null;
@@ -209,10 +188,13 @@ export async function fetchActivityData(
return null;
}
logger.debug('Using cached cookie for API request.');
try {
const rawActivityDetailsString = await getActivityDetailsRaw(activityId, currentCookie);
logger.debug(`Calling getActivityDetailsRaw for activity ${activityId}...`);
const rawActivityDetailsString = await getActivityDetailsRaw(activityId, currentCookie, 3, 10000, signal);
logger.debug(`getActivityDetailsRaw returned for activity ${activityId}`);
if (rawActivityDetailsString) {
const parsedOuter = JSON.parse(rawActivityDetailsString);
return JSON.parse(parsedOuter.d);
@@ -220,22 +202,28 @@ export async function fetchActivityData(
logger.warn(`No data returned from getActivityDetailsRaw for activity ${activityId}, but no authentication error was thrown.`);
return null;
} catch (error) {
if (signal?.aborted) {
logger.debug(`Activity ${activityId} fetch aborted.`);
return null;
}
if (error instanceof AuthenticationError) {
// Cookie returned 4xx, now validate and re-login
logger.warn(`API returned 4xx error (Status: ${error.status}). Cookie may be invalid. Attempting re-login and retry.`);
// Throttle: prevent thundering herd from multiple 500 errors
if (!tryAcquireAuthLock()) {
logger.info(`Auth throttled for activity ${activityId}. Reusing current cookies — likely still valid.`);
return null;
}
// Backup cookies before clearing so we can restore on re-login failure
backupCookies();
await clearCookieCache();
try {
logger.info('Attempting re-login due to authentication failure...');
currentCookie = await getCompleteCookies(userName, userPwd);
const cookies = await loadCachedCookies();
if (cookies) {
await saveCookiesToCache(cookies);
}
releaseAuthCooldown();
logger.info('Re-login successful. Retrying request for activity details...');
const rawActivityDetailsStringRetry = await getActivityDetailsRaw(activityId, currentCookie);
const rawActivityDetailsStringRetry = await getActivityDetailsRaw(activityId, currentCookie, 1, 10000, signal);
if (rawActivityDetailsStringRetry) {
const parsedOuterRetry = JSON.parse(rawActivityDetailsStringRetry);
return JSON.parse(parsedOuterRetry.d);
@@ -243,7 +231,9 @@ export async function fetchActivityData(
logger.warn(`Still no details for activity ${activityId} after re-login and retry.`);
return null;
} catch (retryLoginOrFetchError) {
logger.error(`Error during re-login or retry fetch for activity ${activityId}: ${(retryLoginOrFetchError as Error).message}`);
logger.error(`Re-login or retry failed for activity ${activityId}: ${(retryLoginOrFetchError as Error).message}`);
// Restore old cookies instead of leaving cache empty
await restoreCookieBackup();
return null;
}
} else {


@@ -11,26 +11,34 @@ S3_SECRET_ACCESS_KEY=
S3_REGION=
S3_PUBLIC_URL_PREFIX=files
REDIS_URL=redis://:dsas-cca@redis:6379
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
# ============================================================================
# CRAWLER CONCURRENCY CONFIGURATION
# ============================================================================
MIN_ACTIVITY_ID_SCAN=3000
MAX_ACTIVITY_ID_SCAN=8000
CONCURRENT_API_CALLS=16
# Maximum concurrent API calls during crawling (default: 8)
# Higher values = faster crawling but more server load
CONCURRENT_API_CALLS=8
# Request timeout in milliseconds (default: 25000 = 25 seconds)
CRAWLER_REQUEST_TIMEOUT_MS=25000
# Maximum retries per request on transient errors (default: 3)
CRAWLER_MAX_RETRIES=3
# Delay between retries in milliseconds (default: 1000 = 1 second)
CRAWLER_RETRY_DELAY_MS=1000
# Rate limit: maximum requests per minute (default: unlimited)
# Set to 0 for no limit
CRAWLER_REQUESTS_PER_MINUTE=0
STAFF_UPDATE_INTERVAL_MINS=360
CLUB_UPDATE_INTERVAL_MINS=360
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
# Cache TTL Configuration (in seconds)
ACTIVITY_CACHE_TTL=86400 # 24 hours for normal activity data
STAFF_CACHE_TTL=86400 # 24 hours for staff data
ERROR_CACHE_TTL=3600 # 1 hour for error states (allows retry)
# Proxy Configuration (Optional)
# Set USE_PROXY=true to enable proxy for Playwright requests
USE_PROXY=false
# Custom proxy server (default: socks5://warp-proxy:9091 when using warp-proxy service)
# Examples:
# HTTP: http://proxy.example.com:8080
# SOCKS5: socks5://proxy.example.com:1080
# Warp: socks5://warp-proxy:9091
ALL_PROXY=
HTTP_PROXY=
HTTPS_PROXY=


@@ -2,6 +2,24 @@
import express, { Request, Response } from 'express';
import { config } from 'dotenv';
import cors from 'cors';
import http from 'http';
import https from 'https';
import axios from 'axios';
// Configure HTTP connection pooling with keep-alive
axios.defaults.httpAgent = new http.Agent({
keepAlive: true,
maxSockets: 50,
maxFreeSockets: 10,
timeout: 30000
});
axios.defaults.httpsAgent = new https.Agent({
keepAlive: true,
maxSockets: 50,
maxFreeSockets: 10,
timeout: 30000
});
import { fetchActivityData } from './engage-api/get-activity';
import { structActivityData } from './engage-api/struct-activity';
import { structStaffData } from './engage-api/struct-staff';
@@ -20,8 +38,7 @@ import { extractBase64Image } from './utils/image-processor';
import {
initializeClubCache,
updateStaleClubs,
initializeOrUpdateStaffCache,
cleanupOrphanedS3Images
initializeOrUpdateStaffCache
} from './services/cache-manager';
import { logger } from './utils/logger';
import type { ActivityData } from './models/activity'
@@ -48,6 +65,10 @@ config();
const USERNAME = process.env.API_USERNAME;
const PASSWORD = process.env.API_PASSWORD;
const PORT = process.env.PORT || 3000;
// Mutex flags to prevent overlapping cron runs
let isUpdatingClubs = false;
let isUpdatingStaff = false;
const FIXED_STAFF_ACTIVITY_ID = process.env.FIXED_STAFF_ACTIVITY_ID;
const allowedOriginsEnv = process.env.ALLOWED_ORIGINS || '*';
const CLUB_CHECK_INTERVAL_SECONDS = parseInt(process.env.CLUB_CHECK_INTERVAL_SECONDS || '300', 10);
@@ -169,9 +190,11 @@ app.get('/v1/activity/list', async (req: Request, res: Response) => {
return res.json({});
}
const allActivities = await Promise.all(
activityKeys.map(async k => getActivityData(k.substring(ACTIVITY_KEY_PREFIX.length)))
);
const allActivities: (ActivityData | null)[] = [];
for (const k of activityKeys) {
const activityData = await getActivityData(k.substring(ACTIVITY_KEY_PREFIX.length));
allActivities.push(activityData);
}
/* ---------- gather available filter values for validation ---------- */
const availableCategories = new Set<string>();
@@ -239,13 +262,13 @@ app.get('/v1/activity/category', async (_req: Request, res: Response) => {
logger.info('No activity keys found in Redis for categories.');
return res.json({});
}
// Fetch all activity data in parallel
const allActivityDataPromises = activityKeys.map(async (key) => {
// Fetch all activity data sequentially
const allActivities: (ActivityData | null)[] = [];
for (const key of activityKeys) {
const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
return getActivityData(activityId);
});
const allActivities = await Promise.all(allActivityDataPromises);
const activityData = await getActivityData(activityId);
allActivities.push(activityData);
}
allActivities.forEach((activityData: ActivityData | null) => {
if (activityData &&
@@ -279,13 +302,13 @@ app.get('/v1/activity/academicYear', async (_req: Request, res: Response) => {
logger.info('No activity keys found in Redis for academic years.');
return res.json({});
}
// 1. Fetch all activity data in parallel
const allActivities = await Promise.all(
activityKeys.map(async (key) => {
// 1. Fetch all activity data sequentially
const allActivities: (ActivityData | null)[] = [];
for (const key of activityKeys) {
const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
return getActivityData(activityId);
})
);
const activityData = await getActivityData(activityId);
allActivities.push(activityData);
}
// 2. Count activities per academic year
allActivities.forEach((activityData: ActivityData | null) => {
if (
@@ -397,13 +420,34 @@ async function performBackgroundTasks(): Promise<void> {
try {
await initializeClubCache();
await initializeOrUpdateStaffCache(true);
await cleanupOrphanedS3Images();
// NOTE: Removed immediate cleanupOrphanedS3Images() call.
// Cleanup will run during periodic updateStaleClubs() instead.
// Running cleanup immediately after initialization caused race condition
// where newly uploaded images were deleted before they could be referenced.
logger.info(`Setting up periodic club cache updates every ${CLUB_CHECK_INTERVAL_SECONDS} seconds.`);
setInterval(updateStaleClubs, CLUB_CHECK_INTERVAL_SECONDS * 1000);
setInterval(() => {
if (isUpdatingClubs) {
logger.warn('Previous club update still running, skipping this run');
return;
}
isUpdatingClubs = true;
updateStaleClubs().finally(() => {
isUpdatingClubs = false;
});
}, CLUB_CHECK_INTERVAL_SECONDS * 1000);
logger.info(`Setting up periodic staff cache updates every ${STAFF_CHECK_INTERVAL_SECONDS} seconds.`);
setInterval(() => initializeOrUpdateStaffCache(false), STAFF_CHECK_INTERVAL_SECONDS * 1000);
setInterval(() => {
if (isUpdatingStaff) {
logger.warn('Previous staff update still running, skipping this run');
return;
}
isUpdatingStaff = true;
initializeOrUpdateStaffCache(false).finally(() => {
isUpdatingStaff = false;
});
}, STAFF_CHECK_INTERVAL_SECONDS * 1000);
logger.info('Background initialization and periodic task setup complete.');
} catch (error) {


@@ -24,7 +24,6 @@
"crypto": "^1.0.1",
"dotenv": "^16.5.0",
"express": "^5.1.0",
"p-limit": "^6.2.0",
"pangu": "^4.0.7",
"sharp": "^0.34.1",
"uuid": "^11.1.0"


@@ -1,6 +1,5 @@
// services/cache-manager.ts
import { config } from 'dotenv';
import pLimit from 'p-limit';
import { fetchActivityData } from '../engage-api/get-activity';
import { structActivityData } from '../engage-api/struct-activity';
import { structStaffData } from '../engage-api/struct-staff';
@@ -12,9 +11,10 @@ import {
getAllActivityKeys,
ACTIVITY_KEY_PREFIX
} from './redis-service';
import { uploadImageFromBase64, listS3Objects, deleteS3Objects, constructS3Url } from './s3-service';
import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service';
import { extractBase64Image } from '../utils/image-processor';
import { logger } from '../utils/logger';
import { BatchProcessor } from '../utils/semaphore';
import type { ActivityData } from '../models/activity';
@@ -25,36 +25,78 @@ const USERNAME = process.env.API_USERNAME;
const PASSWORD = process.env.API_PASSWORD;
const MIN_ACTIVITY_ID_SCAN = parseInt(process.env.MIN_ACTIVITY_ID_SCAN || '0', 10);
const MAX_ACTIVITY_ID_SCAN = parseInt(process.env.MAX_ACTIVITY_ID_SCAN || '9999', 10);
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '10', 10);
const CLUB_UPDATE_INTERVAL_MINS = parseInt(process.env.CLUB_UPDATE_INTERVAL_MINS || '60', 10);
const STAFF_UPDATE_INTERVAL_MINS = parseInt(process.env.STAFF_UPDATE_INTERVAL_MINS || '60', 10);
const FIXED_STAFF_ACTIVITY_ID = process.env.FIXED_STAFF_ACTIVITY_ID;
const S3_IMAGE_PREFIX = (process.env.S3_PUBLIC_URL_PREFIX || 'files').replace(/\/$/, '');
// Limit concurrent API calls
const limit = pLimit(CONCURRENT_API_CALLS);
// Crawler concurrency configuration
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10);
const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10);
const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10);
const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10);
// Module-level counter for skipped activities (reset at start of each scan)
let skippedCount = 0;
/**
* Process and cache a single activity
* @param activityId - The activity ID to process
* @param forceUpdate - If true, update cache even on fetch failure (default: false)
* @returns The processed activity data
*/
async function processAndCacheActivity(activityId: string): Promise<ActivityData> {
async function processAndCacheActivity(activityId: string, forceUpdate: boolean = false): Promise<ActivityData> {
logger.debug(`Processing activity ID: ${activityId}`);
try {
if (!USERNAME || !PASSWORD) {
throw new Error('API username or password not configured');
}
const activityJson = await fetchActivityData(activityId, USERNAME, PASSWORD);
// Add timeout protection via AbortController - properly cancels orphaned fetches
logger.debug(`Fetching activity data for ID: ${activityId}`);
const controller = new AbortController();
const timeoutId = setTimeout(
() => controller.abort(),
CRAWLER_REQUEST_TIMEOUT_MS + 5000
);
let activityJson: any = null;
try {
activityJson = await fetchActivityData(
activityId,
USERNAME,
PASSWORD,
false,
controller.signal
);
} finally {
clearTimeout(timeoutId);
}
if (controller.signal.aborted) {
logger.warn(`Request for activity ${activityId} timed out after ${CRAWLER_REQUEST_TIMEOUT_MS + 5000}ms. Cancelling orphaned fetch.`);
// Preserve existing cache on timeout
const existingData = await getActivityData(activityId);
return existingData || { lastCheck: new Date().toISOString(), error: `Timeout after ${CRAWLER_REQUEST_TIMEOUT_MS + 5000}ms` };
}
let structuredActivity: ActivityData;
if (!activityJson) {
logger.info(`No data found for activity ID ${activityId} from engage API. Caching as empty.`);
// CRITICAL: Only cache empty data if forceUpdate is true
// This prevents 5xx errors from overwriting valid local data
if (forceUpdate) {
logger.info(`No data found for activity ID ${activityId} from engage API. Force updating cache.`);
structuredActivity = {
lastCheck: new Date().toISOString(),
source: 'api-fetch-empty'
};
await setActivityData(activityId, structuredActivity);
return structuredActivity;
} else {
logger.warn(`No data for activity ${activityId}. Preserving existing cache - NOT updating.`);
const existingData = await getActivityData(activityId);
return existingData || { lastCheck: new Date().toISOString(), source: 'cache-preserved' };
}
} else {
structuredActivity = await structActivityData(activityJson);
if (structuredActivity && structuredActivity.photo &&
@@ -82,17 +124,24 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
return structuredActivity;
} catch (error) {
logger.error(`Error processing activity ID ${activityId}:`, error);
// CRITICAL: On error, preserve existing cache instead of overwriting with error data
if (forceUpdate) {
const errorData: ActivityData = {
lastCheck: new Date().toISOString(),
error: "Failed to fetch or process"
};
await setActivityData(activityId, errorData);
return errorData;
} else {
logger.warn(`Error fetching activity ${activityId}. Preserving existing cache.`);
const existingData = await getActivityData(activityId);
return existingData || { lastCheck: new Date().toISOString(), error: (error as Error).message };
}
}
}
/**
* Process a single activity for initialization (extracted for Promise.allSettled)
* Process a single activity for initialization
* @param activityId - The activity ID to process
*/
async function processSingleActivity(activityId: string): Promise<void> {
@@ -105,85 +154,118 @@ async function processSingleActivity(activityId: string): Promise<void> {
logger.debug(`Initializing cache for activity ID: ${activityId}`);
await processAndCacheActivity(activityId);
} else {
skippedCount++;
}
}
/**
* Initialize the club cache by scanning through all activity IDs
* Processed concurrently with controlled parallelism
*/
export async function initializeClubCache(): Promise<void> {
logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`);
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1;
let successCount = 0;
let errorCount = 0;
skippedCount = 0; // Reset for this run
// Generate array of activity IDs
const activityIds = Array.from(
{ length: totalIds },
(_, i) => String(MIN_ACTIVITY_ID_SCAN + i)
);
// Create batch processor with concurrency control
const processor = new BatchProcessor(
async (activityId: string) => {
await processSingleActivity(activityId);
return activityId;
},
CONCURRENT_API_CALLS,
{
onError: (error, activityId) => {
errorCount++;
logger.error(`Error processing activity ID ${activityId}:`, error);
},
onProgress: (completed, total) => {
if (completed % 100 === 0 || completed === total) {
const mem = process.memoryUsage();
logger.info(`Progress: ${completed}/${total} (${Math.round(completed/total*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB | Concurrent: ${CONCURRENT_API_CALLS}`);
}
}
}
);
// Process all activities concurrently
const results = await processor.process(activityIds);
successCount = results.length;
logger.info(`Initial club cache population finished.`);
logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
}
/**
* Update stale clubs in the cache
* Processed concurrently with controlled parallelism
*/
export async function updateStaleClubs(): Promise<void> {
logger.info('Starting stale club check...');
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
const now = Date.now();
const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000;
const activityKeys = await getAllActivityKeys();
// Identify stale activities
const staleActivityIds: string[] = [];
for (const key of activityKeys) {
const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
const cachedData = await getActivityData(activityId);
const needsUpdate = !cachedData ||
Object.keys(cachedData).length === 0 ||
(!cachedData.lastCheck && !cachedData.error) ||
(cachedData.lastCheck && (now - new Date(cachedData.lastCheck).getTime()) > updateIntervalMs) ||
cachedData.error;
if (needsUpdate) {
staleActivityIds.push(activityId);
}
}
await cleanupOrphanedS3Images();
if (staleActivityIds.length === 0) {
logger.info('No stale activities found. Skipping update.');
logger.info('Stale club check finished.');
return;
}
logger.info(`Found ${staleActivityIds.length} stale activities to update.`);
// Create batch processor for concurrent updates
const processor = new BatchProcessor(
async (activityId: string) => {
logger.debug(`Updating stale activity ${activityId}`);
await processAndCacheActivity(activityId);
return activityId;
},
CONCURRENT_API_CALLS,
{
onError: (error, activityId) => {
logger.error(`Error updating stale activity ${activityId}:`, error);
},
onProgress: (completed, total) => {
if (completed % 10 === 0 || completed === total) {
logger.info(`Update progress: ${completed}/${total} (${Math.round(completed/total*100)}%)`);
}
}
}
);
// Process stale activities concurrently
await processor.process(staleActivityIds);
logger.info('Stale club check finished.');
}
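The `needsUpdate` condition above reads naturally as a pure predicate; a sketch with illustrative names, assuming the same cached-data shape:

```typescript
// Pure form of the staleness check used above (Cached/needsUpdate are illustrative names).
type Cached = { lastCheck?: string; error?: string } | null;

function needsUpdate(cached: Cached, now: number, intervalMs: number): boolean {
  if (!cached || Object.keys(cached).length === 0) return true;    // missing or empty entry
  if (cached.error) return true;                                   // previous fetch failed
  if (!cached.lastCheck) return true;                              // never successfully checked
  return (now - new Date(cached.lastCheck).getTime()) > intervalMs; // older than the interval
}
```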
@@ -232,59 +314,3 @@ export async function initializeOrUpdateStaffCache(forceUpdate: boolean = false)
logger.error('Error initializing or updating staff cache:', error);
}
}
/**
* Clean up orphaned S3 images
*/
export async function cleanupOrphanedS3Images(): Promise<void> {
logger.info('Starting S3 orphan image cleanup...');
const s3ObjectListPrefix = S3_IMAGE_PREFIX ? `${S3_IMAGE_PREFIX}/` : '';
try {
const referencedS3Urls = new Set<string>();
const allActivityRedisKeys = await getAllActivityKeys();
const S3_ENDPOINT = process.env.S3_ENDPOINT;
for (const redisKey of allActivityRedisKeys) {
const activityId = redisKey.substring(ACTIVITY_KEY_PREFIX.length);
const activityData = await getActivityData(activityId);
if (activityData &&
typeof activityData.photo === 'string' &&
activityData.photo.startsWith('http') &&
S3_ENDPOINT &&
activityData.photo.startsWith(S3_ENDPOINT)) {
referencedS3Urls.add(activityData.photo);
}
}
logger.info(`Found ${referencedS3Urls.size} unique S3 URLs referenced in Redis.`);
const s3ObjectKeys = await listS3Objects(s3ObjectListPrefix);
if (!s3ObjectKeys || s3ObjectKeys.length === 0) {
logger.info(`No images found in S3 under prefix "${s3ObjectListPrefix}". Nothing to clean up.`);
return;
}
logger.debug(`Found ${s3ObjectKeys.length} objects in S3 under prefix "${s3ObjectListPrefix}".`);
const orphanedObjectKeys: string[] = [];
for (const objectKey of s3ObjectKeys) {
const s3Url = constructS3Url(objectKey);
if (s3Url && !referencedS3Urls.has(s3Url)) {
orphanedObjectKeys.push(objectKey);
}
}
if (orphanedObjectKeys.length > 0) {
logger.info(`Found ${orphanedObjectKeys.length} orphaned S3 objects to delete. Submitting deletion...`);
await deleteS3Objects(orphanedObjectKeys);
} else {
logger.info('No orphaned S3 images found after comparison.');
}
logger.info('S3 orphan image cleanup finished.');
} catch (error) {
logger.error('Error during S3 orphan image cleanup:', error);
}
}
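The orphan scan above reduces to a set difference between the object keys actually in S3 and the URLs still referenced from Redis; a sketch with a stand-in `constructUrl` (names are illustrative):

```typescript
// Set-difference core of the orphan scan above.
// constructUrl stands in for constructS3Url; a null result skips the key.
function findOrphans(
  s3Keys: string[],
  referenced: Set<string>,
  constructUrl: (key: string) => string | null
): string[] {
  return s3Keys.filter(key => {
    const url = constructUrl(key);
    return url !== null && !referenced.has(url);
  });
}
```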


@@ -1,32 +0,0 @@
[
{
"name": "ASP.NET_SessionId",
"value": "fjurweoenv1wdcvhcyvhreqh",
"domain": "engage.nkcswx.cn",
"path": "/",
"expires": -1,
"httpOnly": true,
"secure": true,
"sameSite": "None"
},
{
"name": "usernameCookie",
"value": "1hDdyhfXMJP9S2CpwOgJZDKOXrEEXLB%23EXOogV55NRskzh6XKDU2rym1YrGfnoklj",
"domain": "engage.nkcswx.cn",
"path": "/",
"expires": 1778095681.649044,
"httpOnly": true,
"secure": false,
"sameSite": "Lax"
},
{
"name": ".ASPXFORMSAUTH",
"value": "8E4B9D03FE08C5A2C6EB323B35110D6290CCF3408B68940F9783D1F9D37FA326A38457C956652C8A55D68218AA681485AB8C2533E4E7C85B2BF3413847009C18918281566DF940EA26761F8C0424B93AE79F519DDD0585DF19907E1F9231F5C020039960F5BFC53B7D429B1F3F83B5655F83D796",
"domain": "engage.nkcswx.cn",
"path": "/",
"expires": -1,
"httpOnly": true,
"secure": false,
"sameSite": "Lax"
}
]


@@ -11,9 +11,38 @@ let _inMemoryCookies: Cookie[] | null = null;
// Login lock to prevent concurrent login attempts
let _loginLock: Promise<Cookie[]> | null = null;
// Proxy configuration
const USE_PROXY = process.env.USE_PROXY === 'true';
const PROXY_SERVER = process.env.ALL_PROXY || process.env.HTTP_PROXY || `http://host.docker.internal:9091`;
// Cookie backup: preserved before clearCookieCache, restored on re-login failure
let _cookieBackup: Cookie[] | null = null;
// Auth failure throttle: debounce consecutive re-login triggers from 500 errors
// Prevents thundering herd when server is slow and returns many 500s
let _authFailureCooldownUntil = 0;
const AUTH_FAILURE_COOLDOWN_MS = 15000; // 15s cooldown between re-login cycles
/**
* Check whether a re-login attempt is currently allowed, or throttled by the auth cooldown window.
* Returns true if auth is allowed (outside cooldown), false if throttled.
*/
export function tryAcquireAuthLock(): boolean {
const now = Date.now();
if (now < _authFailureCooldownUntil) {
const remaining = _authFailureCooldownUntil - now;
logger.warn(
`Re-login throttled: ${Math.round(remaining / 1000)}s cooldown remaining. ` +
`Existing cookies are likely still valid — server 500 is a temporary slowdown.`
);
return false;
}
return true;
}
/**
* Called after a re-login cycle to arm the cooldown window, throttling follow-up re-login attempts.
*/
export function releaseAuthCooldown(): void {
_authFailureCooldownUntil = Date.now() + AUTH_FAILURE_COOLDOWN_MS;
logger.info(`Auth cooldown set: ${AUTH_FAILURE_COOLDOWN_MS}ms to prevent thundering herd re-logins.`);
}
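The cooldown pair above is a generic time-window throttle; a sketch with an injectable clock so it can be tested deterministically (`Cooldown` is an illustrative name, not part of the codebase):

```typescript
// Generic time-window throttle mirroring the auth cooldown above.
// The clock parameter is injectable for testing; defaults to Date.now.
class Cooldown {
  private until = 0;
  constructor(private windowMs: number, private clock: () => number = Date.now) {}

  // Allowed only outside the cooldown window.
  tryAcquire(): boolean {
    return this.clock() >= this.until;
  }

  // Block subsequent callers for windowMs.
  arm(): void {
    this.until = this.clock() + this.windowMs;
  }
}
```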
/**
* Ensure only one login process runs at a time
@@ -48,15 +77,6 @@ export async function loginWithPlaywright(username: string, password: string): P
args: ['--no-sandbox', '--disable-setuid-sandbox']
};
// Configure proxy if enabled
if (USE_PROXY) {
logger.info(`Using proxy: ${PROXY_SERVER}`);
browserLaunchOptions.proxy = {
server: PROXY_SERVER,
bypass: 'localhost,127.0.0.1,::1'
};
}
const browser = await chromium.launch(browserLaunchOptions);
try {
@@ -191,8 +211,40 @@ export async function saveCookiesToCache(cookies: Cookie[]): Promise<void> {
}
}
/**
* Backup current cookies before clearing. Restored if re-login fails.
*/
export function backupCookies(): Cookie[] | null {
if (_inMemoryCookies) {
_cookieBackup = [..._inMemoryCookies];
logger.info('Cookies backed up before clear.');
}
return _cookieBackup;
}
/**
* Restore cookies from backup after failed re-login.
*/
export async function restoreCookieBackup(): Promise<boolean> {
if (_cookieBackup) {
_inMemoryCookies = _cookieBackup;
try {
await fs.promises.writeFile(COOKIE_FILE_PATH, JSON.stringify(_cookieBackup, null, 2), 'utf-8');
logger.info('Cookies restored from backup successfully.');
_cookieBackup = null;
return true;
} catch (error: any) {
logger.error('Failed to restore cookies from backup:', error.message);
return false;
}
}
logger.warn('No cookie backup available for restore.');
return false;
}
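`backupCookies`/`restoreCookieBackup` above maintain a simple invariant: a snapshot is taken before the destructive clear, and a restore consumes the snapshot exactly once. An in-memory sketch without the file I/O (`CookieStore` is illustrative):

```typescript
// In-memory sketch of the backup/restore invariant above (no file persistence).
type Cookie = { name: string; value: string };

class CookieStore {
  private cookies: Cookie[] | null = null;
  private backup: Cookie[] | null = null;

  set(cookies: Cookie[]): void { this.cookies = cookies; }
  get(): Cookie[] | null { return this.cookies; }

  // Snapshot before destroying, so a failed re-login can roll back.
  backupAndClear(): void {
    if (this.cookies) this.backup = [...this.cookies];
    this.cookies = null;
  }

  // Restore consumes the backup; returns false if none exists.
  restore(): boolean {
    if (!this.backup) return false;
    this.cookies = this.backup;
    this.backup = null;
    return true;
  }
}
```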
/**
* Clear cookie cache
* Prefer backupAndClearCookieCache() instead to preserve old cookies.
*/
export async function clearCookieCache(): Promise<void> {
_inMemoryCookies = null;


@@ -149,51 +149,6 @@ export async function listS3Objects(prefix: string): Promise<string[]> {
}
}
/**
* Deletes multiple objects from S3.
* @param objectKeysArray - Array of object keys to delete
* @returns True if successful or partially successful, false on major error
*/
export async function deleteS3Objects(objectKeysArray: string[]): Promise<boolean> {
if (!s3Client) {
logger.warn('S3 client not configured. Cannot delete objects.');
return false;
}
if (!objectKeysArray || objectKeysArray.length === 0) {
logger.info('No objects to delete from S3.');
return true;
}
try {
// With Bun's S3Client, we need to delete objects one by one
// Process in batches of 100 for better performance
const BATCH_SIZE = 100;
let successCount = 0;
let errorCount = 0;
for (let i = 0; i < objectKeysArray.length; i += BATCH_SIZE) {
const batch = objectKeysArray.slice(i, i + BATCH_SIZE);
// Process batch in parallel
const results = await Promise.allSettled(
batch.map(key => s3Client!.delete(key))
);
// Count successes and failures
for (const result of results) {
if (result.status === 'fulfilled') {
successCount++;
} else {
errorCount++;
logger.error(`Failed to delete object: ${result.reason}`);
}
}
}
logger.info(`Deleted ${successCount} objects from S3. Failed: ${errorCount}`);
return errorCount === 0; // True if all succeeded
} catch (error) {
logger.error('S3 DeleteObjects Error:', error);
return false;
}
}
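The batch-of-100 deletion loop above generalizes to any per-key async operation; a sketch using the same `Promise.allSettled` accounting (`deleteInBatches` is an illustrative name):

```typescript
// Generalized form of the batched delete above: run `op` over `keys`
// in chunks of `batchSize`, tolerating individual failures.
async function deleteInBatches(
  keys: string[],
  op: (key: string) => Promise<void>,
  batchSize = 100
): Promise<{ success: number; failed: number }> {
  let success = 0, failed = 0;
  for (let i = 0; i < keys.length; i += batchSize) {
    const batch = keys.slice(i, i + batchSize);
    // Each batch runs in parallel; allSettled keeps failures from aborting the batch.
    const results = await Promise.allSettled(batch.map(op));
    for (const r of results) r.status === "fulfilled" ? success++ : failed++;
  }
  return { success, failed };
}
```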
/**
* Constructs the public S3 URL for an object key.
* Uses S3_PUBLIC_URL if set (reverse proxy scenario), otherwise uses S3_ENDPOINT.

test/test-concurrency.ts Normal file

@@ -0,0 +1,141 @@
// test/test-concurrency.ts
/**
* Test script for concurrency features
* Run with: bun run test/test-concurrency.ts
*/
import { Semaphore, executeWithConcurrency, BatchProcessor } from '../utils/semaphore';
// Simulate API call
function simulateApiCall(id: number, delay: number = 100): Promise<{ id: number; result: string }> {
return new Promise((resolve) => {
setTimeout(() => {
resolve({ id, result: `Result for ${id}` });
}, delay);
});
}
async function testSemaphore(): Promise<void> {
console.log('\n=== Test 1: Basic Semaphore ===');
const semaphore = new Semaphore(3);
const start = Date.now();
const promises = [];
for (let i = 1; i <= 10; i++) {
const id = i;
promises.push(
(async () => {
await semaphore.acquire();
console.log(`[${id}] Acquired permit (available: ${semaphore.getAvailablePermits()})`);
await simulateApiCall(id, 200);
console.log(`[${id}] Releasing permit`);
semaphore.release();
})()
);
}
await Promise.all(promises);
const duration = Date.now() - start;
console.log(`\n✓ Completed 10 tasks with max 3 concurrent in ${duration}ms`);
console.log(` (Sequential would take ~2000ms, parallel should be ~700-800ms)`);
}
async function testExecuteWithConcurrency(): Promise<void> {
console.log('\n=== Test 2: executeWithConcurrency ===');
const tasks = Array.from({ length: 10 }, (_, i) => () => simulateApiCall(i + 1, 100));
const start = Date.now();
const results = await executeWithConcurrency(tasks, 5);
const duration = Date.now() - start;
console.log(`✓ Completed ${results.length} tasks with max 5 concurrent in ${duration}ms`);
console.log(` Results: ${results.map(r => r.id).join(', ')}`);
}
async function testBatchProcessor(): Promise<void> {
console.log('\n=== Test 3: BatchProcessor ===');
const items = Array.from({ length: 20 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` }));
let processedCount = 0;
const processor = new BatchProcessor(
async (item: { id: number; name: string }) => {
await simulateApiCall(item.id, 50);
processedCount++;
return { ...item, processed: true };
},
4, // concurrency
{
onProgress: (completed, total) => {
if (completed % 5 === 0 || completed === total) {
console.log(` Progress: ${completed}/${total} (${Math.round(completed / total * 100)}%)`);
}
},
onError: (error, item) => {
console.error(` Error processing item ${item.id}:`, error.message);
}
}
);
const start = Date.now();
const results = await processor.process(items);
const duration = Date.now() - start;
console.log(`✓ Processed ${results.length} items with max 4 concurrent in ${duration}ms`);
console.log(`  Expected ~250ms (20 items / 4 concurrent * 50ms each)`);
}
async function testErrorHandling(): Promise<void> {
console.log('\n=== Test 4: Error Handling ===');
const items = [1, 2, 3, 4, 5];
let errorCount = 0;
const processor = new BatchProcessor(
async (id: number) => {
if (id % 2 === 0) {
throw new Error(`Simulated error for ${id}`);
}
return { id, success: true };
},
3,
{
onError: (error, item) => {
errorCount++;
console.log(` Caught error for item ${item}: ${error.message}`);
}
}
);
const results = await processor.process(items);
console.log(`✓ Completed: ${results.length} success, ${errorCount} errors (errors handled gracefully)`);
}
async function main(): Promise<void> {
console.log('╔═══════════════════════════════════════════════════╗');
console.log('║ Concurrency Module Test Suite ║');
console.log('╚═══════════════════════════════════════════════════╝');
try {
await testSemaphore();
await testExecuteWithConcurrency();
await testBatchProcessor();
await testErrorHandling();
console.log('\n╔═══════════════════════════════════════════════════╗');
console.log('║ All tests passed! ✓ ║');
console.log('╚═══════════════════════════════════════════════════╝');
console.log('\n📝 Configuration:');
console.log(' - Set CONCURRENT_API_CALLS in .env to control parallelism');
console.log(' - Current default: 8 concurrent requests');
console.log(' - Example: CONCURRENT_API_CALLS=16 for faster crawling');
console.log('');
} catch (error) {
console.error('\n❌ Test failed:', error);
process.exit(1);
}
}
main();

utils/semaphore.ts Normal file

@@ -0,0 +1,220 @@
// utils/semaphore.ts
/**
* Semaphore implementation for controlling concurrent operations
* Based on patterns from civitai/civitai and p-queue
*/
export class Semaphore {
private capacity: number;
private permits: number;
private queue: Array<() => void> = [];
constructor(capacity: number) {
if (capacity < 1) {
throw new Error('Semaphore capacity must be at least 1');
}
this.capacity = capacity;
this.permits = capacity;
}
/**
* Acquire a permit. If none available, waits until one is released.
*/
async acquire(): Promise<void> {
return new Promise<void>((resolve) => {
if (this.permits > 0) {
this.permits--;
resolve();
} else {
// Queue the release callback
this.queue.push(() => {
resolve();
});
}
});
}
/**
* Release a permit and wake up a waiting task if any.
*/
release(): void {
if (this.queue.length > 0) {
const next = this.queue.shift();
if (next) {
next();
}
} else {
this.permits++;
}
}
/**
* Get current available permits.
*/
getAvailablePermits(): number {
return this.permits;
}
/**
* Get total capacity.
*/
getCapacity(): number {
return this.capacity;
}
/**
* Get number of waiting tasks.
*/
getWaitingCount(): number {
return this.queue.length;
}
}
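One way to sanity-check the class above is to measure peak concurrency under load. The sketch below restates a minimal semaphore inline so it runs standalone (`peakConcurrency` is an illustrative harness, not part of the module):

```typescript
// Minimal re-statement of the Semaphore above, plus a harness that
// records the highest number of tasks running at once.
class Semaphore {
  private permits: number;
  private queue: Array<() => void> = [];
  constructor(capacity: number) { this.permits = capacity; }
  acquire(): Promise<void> {
    return new Promise(resolve => {
      if (this.permits > 0) { this.permits--; resolve(); }
      else this.queue.push(resolve); // wait for a release
    });
  }
  release(): void {
    const next = this.queue.shift();
    if (next) next(); else this.permits++;
  }
}

async function peakConcurrency(taskCount: number, limit: number): Promise<number> {
  const sem = new Semaphore(limit);
  let running = 0, peak = 0;
  await Promise.all(Array.from({ length: taskCount }, async () => {
    await sem.acquire();
    running++; peak = Math.max(peak, running);
    await new Promise(r => setTimeout(r, 10)); // simulated work
    running--;
    sem.release();
  }));
  return peak;
}
```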
/**
* Execute async tasks with concurrency limit
* @param tasks Array of async task functions
* @param concurrency Maximum concurrent tasks
* @returns Promise that resolves with all results when complete
*/
export async function executeWithConcurrency<T>(
tasks: Array<() => Promise<T>>,
concurrency: number
): Promise<T[]> {
const semaphore = new Semaphore(concurrency);
const results: T[] = new Array(tasks.length);
const promises = tasks.map(async (task, index) => {
await semaphore.acquire();
try {
results[index] = await task();
return results[index];
} finally {
semaphore.release();
}
});
return Promise.all(promises);
}
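`executeWithConcurrency` above writes each result at its task's index, so output order matches input order even when tasks finish out of order. That property can be shown with a compact worker-pool equivalent (`runLimited` is an illustrative stand-in, not the function above):

```typescript
// Worker-pool sketch of the order-preservation property of executeWithConcurrency:
// results land at their task's index regardless of completion order.
async function runLimited<T>(tasks: Array<() => Promise<T>>, limit: number): Promise<T[]> {
  const results: T[] = new Array(tasks.length);
  let next = 0;
  const worker = async () => {
    while (next < tasks.length) {
      const i = next++;          // claim the next index
      results[i] = await tasks[i]();
    }
  };
  await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, worker));
  return results;
}
```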
/**
* Execute async tasks with concurrency limit and progress callback
* @param tasks Array of async task functions
* @param concurrency Maximum concurrent tasks
* @param onProgress Callback with (completed, total, result) after each task
* @returns Promise that resolves with all results when complete
*/
export async function executeWithConcurrencyAndProgress<T>(
tasks: Array<() => Promise<T>>,
concurrency: number,
onProgress?: (completed: number, total: number, result: T, error?: Error) => void
): Promise<T[]> {
const semaphore = new Semaphore(concurrency);
const results: T[] = new Array(tasks.length);
let completed = 0;
const total = tasks.length;
const promises = tasks.map(async (task, index) => {
await semaphore.acquire();
try {
results[index] = await task();
completed++;
onProgress?.(completed, total, results[index]!);
return results[index];
} catch (error) {
completed++;
onProgress?.(completed, total, undefined as unknown as T, error as Error);
throw error;
} finally {
semaphore.release();
}
});
return Promise.all(promises);
}
/**
* Batch processor with concurrency control
* Useful for processing large arrays in chunks with controlled concurrency
*/
export class BatchProcessor<T, R> {
private semaphore: Semaphore;
private processor: (item: T, index: number) => Promise<R>;
private onError?: (error: Error, item: T, index: number) => void;
private onProgress?: (completed: number, total: number) => void;
constructor(
processor: (item: T, index: number) => Promise<R>,
concurrency: number,
options?: {
onError?: (error: Error, item: T, index: number) => void;
onProgress?: (completed: number, total: number) => void;
}
) {
this.processor = processor;
this.semaphore = new Semaphore(concurrency);
this.onError = options?.onError;
this.onProgress = options?.onProgress;
}
/**
* Process an array of items with concurrency control
* Returns only successful results; errors are reported through the onError callback
*/
async process(items: T[]): Promise<Awaited<R>[]> {
let completed = 0;
const total = items.length;
const promises = items.map(async (item, index) => {
await this.semaphore.acquire();
try {
const result = await this.processor(item, index);
completed++;
this.onProgress?.(completed, total);
return result;
} catch (error) {
completed++;
this.onProgress?.(completed, total);
this.onError?.(error as Error, item, index);
return undefined;
} finally {
this.semaphore.release();
}
});
const allResults = await Promise.all(promises);
return allResults.filter((r): r is Awaited<R> => r !== undefined);
}
/**
* Process an array and return both results and errors
*/
async processWithErrors(items: T[]): Promise<{
results: R[];
errors: Array<{ error: Error; item: T; index: number }>;
}> {
const results: R[] = [];
const errors: Array<{ error: Error; item: T; index: number }> = [];
let completed = 0;
const total = items.length;
const promises = items.map(async (item, index) => {
await this.semaphore.acquire();
try {
const result = await this.processor(item, index);
results.push(result);
completed++;
this.onProgress?.(completed, total);
} catch (error) {
completed++;
this.onProgress?.(completed, total);
errors.push({ error: error as Error, item, index });
} finally {
this.semaphore.release();
}
});
await Promise.all(promises);
return { results, errors };
}
}
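`processWithErrors` above partitions outcomes into `results` and `errors` rather than throwing; that partition distilled to a standalone helper without the semaphore (`partitionSettled` is an illustrative name):

```typescript
// The results/errors partition performed by processWithErrors above,
// without the concurrency limit: every item runs, nothing throws out.
async function partitionSettled<T, R>(
  items: T[],
  fn: (item: T, index: number) => Promise<R>
): Promise<{ results: R[]; errors: Array<{ error: Error; item: T; index: number }> }> {
  const results: R[] = [];
  const errors: Array<{ error: Error; item: T; index: number }> = [];
  await Promise.all(items.map(async (item, index) => {
    try {
      results.push(await fn(item, index));
    } catch (error) {
      errors.push({ error: error as Error, item, index });
    }
  }));
  return { results, errors };
}
```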