Compare commits

19 commits · 92b12a6a85...master
| Author | SHA1 | Date |
|---|---|---|
|  | f21a400c82 |  |
|  | 73e953f579 |  |
|  | 71116f9f6e |  |
|  | 13eccdd3cc |  |
|  | c447dc51ee |  |
|  | 5fb60b069f |  |
|  | fb68c1ad5d |  |
|  | 78c050a6fa |  |
|  | 1e234624fb |  |
|  | 6c58eacc8f |  |
|  | bbbd59be94 |  |
|  | ea9e9ec121 |  |
|  | 0a133159e8 |  |
|  | fc98dbbbae |  |
|  | af493446ac |  |
|  | 821df1c51f |  |
|  | 573a9b3f4c |  |
|  | b426861b56 |  |
|  | 6fa6d83e91 |  |
bun.lock (5 changed lines)
```diff
@@ -11,7 +11,6 @@
     "crypto": "^1.0.1",
     "dotenv": "^16.5.0",
     "express": "^5.1.0",
-    "p-limit": "^6.2.0",
     "pangu": "^4.0.7",
     "sharp": "^0.34.1",
     "uuid": "^11.1.0",
@@ -202,8 +201,6 @@
     "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
-    "p-limit": ["p-limit@6.2.0", "", { "dependencies": { "yocto-queue": "^1.1.1" } }, "sha512-kuUqqHNUqoIWp/c467RI4X6mmyuojY5jGutNU0wVTmEOOfcuwLqyMVoAi9MKi2Ak+5i9+nhmrK4ufZE8069kHA=="],
     "pangu": ["pangu@4.0.7", "", { "bin": { "pangu": "./dist/node/cli.js" } }, "sha512-weZKJIwwy5gjt4STGVUH9bix3BGk7wZ2ahtIypwe3e/mllsrIZIvtfLx1dPX56GcpZFOCFKmeqI1qVuB9enRzA=="],
     "parseurl": ["parseurl@1.3.3", "", {}, "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="],
@@ -272,8 +269,6 @@
     "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="],
-    "yocto-queue": ["yocto-queue@1.2.1", "", {}, "sha512-AyeEbWOu/TAXdxlV9wmGcR0+yh2j3vYPGOECcIj2S7MkrLyC7ne+oye2BKTItt0ii2PHk4cDy+95+LshzbXnGg=="],
     "form-data/mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="],
     "form-data/mime-types/mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="],
```
docker-compose.yml

```diff
@@ -1,18 +1,4 @@
 services:
-  warp-proxy:
-    image: ghcr.io/mon-ius/docker-warp-socks:v5
-    container_name: dsas-cca-warp-proxy
-    restart: unless-stopped
-    ports:
-      - "9091:9091"
-    networks:
-      - cca_network
-    logging:
-      driver: "json-file"
-      options:
-        max-size: "15m"
-        max-file: "3"
-
   app:
     build:
       context: .
@@ -28,23 +14,10 @@ services:
     environment:
       - NODE_ENV=production
       - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
-      # Proxy configuration (only active when USE_PROXY=true)
-      - USE_PROXY=${USE_PROXY:-false}
-      - HTTP_PROXY=${HTTP_PROXY:-}
-      - HTTPS_PROXY=${HTTPS_PROXY:-}
-      - ALL_PROXY=${ALL_PROXY:-}
     restart: unless-stopped
     depends_on:
       redis:
         condition: service_healthy
-      warp-proxy:
-        condition: service_started
     volumes:
       - ./services/cookies.json:/usr/src/app/services/cookies.json
-    networks:
-      - cca_network
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
     logging:
       driver: "json-file"
       options:
@@ -54,13 +27,10 @@ services:
   redis:
     image: "redis:8.0-alpine"
     container_name: dsas-cca-redis
-    # Add memory limits to prevent OOM
-    command: redis-server --requirepass "dsas-cca" --maxmemory 512mb --maxmemory-policy allkeys-lru
+    command: redis-server --requirepass "dsas-cca"
     volumes:
       - ./redis_data:/data
    restart: unless-stopped
-    networks:
-      - cca_network
     healthcheck:
       test: ["CMD", "redis-cli", "-a", "dsas-cca", "ping"]
       interval: 10s
@@ -71,7 +41,3 @@ services:
       options:
         max-size: "15m"
         max-file: "3"
-
-networks:
-  cca_network:
-    driver: bridge
```
engage-api/get-activity.ts

```diff
@@ -1,13 +1,15 @@
 // engage-api/get-activity.ts
-import axios from 'axios';
+import axios, { type AxiosRequestConfig } from 'axios';
 import { logger } from '../utils/logger';
 import {
   loginWithPlaywright,
   ensureSingleLogin,
   loadCachedCookies,
   saveCookiesToCache,
   clearCookieCache,
-  getCachedCookieString
+  getCachedCookieString,
+  backupCookies,
+  restoreCookieBackup,
+  tryAcquireAuthLock,
+  releaseAuthCooldown
 } from '../services/playwright-auth';

 // Define interfaces for our data structures
@@ -28,68 +30,6 @@ class AuthenticationError extends Error {
   }
 }

-/**
- * Test cookie validity by calling API
- */
-async function testCookieValidityWithApi(cookieString: string): Promise<boolean> {
-  if (!cookieString) return false;
-  logger.debug('Testing cookie validity via API...');
-
-  const MAX_RETRIES = 3;
-  let attempt = 0;
-
-  while (attempt < MAX_RETRIES) {
-    try {
-      attempt++;
-      const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
-      const headers = {
-        'Content-Type': 'application/json; charset=UTF-8',
-        'Cookie': cookieString,
-        'User-Agent': 'Mozilla/5.0 (Bun DSAS-CCA get-activity Module)',
-      };
-      const payload = {
-        "activityID": "3350"
-      };
-
-      logger.debug(`Attempt ${attempt}/${MAX_RETRIES}`);
-      const response = await axios.post(url, payload, {
-        headers,
-        timeout: 20000
-      });
-
-      // Check for 4xx errors (auth failures)
-      if (response.status >= 400 && response.status < 500) {
-        logger.warn(`Cookie test returned ${response.status}, likely invalid`);
-        return false;
-      }
-
-      logger.debug('Cookie test successful (API responded 2xx). Cookie is valid.');
-      return true;
-    } catch (error: any) {
-      logger.warn(`Cookie validity test failed (attempt ${attempt}/${MAX_RETRIES}).`);
-      if (error.response) {
-        // 4xx = auth failure (immediate fail)
-        if (error.response.status >= 400 && error.response.status < 500) {
-          logger.warn(`Cookie test API response status: ${error.response.status} (auth error)`);
-          return false;
-        }
-        // 5xx = server error (retry with delay)
-        logger.warn(`Cookie test API response status: ${error.response.status} (server error, retrying...)`);
-      } else {
-        // No response (000 status, network error, timeout)
-        logger.warn(`Network/timeout error: ${error.message} (retrying...)`);
-      }
-
-      if (attempt < MAX_RETRIES) {
-        await new Promise(resolve => setTimeout(resolve, 1000 * (attempt + 1)));
-      }
-    }
-  }
-
-  logger.warn('Max retries reached. Cookie is likely invalid or expired.');
-  return false;
-}
-
 /**
  * Get complete cookies using Playwright with single login lock
  */
@@ -108,12 +48,14 @@ async function getCompleteCookies(userName: string, userPwd: string): Promise<st
 /**
  * Get activity details from API
+ * Only returns data on HTTP 200. Returns null on any error (5xx, timeout, etc.)
  */
 async function getActivityDetailsRaw(
   activityId: string,
   cookies: string,
   maxRetries: number = 3,
-  timeoutMilliseconds: number = 20000
+  timeoutMilliseconds: number = 10000,
+  signal?: AbortSignal
 ): Promise<string | null> {
   const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
   const headers = {
@@ -127,12 +69,41 @@ async function getActivityDetailsRaw(
   };

   for (let attempt = 0; attempt < maxRetries; attempt++) {
+    if (signal?.aborted) {
+      logger.debug(`Activity ${activityId} aborted before attempt ${attempt + 1}`);
+      return null;
+    }
     try {
       logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Sending POST request to ${url}`);
       const response = await axios.post(url, payload, {
         headers,
         timeout: timeoutMilliseconds,
-        responseType: 'text'
+        responseType: 'text',
+        signal,
+        maxRedirects: 5
       });

+      // CRITICAL: Only accept HTTP 200. Reject all other status codes including 5xx
+      if (response.status !== 200) {
+        logger.error(`Non-200 status ${response.status} for activity ${activityId}. NOT updating cache to preserve local data.`);
+
+        // IMPORTANT: Only 500 is cookie expiration. Other 5xx (502/503/504) are real server outages.
+        // The backend returns 500 when cookie is expired but session not yet invalidated.
+        // It takes several hours before it returns 401/403.
+        // 502/503/504 are real server errors (bad gateway, service unavailable, gateway timeout)
+        if (response.status === 500) {
+          logger.warn(`Server error 500 - this is cookie expiration. Throwing AuthenticationError to trigger immediate re-login.`);
+          throw new AuthenticationError(`Received 500 for activity ${activityId} - expired cookie`, 500);
+        } else if (response.status >= 500 && response.status < 600) {
+          // Real server outage (502/503/504), preserve cache and don't re-login
+          logger.error(`Real server outage ${response.status} - preserving local cache, not re-login.`);
+        }
+
+        // Return null immediately on non-200 errors
+        return null;
+      }
+
       logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`);
       const outerData = JSON.parse(response.data);
       if (outerData && typeof outerData.d === 'string') {
         const innerData = JSON.parse(outerData.d);
@@ -145,8 +116,11 @@ async function getActivityDetailsRaw(
         logger.error(`Unexpected API response structure for activity ${activityId}.`);
       }
     } catch (error: any) {
-      // Check if response status is in 4xx range (400-499) to trigger auth error
-      if (error.response && error.response.status >= 400 && error.response.status < 500) {
+      // Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors
+      // 404 (Not Found) is valid - activity doesn't exist
+      // Other 4xx/5xx errors should not trigger re-authentication
+      if (error.response && (error.response.status === 401 || error.response.status === 403)) {
         logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`);
         throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status);
       }
@@ -154,10 +128,21 @@ async function getActivityDetailsRaw(

       if (error.response) {
         logger.error(`Status: ${error.response.status}, Data (getActivityDetailsRaw): ${ String(error.response.data).slice(0,100)}...`);
+        // IMPORTANT: Only 500 is cookie expiration. Other 5xx (502/503/504) are real server outages.
+        // The backend returns 500 when cookie is expired but session not yet invalidated.
+        // 502/503/504 are real server errors (bad gateway, service unavailable, gateway timeout)
+        if (error.response.status === 500) {
+          logger.warn(`Server error 500 - this is cookie expiration. Throwing AuthenticationError to trigger immediate re-login.`);
+          throw new AuthenticationError(`Received 500 for activity ${activityId} - expired cookie`, 500);
+        } else if (error.response.status >= 500 && error.response.status < 600) {
+          // Real server outage (502/503/504), preserve cache and don't re-login
+          logger.error(`Real server outage ${error.response.status} - preserving local cache, not re-login.`);
+        }
       }
       if (attempt === maxRetries - 1) {
         logger.error(`All ${maxRetries} retries failed for activity ${activityId}.`);
-        throw error;
+        // Don't throw on network/timeout errors, just return null to preserve cache
+        return null;
       }
       await new Promise(resolve => setTimeout(resolve, 1000 * (attempt + 1)));
     }
@@ -177,7 +162,8 @@ export async function fetchActivityData(
   activityId: string,
   userName: string,
   userPwd: string,
-  forceLogin: boolean = false
+  forceLogin: boolean = false,
+  signal?: AbortSignal
 ): Promise<any | null> {
   let currentCookie = forceLogin ? null : await getCachedCookieString();

@@ -187,17 +173,10 @@ export async function fetchActivityData(
     currentCookie = null;
   }

+  // Optimization: Skip pre-validation, directly request data
+  // Only validate/re-login when we get 4xx error (fail-fast strategy)
   if (!currentCookie) {
     logger.info('No cached cookie found. Attempting login...');
     try {
       currentCookie = await getCompleteCookies(userName, userPwd);
-
-      const cookies = await loadCachedCookies();
-      if (cookies) {
-        await saveCookiesToCache(cookies);
-      }
     } catch (loginError) {
       logger.error(`Login process failed: ${(loginError as Error).message}`);
       return null;
@@ -209,10 +188,13 @@ export async function fetchActivityData(
     return null;
   }

   logger.debug('Using cached cookie for API request.');

   try {
-    const rawActivityDetailsString = await getActivityDetailsRaw(activityId, currentCookie);
+    logger.debug(`Calling getActivityDetailsRaw for activity ${activityId}...`);
+    const rawActivityDetailsString = await getActivityDetailsRaw(activityId, currentCookie, 3, 10000, signal);
+    logger.debug(`getActivityDetailsRaw returned for activity ${activityId}`);
     if (rawActivityDetailsString) {
       const parsedOuter = JSON.parse(rawActivityDetailsString);
       return JSON.parse(parsedOuter.d);
@@ -220,22 +202,28 @@ export async function fetchActivityData(
     logger.warn(`No data returned from getActivityDetailsRaw for activity ${activityId}, but no authentication error was thrown.`);
     return null;
   } catch (error) {
+    if (signal?.aborted) {
+      logger.debug(`Activity ${activityId} fetch aborted.`);
+      return null;
+    }
     if (error instanceof AuthenticationError) {
       // Cookie returned 4xx, now validate and re-login
       logger.warn(`API returned 4xx error (Status: ${error.status}). Cookie may be invalid. Attempting re-login and retry.`);
+      // Throttle: prevent thundering herd from multiple 500 errors
+      if (!tryAcquireAuthLock()) {
+        logger.info(`Auth throttled for activity ${activityId}. Reusing current cookies — likely still valid.`);
+        return null;
+      }
+
+      // Backup cookies before clearing so we can restore on re-login failure
+      backupCookies();
       await clearCookieCache();

       try {
         logger.info('Attempting re-login due to authentication failure...');
         currentCookie = await getCompleteCookies(userName, userPwd);

         const cookies = await loadCachedCookies();
         if (cookies) {
           await saveCookiesToCache(cookies);
         }
+        releaseAuthCooldown();

         logger.info('Re-login successful. Retrying request for activity details...');
-        const rawActivityDetailsStringRetry = await getActivityDetailsRaw(activityId, currentCookie);
+        const rawActivityDetailsStringRetry = await getActivityDetailsRaw(activityId, currentCookie, 1, 10000, signal);
         if (rawActivityDetailsStringRetry) {
           const parsedOuterRetry = JSON.parse(rawActivityDetailsStringRetry);
           return JSON.parse(parsedOuterRetry.d);
@@ -243,7 +231,9 @@ export async function fetchActivityData(
           logger.warn(`Still no details for activity ${activityId} after re-login and retry.`);
           return null;
         } catch (retryLoginOrFetchError) {
-          logger.error(`Error during re-login or retry fetch for activity ${activityId}: ${(retryLoginOrFetchError as Error).message}`);
+          logger.error(`Re-login or retry failed for activity ${activityId}: ${(retryLoginOrFetchError as Error).message}`);
+          // Restore old cookies instead of leaving cache empty
+          await restoreCookieBackup();
           return null;
         }
       } else {
```
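The new `signal?: AbortSignal` plumbing is paired with a hard timeout at the call site (see services/cache-manager.ts below). A minimal sketch of the pattern, assuming axios ≥ 0.22 (where the `signal` request option is supported); the `fetchWithHardTimeout` wrapper is illustrative, not part of this change:

```typescript
import axios from 'axios';

// Illustrative wrapper: abort the in-flight request after `ms`, so a hung
// connection cannot outlive axios' own `timeout` and leak as an orphaned fetch.
async function fetchWithHardTimeout(url: string, body: unknown, ms: number): Promise<string | null> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), ms);
  try {
    const res = await axios.post(url, body, {
      responseType: 'text',
      timeout: ms,               // axios socket/idle timeout
      signal: controller.signal, // hard cancellation for everything else
    });
    return res.status === 200 ? res.data : null;
  } catch {
    return null; // aborted, timed out, or failed: the caller keeps its cache
  } finally {
    clearTimeout(timer);
  }
}
```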
example.env (36 changed lines)
```diff
@@ -11,26 +11,34 @@ S3_SECRET_ACCESS_KEY=
 S3_REGION=
 S3_PUBLIC_URL_PREFIX=files
 REDIS_URL=redis://:dsas-cca@redis:6379
-LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'

+# ============================================================================
+# CRAWLER CONCURRENCY CONFIGURATION
+# ============================================================================
 MIN_ACTIVITY_ID_SCAN=3000
 MAX_ACTIVITY_ID_SCAN=8000
-CONCURRENT_API_CALLS=16
+
+# Maximum concurrent API calls during crawling (default: 8)
+# Higher values = faster crawling but more server load
+CONCURRENT_API_CALLS=8
+
+# Request timeout in milliseconds (default: 25000 = 25 seconds)
+CRAWLER_REQUEST_TIMEOUT_MS=25000
+
+# Maximum retries per request on transient errors (default: 3)
+CRAWLER_MAX_RETRIES=3
+
+# Delay between retries in milliseconds (default: 1000 = 1 second)
+CRAWLER_RETRY_DELAY_MS=1000
+
+# Rate limit: maximum requests per minute (default: unlimited)
+# Set to 0 for no limit
+CRAWLER_REQUESTS_PER_MINUTE=0
 STAFF_UPDATE_INTERVAL_MINS=360
 CLUB_UPDATE_INTERVAL_MINS=360
+LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'

 # Cache TTL Configuration (in seconds)
 ACTIVITY_CACHE_TTL=86400 # 24 hours for normal activity data
 STAFF_CACHE_TTL=86400 # 24 hours for staff data
 ERROR_CACHE_TTL=3600 # 1 hour for error states (allows retry)

 # Proxy Configuration (Optional)
 # Set USE_PROXY=true to enable proxy for Playwright requests
 USE_PROXY=false
-# Custom proxy server (default: socks5://warp-proxy:9091 when using warp-proxy service)
-# Examples:
-# HTTP: http://proxy.example.com:8080
-# SOCKS5: socks5://proxy.example.com:1080
-# Warp: socks5://warp-proxy:9091
 ALL_PROXY=
 HTTP_PROXY=
 HTTPS_PROXY=
```
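These knobs are consumed with `parseInt(..., 10)` plus the defaults shown in the comments; a minimal sketch of the reading side (the same pattern appears in services/cache-manager.ts below):

```typescript
import { config } from 'dotenv';
config();

// Defaults mirror the comments in example.env.
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10);
const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10);
const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10);
const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10);
```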
index.ts (42 changed lines)
```diff
@@ -38,8 +38,7 @@ import { extractBase64Image } from './utils/image-processor';
 import {
   initializeClubCache,
   updateStaleClubs,
-  initializeOrUpdateStaffCache,
-  cleanupOrphanedS3Images
+  initializeOrUpdateStaffCache
 } from './services/cache-manager';
 import { logger } from './utils/logger';
 import type { ActivityData } from './models/activity'
@@ -191,9 +190,11 @@ app.get('/v1/activity/list', async (req: Request, res: Response) => {
     return res.json({});
   }

-  const allActivities = await Promise.all(
-    activityKeys.map(async k => getActivityData(k.substring(ACTIVITY_KEY_PREFIX.length)))
-  );
+  const allActivities: (ActivityData | null)[] = [];
+  for (const k of activityKeys) {
+    const activityData = await getActivityData(k.substring(ACTIVITY_KEY_PREFIX.length));
+    allActivities.push(activityData);
+  }

   /* ---------- gather available filter values for validation ---------- */
   const availableCategories = new Set<string>();
@@ -261,13 +262,13 @@ app.get('/v1/activity/category', async (_req: Request, res: Response) => {
     logger.info('No activity keys found in Redis for categories.');
     return res.json({});
   }
-  // Fetch all activity data in parallel
-  const allActivityDataPromises = activityKeys.map(async (key) => {
+  // Fetch all activity data sequentially
+  const allActivities: (ActivityData | null)[] = [];
+  for (const key of activityKeys) {
     const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
-    return getActivityData(activityId);
-  });
-
-  const allActivities = await Promise.all(allActivityDataPromises);
+    const activityData = await getActivityData(activityId);
+    allActivities.push(activityData);
+  }

   allActivities.forEach((activityData: ActivityData | null) => {
     if (activityData &&
@@ -301,13 +302,13 @@ app.get('/v1/activity/academicYear', async (_req: Request, res: Response) => {
     logger.info('No activity keys found in Redis for academic years.');
     return res.json({});
   }
-  // 1. Fetch all activity data in parallel
-  const allActivities = await Promise.all(
-    activityKeys.map(async (key) => {
-      const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
-      return getActivityData(activityId);
-    })
-  );
+  // 1. Fetch all activity data sequentially
+  const allActivities: (ActivityData | null)[] = [];
+  for (const key of activityKeys) {
+    const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
+    const activityData = await getActivityData(activityId);
+    allActivities.push(activityData);
+  }
   // 2. Count activities per academic year
   allActivities.forEach((activityData: ActivityData | null) => {
     if (
@@ -419,7 +420,10 @@ async function performBackgroundTasks(): Promise<void> {
   try {
     await initializeClubCache();
     await initializeOrUpdateStaffCache(true);
-    await cleanupOrphanedS3Images();
+    // NOTE: Removed immediate cleanupOrphanedS3Images() call.
+    // Cleanup will run during periodic updateStaleClubs() instead.
+    // Running cleanup immediately after initialization caused race condition
+    // where newly uploaded images were deleted before they could be referenced.

     logger.info(`Setting up periodic club cache updates every ${CLUB_CHECK_INTERVAL_SECONDS} seconds.`);
     setInterval(() => {
```
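The three list endpoints now read activities one at a time, trading speed for predictable Redis load. If bounded parallelism is wanted later, the `executeWithConcurrency` helper this change set adds in utils/semaphore.ts could cap the fan-out instead; a sketch, assuming the redis-service imports used elsewhere in the repo:

```typescript
import { executeWithConcurrency } from './utils/semaphore';
import { getActivityData, getAllActivityKeys, ACTIVITY_KEY_PREFIX } from './services/redis-service';

// Sketch (not in this change): same result as the sequential loop,
// but with at most 8 Redis reads in flight at once.
async function fetchAllActivitiesBounded() {
  const activityKeys = await getAllActivityKeys();
  const tasks = activityKeys.map(
    (k: string) => () => getActivityData(k.substring(ACTIVITY_KEY_PREFIX.length))
  );
  return executeWithConcurrency(tasks, 8);
}
```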
package.json

```diff
@@ -24,7 +24,6 @@
     "crypto": "^1.0.1",
     "dotenv": "^16.5.0",
     "express": "^5.1.0",
-    "p-limit": "^6.2.0",
     "pangu": "^4.0.7",
     "sharp": "^0.34.1",
     "uuid": "^11.1.0"
```
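`p-limit` leaves the dependency list because its role is taken over by the in-repo Semaphore/BatchProcessor added in utils/semaphore.ts (below). Roughly, the migration at the call sites in services/cache-manager.ts looks like this sketch, where `processSingleActivity` stands in for the per-item work:

```typescript
import { BatchProcessor } from './utils/semaphore';

async function crawl(ids: string[], processSingleActivity: (id: string) => Promise<void>) {
  // Before: const limit = pLimit(8);
  //         await Promise.allSettled(ids.map((id) => limit(() => processSingleActivity(id))));
  const processor = new BatchProcessor(
    (id: string) => processSingleActivity(id),
    8 // CONCURRENT_API_CALLS
  );
  await processor.process(ids);
}
```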
services/cache-manager.ts

```diff
@@ -1,6 +1,5 @@
 // services/cache-manager.ts
 import { config } from 'dotenv';
-import pLimit from 'p-limit';
 import { fetchActivityData } from '../engage-api/get-activity';
 import { structActivityData } from '../engage-api/struct-activity';
 import { structStaffData } from '../engage-api/struct-staff';
@@ -12,9 +11,10 @@ import {
   getAllActivityKeys,
   ACTIVITY_KEY_PREFIX
 } from './redis-service';
-import { uploadImageFromBase64, listS3Objects, deleteS3Objects, constructS3Url } from './s3-service';
+import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service';
 import { extractBase64Image } from '../utils/image-processor';
 import { logger } from '../utils/logger';
+import { BatchProcessor } from '../utils/semaphore';

 import type { ActivityData } from '../models/activity';

@@ -25,36 +25,78 @@ const USERNAME = process.env.API_USERNAME;
 const PASSWORD = process.env.API_PASSWORD;
 const MIN_ACTIVITY_ID_SCAN = parseInt(process.env.MIN_ACTIVITY_ID_SCAN || '0', 10);
 const MAX_ACTIVITY_ID_SCAN = parseInt(process.env.MAX_ACTIVITY_ID_SCAN || '9999', 10);
-const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '10', 10);
 const CLUB_UPDATE_INTERVAL_MINS = parseInt(process.env.CLUB_UPDATE_INTERVAL_MINS || '60', 10);
 const STAFF_UPDATE_INTERVAL_MINS = parseInt(process.env.STAFF_UPDATE_INTERVAL_MINS || '60', 10);
 const FIXED_STAFF_ACTIVITY_ID = process.env.FIXED_STAFF_ACTIVITY_ID;
 const S3_IMAGE_PREFIX = (process.env.S3_PUBLIC_URL_PREFIX || 'files').replace(/\/$/, '');

-// Limit concurrent API calls
-const limit = pLimit(CONCURRENT_API_CALLS);
+// Crawler concurrency configuration
+const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10);
+const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10);
+const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10);
+const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10);
+
+// Module-level counter for skipped activities (reset at start of each scan)
+let skippedCount = 0;

 /**
  * Process and cache a single activity
+ * @param activityId - The activity ID to process
+ * @param forceUpdate - If true, update cache even on fetch failure (default: false)
+ * @returns The processed activity data
  */
-async function processAndCacheActivity(activityId: string): Promise<ActivityData> {
+async function processAndCacheActivity(activityId: string, forceUpdate: boolean = false): Promise<ActivityData> {
   logger.debug(`Processing activity ID: ${activityId}`);
   try {
     if (!USERNAME || !PASSWORD) {
       throw new Error('API username or password not configured');
     }

-    const activityJson = await fetchActivityData(activityId, USERNAME, PASSWORD);
+    // Add timeout protection via AbortController - properly cancels orphaned fetches
+    logger.debug(`Fetching activity data for ID: ${activityId}`);
+    const controller = new AbortController();
+    const timeoutId = setTimeout(
+      () => controller.abort(),
+      CRAWLER_REQUEST_TIMEOUT_MS + 5000
+    );
+
+    let activityJson: any = null;
+    try {
+      activityJson = await fetchActivityData(
+        activityId,
+        USERNAME,
+        PASSWORD,
+        false,
+        controller.signal
+      );
+    } finally {
+      clearTimeout(timeoutId);
+    }
+
+    if (controller.signal.aborted) {
+      logger.warn(`Request for activity ${activityId} timed out after ${CRAWLER_REQUEST_TIMEOUT_MS + 5000}ms. Cancelling orphaned fetch.`);
+      // Preserve existing cache on timeout
+      const existingData = await getActivityData(activityId);
+      return existingData || { lastCheck: new Date().toISOString(), error: `Timeout after ${CRAWLER_REQUEST_TIMEOUT_MS + 5000}ms` };
+    }
     let structuredActivity: ActivityData;

     if (!activityJson) {
-      logger.info(`No data found for activity ID ${activityId} from engage API. Caching as empty.`);
-      structuredActivity = {
-        lastCheck: new Date().toISOString(),
-        source: 'api-fetch-empty'
-      };
+      // CRITICAL: Only cache empty data if forceUpdate is true
+      // This prevents 5xx errors from overwriting valid local data
+      if (forceUpdate) {
+        logger.info(`No data found for activity ID ${activityId} from engage API. Force updating cache.`);
+        structuredActivity = {
+          lastCheck: new Date().toISOString(),
+          source: 'api-fetch-empty'
+        };
+        await setActivityData(activityId, structuredActivity);
+        return structuredActivity;
+      } else {
+        logger.warn(`No data for activity ${activityId}. Preserving existing cache - NOT updating.`);
+        const existingData = await getActivityData(activityId);
+        return existingData || { lastCheck: new Date().toISOString(), source: 'cache-preserved' };
+      }
     } else {
       structuredActivity = await structActivityData(activityJson);
       if (structuredActivity && structuredActivity.photo &&
@@ -82,17 +124,24 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
     return structuredActivity;
   } catch (error) {
     logger.error(`Error processing activity ID ${activityId}:`, error);
-    const errorData: ActivityData = {
-      lastCheck: new Date().toISOString(),
-      error: "Failed to fetch or process"
-    };
-    await setActivityData(activityId, errorData);
-    return errorData;
+    // CRITICAL: On error, preserve existing cache instead of overwriting with error data
+    if (forceUpdate) {
+      const errorData: ActivityData = {
+        lastCheck: new Date().toISOString(),
+        error: "Failed to fetch or process"
+      };
+      await setActivityData(activityId, errorData);
+      return errorData;
+    } else {
+      logger.warn(`Error fetching activity ${activityId}. Preserving existing cache.`);
+      const existingData = await getActivityData(activityId);
+      return existingData || { lastCheck: new Date().toISOString(), error: (error as Error).message };
+    }
   }
 }

 /**
- * Process a single activity for initialization (extracted for Promise.allSettled)
+ * Process a single activity for initialization
  * @param activityId - The activity ID to process
  */
 async function processSingleActivity(activityId: string): Promise<void> {
@@ -105,103 +154,118 @@ async function processSingleActivity(activityId: string): Promise<void> {

     logger.debug(`Initializing cache for activity ID: ${activityId}`);
     await processAndCacheActivity(activityId);
+  } else {
+    skippedCount++;
   }
-  // else: skip (already cached)
 }

 /**
  * Initialize the club cache by scanning through all activity IDs
  * Processed concurrently with controlled parallelism
  */
 export async function initializeClubCache(): Promise<void> {
   logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`);
   logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);

   const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1;
-  let processedCount = 0;
   let successCount = 0;
   let errorCount = 0;
-  let skippedCount = 0;
+  skippedCount = 0; // Reset for this run

-  const promises: Promise<void>[] = [];
+  // Generate array of activity IDs
+  const activityIds = Array.from(
+    { length: totalIds },
+    (_, i) => String(MIN_ACTIVITY_ID_SCAN + i)
+  );

-  for (let i = MIN_ACTIVITY_ID_SCAN; i <= MAX_ACTIVITY_ID_SCAN; i++) {
-    const activityId = String(i);
-    promises.push(
-      limit(() =>
-        processSingleActivity(activityId)
-          .then(() => {
-            successCount++;
-            processedCount++;
-            if (processedCount % 100 === 0) {
-              const mem = process.memoryUsage();
-              logger.info(`Progress: ${processedCount}/${totalIds} (${Math.round(processedCount/totalIds*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB`);
-            }
-          })
-          .catch((error: unknown) => {
-            errorCount++;
-            processedCount++;
-            logger.error(`Error processing activity ID ${activityId}:`, error);
-            if (processedCount % 100 === 0) {
-              const mem = process.memoryUsage();
-              logger.info(`Progress: ${processedCount}/${totalIds} (${Math.round(processedCount/totalIds*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB`);
-            }
-          })
-      )
-    );
-  }
-
-  // Use allSettled to prevent single hung promise from blocking all
-  await Promise.allSettled(promises);
+  // Create batch processor with concurrency control
+  const processor = new BatchProcessor(
+    async (activityId: string) => {
+      await processSingleActivity(activityId);
+      return activityId;
+    },
+    CONCURRENT_API_CALLS,
+    {
+      onError: (error, activityId) => {
+        errorCount++;
+        logger.error(`Error processing activity ID ${activityId}:`, error);
+      },
+      onProgress: (completed, total) => {
+        if (completed % 100 === 0 || completed === total) {
+          const mem = process.memoryUsage();
+          logger.info(`Progress: ${completed}/${total} (${Math.round(completed/total*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB | Concurrent: ${CONCURRENT_API_CALLS}`);
+        }
+      }
+    }
+  );
+
+  // Process all activities concurrently
+  const results = await processor.process(activityIds);
+  successCount = results.length;

   logger.info(`Initial club cache population finished.`);
-  logger.info(`Summary: Total: ${totalIds}, Processed: ${processedCount}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
+  logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
 }

 /**
  * Update stale clubs in the cache
  * Processed concurrently with controlled parallelism
  */
 export async function updateStaleClubs(): Promise<void> {
   logger.info('Starting stale club check...');
   logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
   const now = Date.now();
   const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000;
-  const promises: Promise<void>[] = [];
   const activityKeys = await getAllActivityKeys();

+  // Identify stale activities
+  const staleActivityIds: string[] = [];
   for (const key of activityKeys) {
     const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
-    promises.push(limit(async () => {
-      const cachedData = await getActivityData(activityId);
-
-      if (cachedData && cachedData.lastCheck) {
-        const lastCheckTime = new Date(cachedData.lastCheck).getTime();
-        if ((now - lastCheckTime) > updateIntervalMs || cachedData.error) {
-          logger.info(`Activity ${activityId} is stale or had error. Updating...`);
-          await processAndCacheActivity(activityId);
-        }
-      } else if (!cachedData || Object.keys(cachedData).length === 0) {
-        logger.info(`Activity ${activityId} not in cache or is empty object. Attempting to fetch...`);
-        await processAndCacheActivity(activityId);
-      }
-    }));
-  }
-
-  await cleanupOrphanedS3Images();
-
-  // Process promises in batches to prevent event loop blocking
-  const BATCH_SIZE = 50;
-  for (let i = 0; i < promises.length; i += BATCH_SIZE) {
-    const batch = promises.slice(i, i + BATCH_SIZE);
-    const batchNum = Math.floor(i / BATCH_SIZE) + 1;
-    const totalBatches = Math.ceil(promises.length / BATCH_SIZE);
+    const cachedData = await getActivityData(activityId);

-    await Promise.all(batch);
-    logger.info(`Stale check batch ${batchNum}/${totalBatches} complete`);
+    const needsUpdate = !cachedData ||
+      Object.keys(cachedData).length === 0 ||
+      (!cachedData.lastCheck && !cachedData.error) ||
+      (cachedData.lastCheck && (now - new Date(cachedData.lastCheck).getTime()) > updateIntervalMs) ||
+      cachedData.error;

-    // Small delay between batches to prevent event loop blocking
-    if (i + BATCH_SIZE < promises.length) {
-      await new Promise(resolve => setTimeout(resolve, 100));
+    if (needsUpdate) {
+      staleActivityIds.push(activityId);
     }
   }

+  if (staleActivityIds.length === 0) {
+    logger.info('No stale activities found. Skipping update.');
+    logger.info('Stale club check finished.');
+    return;
+  }
+
+  logger.info(`Found ${staleActivityIds.length} stale activities to update.`);
+
+  // Create batch processor for concurrent updates
+  const processor = new BatchProcessor(
+    async (activityId: string) => {
+      logger.debug(`Updating stale activity ${activityId}`);
+      await processAndCacheActivity(activityId);
+      return activityId;
+    },
+    CONCURRENT_API_CALLS,
+    {
+      onError: (error, activityId) => {
+        logger.error(`Error updating stale activity ${activityId}:`, error);
+      },
+      onProgress: (completed, total) => {
+        if (completed % 10 === 0 || completed === total) {
+          logger.info(`Update progress: ${completed}/${total} (${Math.round(completed/total*100)}%)`);
+        }
+      }
+    }
+  );
+
+  // Process stale activities concurrently
+  await processor.process(staleActivityIds);

   logger.info('Stale club check finished.');
 }
@@ -250,59 +314,3 @@ export async function initializeOrUpdateStaffCache(forceUpdate: boolean = false)
     logger.error('Error initializing or updating staff cache:', error);
   }
 }
-
-/**
- * Clean up orphaned S3 images
- */
-export async function cleanupOrphanedS3Images(): Promise<void> {
-  logger.info('Starting S3 orphan image cleanup...');
-  const s3ObjectListPrefix = S3_IMAGE_PREFIX ? `${S3_IMAGE_PREFIX}/` : '';
-
-  try {
-    const referencedS3Urls = new Set<string>();
-    const allActivityRedisKeys = await getAllActivityKeys();
-    const S3_ENDPOINT = process.env.S3_ENDPOINT;
-
-    for (const redisKey of allActivityRedisKeys) {
-      const activityId = redisKey.substring(ACTIVITY_KEY_PREFIX.length);
-      const activityData = await getActivityData(activityId);
-
-      if (activityData &&
-          typeof activityData.photo === 'string' &&
-          activityData.photo.startsWith('http') &&
-          S3_ENDPOINT &&
-          activityData.photo.startsWith(S3_ENDPOINT)) {
-        referencedS3Urls.add(activityData.photo);
-      }
-    }
-
-    logger.info(`Found ${referencedS3Urls.size} unique S3 URLs referenced in Redis.`);
-
-    const s3ObjectKeys = await listS3Objects(s3ObjectListPrefix);
-    if (!s3ObjectKeys || s3ObjectKeys.length === 0) {
-      logger.info(`No images found in S3 under prefix "${s3ObjectListPrefix}". Nothing to clean up.`);
-      return;
-    }
-
-    logger.debug(`Found ${s3ObjectKeys.length} objects in S3 under prefix "${s3ObjectListPrefix}".`);
-
-    const orphanedObjectKeys: string[] = [];
-    for (const objectKey of s3ObjectKeys) {
-      const s3Url = constructS3Url(objectKey);
-      if (s3Url && !referencedS3Urls.has(s3Url)) {
-        orphanedObjectKeys.push(objectKey);
-      }
-    }
-
-    if (orphanedObjectKeys.length > 0) {
-      logger.info(`Found ${orphanedObjectKeys.length} orphaned S3 objects to delete. Submitting deletion...`);
-      await deleteS3Objects(orphanedObjectKeys);
-    } else {
-      logger.info('No orphaned S3 images found after comparison.');
-    }
-
-    logger.info('S3 orphan image cleanup finished.');
-  } catch (error) {
-    logger.error('Error during S3 orphan image cleanup:', error);
-  }
-}
```
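The new `forceUpdate` flag is the crux of the cache-preservation change: by default, a failed or empty fetch no longer overwrites a valid Redis entry. Inside the module the two call patterns look like this sketch (the activity ID is only an example):

```typescript
// Default: transient failures and empty responses leave the cached entry untouched.
await processAndCacheActivity('3350');

// Explicit overwrite: cache the empty/error state anyway, for callers that
// genuinely want the stale entry replaced.
await processAndCacheActivity('3350', true);
```

In the hunks above, every call site still uses the default, so cache preservation is the effective behavior throughout.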
services/playwright-auth.ts

```diff
@@ -11,9 +11,38 @@ let _inMemoryCookies: Cookie[] | null = null;
 // Login lock to prevent concurrent login attempts
 let _loginLock: Promise<Cookie[]> | null = null;

-// Proxy configuration
-const USE_PROXY = process.env.USE_PROXY === 'true';
-const PROXY_SERVER = process.env.ALL_PROXY || process.env.HTTP_PROXY || `http://host.docker.internal:9091`;
+// Cookie backup: preserved before clearCookieCache, restored on re-login failure
+let _cookieBackup: Cookie[] | null = null;
+
+// Auth failure throttle: debounce consecutive re-login triggers from 500 errors
+// Prevents thundering herd when server is slow and returns many 500s
+let _authFailureCooldownUntil = 0;
+const AUTH_FAILURE_COOLDOWN_MS = 15000; // 15s cooldown between re-login cycles
+
+/**
+ * Put all callers to wait during auth cooldown window.
+ * Returns true if auth is allowed (outside cooldown), false if throttled.
+ */
+export function tryAcquireAuthLock(): boolean {
+  const now = Date.now();
+  if (now < _authFailureCooldownUntil) {
+    const remaining = _authFailureCooldownUntil - now;
+    logger.warn(
+      `Re-login throttled: ${Math.round(remaining / 1000)}s cooldown remaining. ` +
+      `Existing cookies are likely still valid — server 500 is a temporary slowdown.`
+    );
+    return false;
+  }
+  return true;
+}
+
+/**
+ * Called after a successful re-login to release the cooldown.
+ */
+export function releaseAuthCooldown(): void {
+  _authFailureCooldownUntil = Date.now() + AUTH_FAILURE_COOLDOWN_MS;
+  logger.info(`Auth cooldown set: ${AUTH_FAILURE_COOLDOWN_MS}ms to prevent thundering herd re-logins.`);
+}

 /**
  * Ensure only one login process runs at a time
@@ -48,15 +77,6 @@ export async function loginWithPlaywright(username: string, password: string): P
     args: ['--no-sandbox', '--disable-setuid-sandbox']
   };

-  // Configure proxy if enabled
-  if (USE_PROXY) {
-    logger.info(`Using proxy: ${PROXY_SERVER}`);
-    browserLaunchOptions.proxy = {
-      server: PROXY_SERVER,
-      bypass: 'localhost,127.0.0.1,::1'
-    };
-  }
-
   const browser = await chromium.launch(browserLaunchOptions);

   try {
@@ -191,8 +211,40 @@ export async function saveCookiesToCache(cookies: Cookie[]): Promise<void> {
   }
 }

+/**
+ * Backup current cookies before clearing. Restored if re-login fails.
+ */
+export function backupCookies(): Cookie[] | null {
+  if (_inMemoryCookies) {
+    _cookieBackup = [..._inMemoryCookies];
+    logger.info('Cookies backed up before clear.');
+  }
+  return _cookieBackup;
+}
+
+/**
+ * Restore cookies from backup after failed re-login.
+ */
+export async function restoreCookieBackup(): Promise<boolean> {
+  if (_cookieBackup) {
+    _inMemoryCookies = _cookieBackup;
+    try {
+      await fs.promises.writeFile(COOKIE_FILE_PATH, JSON.stringify(_cookieBackup, null, 2), 'utf-8');
+      logger.info('Cookies restored from backup successfully.');
+      _cookieBackup = null;
+      return true;
+    } catch (error: any) {
+      logger.error('Failed to restore cookies from backup:', error.message);
+      return false;
+    }
+  }
+  logger.warn('No cookie backup available for restore.');
+  return false;
+}
+
 /**
  * Clear cookie cache
+ * Prefer backupAndClearCookieCache() instead to preserve old cookies.
  */
 export async function clearCookieCache(): Promise<void> {
   _inMemoryCookies = null;
```
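Taken together with the engage-api/get-activity.ts changes above, the intended recovery sequence after a 500 looks like the following condensed sketch (`relogin` stands in for `getCompleteCookies`, which lives in get-activity.ts):

```typescript
import {
  tryAcquireAuthLock,
  releaseAuthCooldown,
  backupCookies,
  restoreCookieBackup,
  clearCookieCache,
} from './services/playwright-auth';

// Condensed recovery flow on AuthenticationError, as wired up in get-activity.ts.
async function recoverAuth(relogin: () => Promise<string>): Promise<string | null> {
  if (!tryAcquireAuthLock()) return null; // inside cooldown: keep current cookies
  backupCookies();                        // snapshot before wiping
  await clearCookieCache();
  try {
    const cookie = await relogin();       // e.g. getCompleteCookies(user, pwd)
    releaseAuthCooldown();                // arms the 15s cooldown window
    return cookie;
  } catch {
    await restoreCookieBackup();          // put the old cookies back
    return null;
  }
}
```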
services/s3-service.ts

```diff
@@ -149,51 +149,6 @@ export async function listS3Objects(prefix: string): Promise<string[]> {
   }
 }

-/**
- * Deletes multiple objects from S3.
- * @param objectKeysArray - Array of object keys to delete
- * @returns True if successful or partially successful, false on major error
- */
-export async function deleteS3Objects(objectKeysArray: string[]): Promise<boolean> {
-  if (!s3Client) {
-    logger.warn('S3 client not configured. Cannot delete objects.');
-    return false;
-  }
-  if (!objectKeysArray || objectKeysArray.length === 0) {
-    logger.info('No objects to delete from S3.');
-    return true;
-  }
-  try {
-    // With Bun's S3Client, we need to delete objects one by one
-    // Process in batches of 100 for better performance
-    const BATCH_SIZE = 100;
-    let successCount = 0;
-    let errorCount = 0;
-
-    for (let i = 0; i < objectKeysArray.length; i += BATCH_SIZE) {
-      const batch = objectKeysArray.slice(i, i + BATCH_SIZE);
-      // Process batch in parallel
-      const results = await Promise.allSettled(
-        batch.map(key => s3Client!.delete(key))
-      );
-      // Count successes and failures
-      for (const result of results) {
-        if (result.status === 'fulfilled') {
-          successCount++;
-        } else {
-          errorCount++;
-          logger.error(`Failed to delete object: ${result.reason}`);
-        }
-      }
-    }
-    logger.info(`Deleted ${successCount} objects from S3. Failed: ${errorCount}`);
-    return errorCount === 0; // True if all succeeded
-  } catch (error) {
-    logger.error('S3 DeleteObjects Error:', error);
-    return false;
-  }
-}
-
 /**
  * Constructs the public S3 URL for an object key.
  * Uses S3_PUBLIC_URL if set (reverse proxy scenario), otherwise uses S3_ENDPOINT.
```
test/test-concurrency.ts (new file, 141 lines)
```typescript
// test/test-concurrency.ts
/**
 * Test script for concurrency features
 * Run with: bun run test/test-concurrency.ts
 */

import { Semaphore, executeWithConcurrency, BatchProcessor } from '../utils/semaphore';

// Simulate API call
function simulateApiCall(id: number, delay: number = 100): Promise<{ id: number; result: string }> {
  return new Promise((resolve) => {
    setTimeout(() => {
      resolve({ id, result: `Result for ${id}` });
    }, delay);
  });
}

async function testSemaphore(): Promise<void> {
  console.log('\n=== Test 1: Basic Semaphore ===');
  const semaphore = new Semaphore(3);

  const start = Date.now();
  const promises = [];

  for (let i = 1; i <= 10; i++) {
    const id = i;
    promises.push(
      (async () => {
        await semaphore.acquire();
        console.log(`[${id}] Acquired permit (available: ${semaphore.getAvailablePermits()})`);
        await simulateApiCall(id, 200);
        console.log(`[${id}] Releasing permit`);
        semaphore.release();
      })()
    );
  }

  await Promise.all(promises);
  const duration = Date.now() - start;
  console.log(`\n✓ Completed 10 tasks with max 3 concurrent in ${duration}ms`);
  console.log(`  (Sequential would take ~2000ms, parallel should be ~700-800ms)`);
}

async function testExecuteWithConcurrency(): Promise<void> {
  console.log('\n=== Test 2: executeWithConcurrency ===');

  const tasks = Array.from({ length: 10 }, (_, i) => () => simulateApiCall(i + 1, 100));

  const start = Date.now();
  const results = await executeWithConcurrency(tasks, 5);
  const duration = Date.now() - start;

  console.log(`✓ Completed ${results.length} tasks with max 5 concurrent in ${duration}ms`);
  console.log(`  Results: ${results.map(r => r.id).join(', ')}`);
}

async function testBatchProcessor(): Promise<void> {
  console.log('\n=== Test 3: BatchProcessor ===');

  const items = Array.from({ length: 20 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` }));
  let processedCount = 0;

  const processor = new BatchProcessor(
    async (item: { id: number; name: string }) => {
      await simulateApiCall(item.id, 50);
      processedCount++;
      return { ...item, processed: true };
    },
    4, // concurrency
    {
      onProgress: (completed, total) => {
        if (completed % 5 === 0 || completed === total) {
          console.log(`  Progress: ${completed}/${total} (${Math.round(completed / total * 100)}%)`);
        }
      },
      onError: (error, item) => {
        console.error(`  Error processing item ${item.id}:`, error.message);
      }
    }
  );

  const start = Date.now();
  const results = await processor.process(items);
  const duration = Date.now() - start;

  console.log(`✓ Processed ${results.length} items with max 4 concurrent in ${duration}ms`);
  console.log(`  Expected ~500ms (20 items / 4 concurrent * 50ms each)`);
}

async function testErrorHandling(): Promise<void> {
  console.log('\n=== Test 4: Error Handling ===');

  const items = [1, 2, 3, 4, 5];
  let errorCount = 0;

  const processor = new BatchProcessor(
    async (id: number) => {
      if (id % 2 === 0) {
        throw new Error(`Simulated error for ${id}`);
      }
      return { id, success: true };
    },
    3,
    {
      onError: (error, item) => {
        errorCount++;
        console.log(`  Caught error for item ${item}: ${error.message}`);
      }
    }
  );

  const results = await processor.process(items);
  console.log(`✓ Completed: ${results.length} success, ${errorCount} errors (errors handled gracefully)`);
}

async function main(): Promise<void> {
  console.log('╔═══════════════════════════════════════════════════╗');
  console.log('║        Concurrency Module Test Suite              ║');
  console.log('╚═══════════════════════════════════════════════════╝');

  try {
    await testSemaphore();
    await testExecuteWithConcurrency();
    await testBatchProcessor();
    await testErrorHandling();

    console.log('\n╔═══════════════════════════════════════════════════╗');
    console.log('║        All tests passed! ✓                        ║');
    console.log('╚═══════════════════════════════════════════════════╝');
    console.log('\n📝 Configuration:');
    console.log('  - Set CONCURRENT_API_CALLS in .env to control parallelism');
    console.log('  - Current default: 8 concurrent requests');
    console.log('  - Example: CONCURRENT_API_CALLS=16 for faster crawling');
    console.log('');
  } catch (error) {
    console.error('\n❌ Test failed:', error);
    process.exit(1);
  }
}

main();
```
utils/semaphore.ts (new file, 220 lines)
```typescript
// utils/semaphore.ts
/**
 * Semaphore implementation for controlling concurrent operations
 * Based on patterns from civitai/civitai and p-queue
 */

export class Semaphore {
  private capacity: number;
  private permits: number;
  private queue: Array<() => void> = [];

  constructor(capacity: number) {
    if (capacity < 1) {
      throw new Error('Semaphore capacity must be at least 1');
    }
    this.capacity = capacity;
    this.permits = capacity;
  }

  /**
   * Acquire a permit. If none available, waits until one is released.
   */
  async acquire(): Promise<void> {
    return new Promise<void>((resolve) => {
      if (this.permits > 0) {
        this.permits--;
        resolve();
      } else {
        // Queue the release callback
        this.queue.push(() => {
          resolve();
        });
      }
    });
  }

  /**
   * Release a permit and wake up a waiting task if any.
   */
  release(): void {
    if (this.queue.length > 0) {
      const next = this.queue.shift();
      if (next) {
        next();
      }
    } else {
      this.permits++;
    }
  }

  /**
   * Get current available permits.
   */
  getAvailablePermits(): number {
    return this.permits;
  }

  /**
   * Get total capacity.
   */
  getCapacity(): number {
    return this.capacity;
  }

  /**
   * Get number of waiting tasks.
   */
  getWaitingCount(): number {
    return this.queue.length;
  }
}

/**
 * Execute async tasks with concurrency limit
 * @param tasks Array of async task functions
 * @param concurrency Maximum concurrent tasks
 * @returns Promise that resolves with all results when complete
 */
export async function executeWithConcurrency<T>(
  tasks: Array<() => Promise<T>>,
  concurrency: number
): Promise<T[]> {
  const semaphore = new Semaphore(concurrency);
  const results: T[] = new Array(tasks.length);

  const promises = tasks.map(async (task, index) => {
    await semaphore.acquire();
    try {
      results[index] = await task();
      return results[index];
    } finally {
      semaphore.release();
    }
  });

  return Promise.all(promises);
}

/**
 * Execute async tasks with concurrency limit and progress callback
 * @param tasks Array of async task functions
 * @param concurrency Maximum concurrent tasks
 * @param onProgress Callback with (completed, total, result) after each task
 * @returns Promise that resolves with all results when complete
 */
export async function executeWithConcurrencyAndProgress<T>(
  tasks: Array<() => Promise<T>>,
  concurrency: number,
  onProgress?: (completed: number, total: number, result: T, error?: Error) => void
): Promise<T[]> {
  const semaphore = new Semaphore(concurrency);
  const results: T[] = new Array(tasks.length);
  let completed = 0;
  const total = tasks.length;

  const promises = tasks.map(async (task, index) => {
    await semaphore.acquire();
    try {
      results[index] = await task();
      completed++;
      onProgress?.(completed, total, results[index]!);
      return results[index];
    } catch (error) {
      completed++;
      onProgress?.(completed, total, undefined as T, error as Error);
      throw error;
    } finally {
      semaphore.release();
    }
  });

  return Promise.all(promises);
}

/**
 * Batch processor with concurrency control
 * Useful for processing large arrays in chunks with controlled concurrency
 */
export class BatchProcessor<T, R> {
  private semaphore: Semaphore;
  private processor: (item: T, index: number) => Promise<R>;
  private onError?: (error: Error, item: T, index: number) => void;
  private onProgress?: (completed: number, total: number) => void;

  constructor(
    processor: (item: T, index: number) => Promise<R>,
    concurrency: number,
    options?: {
      onError?: (error: Error, item: T, index: number) => void;
      onProgress?: (completed: number, total: number) => void;
    }
  ) {
    this.processor = processor;
    this.semaphore = new Semaphore(concurrency);
    this.onError = options?.onError;
    this.onProgress = options?.onProgress;
  }

  /**
   * Process an array of items with concurrency control
   * Only returns successful results, errors are handled by onError callback
   */
  async process(items: T[]): Promise<Awaited<R>[]> {
    const results: (Awaited<R> | undefined)[] = new Array(items.length);
    let completed = 0;
    const total = items.length;

    const promises = items.map(async (item, index) => {
      await this.semaphore.acquire();
      try {
        const result = await this.processor(item, index);
        completed++;
        this.onProgress?.(completed, total);
        return result;
      } catch (error) {
        completed++;
        this.onProgress?.(completed, total);
        this.onError?.(error as Error, item, index);
        return undefined;
      } finally {
        this.semaphore.release();
      }
    });

    const allResults = await Promise.all(promises);
    return allResults.filter((r): r is Awaited<R> => r !== undefined);
  }

  /**
   * Process an array and return both results and errors
   */
  async processWithErrors(items: T[]): Promise<{
    results: R[];
    errors: Array<{ error: Error; item: T; index: number }>;
  }> {
    const results: R[] = [];
    const errors: Array<{ error: Error; item: T; index: number }> = [];
    let completed = 0;
    const total = items.length;

    const promises = items.map(async (item, index) => {
      await this.semaphore.acquire();
      try {
        const result = await this.processor(item, index);
        results.push(result);
        completed++;
        this.onProgress?.(completed, total);
      } catch (error) {
        completed++;
        this.onProgress?.(completed, total);
        errors.push({ error: error as Error, item, index });
      } finally {
        this.semaphore.release();
      }
    });

    await Promise.all(promises);
    return { results, errors };
  }
}
```
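Note the `release()` design: when waiters are queued, the permit is handed directly to the next waiter rather than incrementing the counter, which avoids a wake-up race. One convenience the file stops short of shipping is an acquire/release guard; a hypothetical helper (not in this change) that holds a permit for the duration of a task:

```typescript
import { Semaphore } from './utils/semaphore';

// Hypothetical helper: run `task` while holding a permit, releasing it
// even when the task throws.
async function withPermit<T>(sem: Semaphore, task: () => Promise<T>): Promise<T> {
  await sem.acquire();
  try {
    return await task();
  } finally {
    sem.release();
  }
}

// Usage sketch:
// const sem = new Semaphore(4);
// const page = await withPermit(sem, () => fetch('https://example.com').then(r => r.text()));
```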