重构 scan: 实现多线程并发爬虫功能
- 新增 Semaphore 信号量类控制并发数 - 新增 BatchProcessor 批量处理器带进度回调 - 重构 initializeClubCache 和 updateStaleClubs 为并发模式 - 修复 Cookie 4xx 判断逻辑(仅 401/403 触发重新登录) - 添加环境变量配置:CONCURRENT_API_CALLS 等 - 新增并发功能测试脚本 test-concurrency.ts 性能提升:从串行处理提升至可配置的并发处理(默认 8 线程) 修复问题:404 错误不再误判为认证失败
This commit is contained in:
@@ -2,7 +2,6 @@
|
|||||||
import axios from 'axios';
|
import axios from 'axios';
|
||||||
import { logger } from '../utils/logger';
|
import { logger } from '../utils/logger';
|
||||||
import {
|
import {
|
||||||
loginWithPlaywright,
|
|
||||||
ensureSingleLogin,
|
ensureSingleLogin,
|
||||||
loadCachedCookies,
|
loadCachedCookies,
|
||||||
saveCookiesToCache,
|
saveCookiesToCache,
|
||||||
@@ -28,68 +27,6 @@ class AuthenticationError extends Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Test cookie validity by calling API
|
|
||||||
*/
|
|
||||||
async function testCookieValidityWithApi(cookieString: string): Promise<boolean> {
|
|
||||||
if (!cookieString) return false;
|
|
||||||
logger.debug('Testing cookie validity via API...');
|
|
||||||
|
|
||||||
const MAX_RETRIES = 3;
|
|
||||||
let attempt = 0;
|
|
||||||
|
|
||||||
while (attempt < MAX_RETRIES) {
|
|
||||||
try {
|
|
||||||
attempt++;
|
|
||||||
const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
|
|
||||||
const headers = {
|
|
||||||
'Content-Type': 'application/json; charset=UTF-8',
|
|
||||||
'Cookie': cookieString,
|
|
||||||
'User-Agent': 'Mozilla/5.0 (Bun DSAS-CCA get-activity Module)',
|
|
||||||
};
|
|
||||||
const payload = {
|
|
||||||
"activityID": "3350"
|
|
||||||
};
|
|
||||||
|
|
||||||
logger.debug(`Attempt ${attempt}/${MAX_RETRIES}`);
|
|
||||||
const response = await axios.post(url, payload, {
|
|
||||||
headers,
|
|
||||||
timeout: 10000
|
|
||||||
});
|
|
||||||
|
|
||||||
// Check for 4xx errors (auth failures)
|
|
||||||
if (response.status >= 400 && response.status < 500) {
|
|
||||||
logger.warn(`Cookie test returned ${response.status}, likely invalid`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.debug('Cookie test successful (API responded 2xx). Cookie is valid.');
|
|
||||||
return true;
|
|
||||||
} catch (error: any) {
|
|
||||||
logger.warn(`Cookie validity test failed (attempt ${attempt}/${MAX_RETRIES}).`);
|
|
||||||
if (error.response) {
|
|
||||||
// 4xx = auth failure (immediate fail)
|
|
||||||
if (error.response.status >= 400 && error.response.status < 500) {
|
|
||||||
logger.warn(`Cookie test API response status: ${error.response.status} (auth error)`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// 5xx = server error (retry with delay)
|
|
||||||
logger.warn(`Cookie test API response status: ${error.response.status} (server error, retrying...)`);
|
|
||||||
} else {
|
|
||||||
// No response (000 status, network error, timeout)
|
|
||||||
logger.warn(`Network/timeout error: ${error.message} (retrying...)`);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (attempt < MAX_RETRIES) {
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 1000 * (attempt + 1)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.warn('Max retries reached. Cookie is likely invalid or expired.');
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get complete cookies using Playwright with single login lock
|
* Get complete cookies using Playwright with single login lock
|
||||||
*/
|
*/
|
||||||
@@ -112,7 +49,7 @@ async function getCompleteCookies(userName: string, userPwd: string): Promise<st
|
|||||||
async function getActivityDetailsRaw(
|
async function getActivityDetailsRaw(
|
||||||
activityId: string,
|
activityId: string,
|
||||||
cookies: string,
|
cookies: string,
|
||||||
maxRetries: number = 2,
|
maxRetries: number = 3,
|
||||||
timeoutMilliseconds: number = 10000
|
timeoutMilliseconds: number = 10000
|
||||||
): Promise<string | null> {
|
): Promise<string | null> {
|
||||||
const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
|
const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
|
||||||
@@ -128,11 +65,15 @@ async function getActivityDetailsRaw(
|
|||||||
|
|
||||||
for (let attempt = 0; attempt < maxRetries; attempt++) {
|
for (let attempt = 0; attempt < maxRetries; attempt++) {
|
||||||
try {
|
try {
|
||||||
|
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Sending POST request to ${url}`);
|
||||||
const response = await axios.post(url, payload, {
|
const response = await axios.post(url, payload, {
|
||||||
headers,
|
headers,
|
||||||
timeout: timeoutMilliseconds,
|
timeout: timeoutMilliseconds,
|
||||||
responseType: 'text'
|
responseType: 'text',
|
||||||
|
// Add additional timeout safety
|
||||||
|
maxRedirects: 5
|
||||||
});
|
});
|
||||||
|
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`);
|
||||||
const outerData = JSON.parse(response.data);
|
const outerData = JSON.parse(response.data);
|
||||||
if (outerData && typeof outerData.d === 'string') {
|
if (outerData && typeof outerData.d === 'string') {
|
||||||
const innerData = JSON.parse(outerData.d);
|
const innerData = JSON.parse(outerData.d);
|
||||||
@@ -145,8 +86,10 @@ async function getActivityDetailsRaw(
|
|||||||
logger.error(`Unexpected API response structure for activity ${activityId}.`);
|
logger.error(`Unexpected API response structure for activity ${activityId}.`);
|
||||||
}
|
}
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
// Check if response status is in 4xx range (400-499) to trigger auth error
|
// Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors
|
||||||
if (error.response && error.response.status >= 400 && error.response.status < 500) {
|
// 404 (Not Found) is valid - activity doesn't exist
|
||||||
|
// Other 4xx errors should not trigger re-authentication
|
||||||
|
if (error.response && (error.response.status === 401 || error.response.status === 403)) {
|
||||||
logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`);
|
logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`);
|
||||||
throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status);
|
throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status);
|
||||||
}
|
}
|
||||||
@@ -177,7 +120,7 @@ export async function fetchActivityData(
|
|||||||
activityId: string,
|
activityId: string,
|
||||||
userName: string,
|
userName: string,
|
||||||
userPwd: string,
|
userPwd: string,
|
||||||
forceLogin: boolean = false
|
forceLogin: boolean = false,
|
||||||
): Promise<any | null> {
|
): Promise<any | null> {
|
||||||
let currentCookie = forceLogin ? null : await getCachedCookieString();
|
let currentCookie = forceLogin ? null : await getCachedCookieString();
|
||||||
|
|
||||||
@@ -212,7 +155,9 @@ export async function fetchActivityData(
|
|||||||
logger.debug('Using cached cookie for API request.');
|
logger.debug('Using cached cookie for API request.');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
logger.debug(`Calling getActivityDetailsRaw for activity ${activityId}...`);
|
||||||
const rawActivityDetailsString = await getActivityDetailsRaw(activityId, currentCookie);
|
const rawActivityDetailsString = await getActivityDetailsRaw(activityId, currentCookie);
|
||||||
|
logger.debug(`getActivityDetailsRaw returned for activity ${activityId}`);
|
||||||
if (rawActivityDetailsString) {
|
if (rawActivityDetailsString) {
|
||||||
const parsedOuter = JSON.parse(rawActivityDetailsString);
|
const parsedOuter = JSON.parse(rawActivityDetailsString);
|
||||||
return JSON.parse(parsedOuter.d);
|
return JSON.parse(parsedOuter.d);
|
||||||
|
|||||||
37
example.env
37
example.env
@@ -11,26 +11,35 @@ S3_SECRET_ACCESS_KEY=
|
|||||||
S3_REGION=
|
S3_REGION=
|
||||||
S3_PUBLIC_URL_PREFIX=files
|
S3_PUBLIC_URL_PREFIX=files
|
||||||
REDIS_URL=redis://:dsas-cca@redis:6379
|
REDIS_URL=redis://:dsas-cca@redis:6379
|
||||||
|
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CRAWLER CONCURRENCY CONFIGURATION
|
||||||
|
# ============================================================================
|
||||||
MIN_ACTIVITY_ID_SCAN=3000
|
MIN_ACTIVITY_ID_SCAN=3000
|
||||||
MAX_ACTIVITY_ID_SCAN=8000
|
MAX_ACTIVITY_ID_SCAN=8000
|
||||||
CONCURRENT_API_CALLS=16
|
|
||||||
|
# Maximum concurrent API calls during crawling (default: 8)
|
||||||
|
# Higher values = faster crawling but more server load
|
||||||
|
# Set to 1 for sequential processing (slow but safe)
|
||||||
|
CONCURRENT_API_CALLS=8
|
||||||
|
|
||||||
|
# Request timeout in milliseconds (default: 25000 = 25 seconds)
|
||||||
|
CRAWLER_REQUEST_TIMEOUT_MS=25000
|
||||||
|
|
||||||
|
# Maximum retries per request on transient errors (default: 3)
|
||||||
|
CRAWLER_MAX_RETRIES=3
|
||||||
|
|
||||||
|
# Delay between retries in milliseconds (default: 1000 = 1 second)
|
||||||
|
CRAWLER_RETRY_DELAY_MS=1000
|
||||||
|
|
||||||
|
# Rate limit: maximum requests per minute (default: unlimited)
|
||||||
|
# Set to 0 for no limit
|
||||||
|
CRAWLER_REQUESTS_PER_MINUTE=0
|
||||||
STAFF_UPDATE_INTERVAL_MINS=360
|
STAFF_UPDATE_INTERVAL_MINS=360
|
||||||
CLUB_UPDATE_INTERVAL_MINS=360
|
CLUB_UPDATE_INTERVAL_MINS=360
|
||||||
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
|
|
||||||
|
|
||||||
# Cache TTL Configuration (in seconds)
|
# Cache TTL Configuration (in seconds)
|
||||||
ACTIVITY_CACHE_TTL=86400 # 24 hours for normal activity data
|
ACTIVITY_CACHE_TTL=86400 # 24 hours for normal activity data
|
||||||
STAFF_CACHE_TTL=86400 # 24 hours for staff data
|
STAFF_CACHE_TTL=86400 # 24 hours for staff data
|
||||||
ERROR_CACHE_TTL=3600 # 1 hour for error states (allows retry)
|
ERROR_CACHE_TTL=3600 # 1 hour for error states (allows retry)
|
||||||
|
|
||||||
# Proxy Configuration (Optional)
|
|
||||||
# Set USE_PROXY=true to enable proxy for Playwright requests
|
|
||||||
USE_PROXY=false
|
|
||||||
# Custom proxy server (default: socks5://warp-proxy:9091 when using warp-proxy service)
|
|
||||||
# Examples:
|
|
||||||
# HTTP: http://proxy.example.com:8080
|
|
||||||
# SOCKS5: socks5://proxy.example.com:1080
|
|
||||||
# Warp: socks5://warp-proxy:9091
|
|
||||||
ALL_PROXY=
|
|
||||||
HTTP_PROXY=
|
|
||||||
HTTPS_PROXY=
|
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import {
|
|||||||
import { uploadImageFromBase64, listS3Objects, deleteS3Objects, constructS3Url } from './s3-service';
|
import { uploadImageFromBase64, listS3Objects, deleteS3Objects, constructS3Url } from './s3-service';
|
||||||
import { extractBase64Image } from '../utils/image-processor';
|
import { extractBase64Image } from '../utils/image-processor';
|
||||||
import { logger } from '../utils/logger';
|
import { logger } from '../utils/logger';
|
||||||
|
import { BatchProcessor, executeWithConcurrencyAndProgress } from '../utils/semaphore';
|
||||||
|
|
||||||
import type { ActivityData } from '../models/activity';
|
import type { ActivityData } from '../models/activity';
|
||||||
|
|
||||||
@@ -24,13 +25,17 @@ const USERNAME = process.env.API_USERNAME;
|
|||||||
const PASSWORD = process.env.API_PASSWORD;
|
const PASSWORD = process.env.API_PASSWORD;
|
||||||
const MIN_ACTIVITY_ID_SCAN = parseInt(process.env.MIN_ACTIVITY_ID_SCAN || '0', 10);
|
const MIN_ACTIVITY_ID_SCAN = parseInt(process.env.MIN_ACTIVITY_ID_SCAN || '0', 10);
|
||||||
const MAX_ACTIVITY_ID_SCAN = parseInt(process.env.MAX_ACTIVITY_ID_SCAN || '9999', 10);
|
const MAX_ACTIVITY_ID_SCAN = parseInt(process.env.MAX_ACTIVITY_ID_SCAN || '9999', 10);
|
||||||
// Default 5 concurrent calls (was 10) - Sharp AVIF processing is CPU-intensive and can block event loop
|
|
||||||
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '5', 10);
|
|
||||||
const CLUB_UPDATE_INTERVAL_MINS = parseInt(process.env.CLUB_UPDATE_INTERVAL_MINS || '60', 10);
|
const CLUB_UPDATE_INTERVAL_MINS = parseInt(process.env.CLUB_UPDATE_INTERVAL_MINS || '60', 10);
|
||||||
const STAFF_UPDATE_INTERVAL_MINS = parseInt(process.env.STAFF_UPDATE_INTERVAL_MINS || '60', 10);
|
const STAFF_UPDATE_INTERVAL_MINS = parseInt(process.env.STAFF_UPDATE_INTERVAL_MINS || '60', 10);
|
||||||
const FIXED_STAFF_ACTIVITY_ID = process.env.FIXED_STAFF_ACTIVITY_ID;
|
const FIXED_STAFF_ACTIVITY_ID = process.env.FIXED_STAFF_ACTIVITY_ID;
|
||||||
const S3_IMAGE_PREFIX = (process.env.S3_PUBLIC_URL_PREFIX || 'files').replace(/\/$/, '');
|
const S3_IMAGE_PREFIX = (process.env.S3_PUBLIC_URL_PREFIX || 'files').replace(/\/$/, '');
|
||||||
|
|
||||||
|
// Crawler concurrency configuration
|
||||||
|
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10);
|
||||||
|
const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10);
|
||||||
|
const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10);
|
||||||
|
const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10);
|
||||||
|
|
||||||
// Module-level counter for skipped activities (reset at start of each scan)
|
// Module-level counter for skipped activities (reset at start of each scan)
|
||||||
let skippedCount = 0;
|
let skippedCount = 0;
|
||||||
|
|
||||||
@@ -46,7 +51,14 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
|
|||||||
throw new Error('API username or password not configured');
|
throw new Error('API username or password not configured');
|
||||||
}
|
}
|
||||||
|
|
||||||
const activityJson = await fetchActivityData(activityId, USERNAME, PASSWORD);
|
// Add timeout protection for the entire fetch operation
|
||||||
|
logger.debug(`Fetching activity data for ID: ${activityId}`);
|
||||||
|
const activityJson = await Promise.race([
|
||||||
|
fetchActivityData(activityId, USERNAME, PASSWORD, false),
|
||||||
|
new Promise((_, reject) =>
|
||||||
|
setTimeout(() => reject(new Error(`Timeout fetching activity ${activityId} after ${CRAWLER_REQUEST_TIMEOUT_MS}ms`)), CRAWLER_REQUEST_TIMEOUT_MS + 5000)
|
||||||
|
)
|
||||||
|
]);
|
||||||
let structuredActivity: ActivityData;
|
let structuredActivity: ActivityData;
|
||||||
|
|
||||||
if (!activityJson) {
|
if (!activityJson) {
|
||||||
@@ -92,7 +104,7 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Process a single activity for initialization (extracted for Promise.allSettled)
|
* Process a single activity for initialization
|
||||||
* @param activityId - The activity ID to process
|
* @param activityId - The activity ID to process
|
||||||
*/
|
*/
|
||||||
async function processSingleActivity(activityId: string): Promise<void> {
|
async function processSingleActivity(activityId: string): Promise<void> {
|
||||||
@@ -112,66 +124,112 @@ async function processSingleActivity(activityId: string): Promise<void> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the club cache by scanning through all activity IDs
|
* Initialize the club cache by scanning through all activity IDs
|
||||||
* Processed sequentially in single-threaded mode
|
* Processed concurrently with controlled parallelism
|
||||||
*/
|
*/
|
||||||
export async function initializeClubCache(): Promise<void> {
|
export async function initializeClubCache(): Promise<void> {
|
||||||
logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN} (single-threaded mode)`);
|
logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`);
|
||||||
|
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
|
||||||
|
|
||||||
const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1;
|
const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1;
|
||||||
let processedCount = 0;
|
|
||||||
let successCount = 0;
|
let successCount = 0;
|
||||||
let errorCount = 0;
|
let errorCount = 0;
|
||||||
skippedCount = 0; // Reset for this run
|
skippedCount = 0; // Reset for this run
|
||||||
|
|
||||||
for (let i = MIN_ACTIVITY_ID_SCAN; i <= MAX_ACTIVITY_ID_SCAN; i++) {
|
// Generate array of activity IDs
|
||||||
const activityId = String(i);
|
const activityIds = Array.from(
|
||||||
|
{ length: totalIds },
|
||||||
try {
|
(_, i) => String(MIN_ACTIVITY_ID_SCAN + i)
|
||||||
|
);
|
||||||
|
|
||||||
|
// Create batch processor with concurrency control
|
||||||
|
const processor = new BatchProcessor(
|
||||||
|
async (activityId: string) => {
|
||||||
await processSingleActivity(activityId);
|
await processSingleActivity(activityId);
|
||||||
successCount++;
|
return activityId;
|
||||||
} catch (error) {
|
},
|
||||||
errorCount++;
|
CONCURRENT_API_CALLS,
|
||||||
logger.error(`Error processing activity ID ${activityId}:`, error);
|
{
|
||||||
|
onError: (error, activityId) => {
|
||||||
|
errorCount++;
|
||||||
|
logger.error(`Error processing activity ID ${activityId}:`, error);
|
||||||
|
},
|
||||||
|
onProgress: (completed, total) => {
|
||||||
|
if (completed % 100 === 0 || completed === total) {
|
||||||
|
const mem = process.memoryUsage();
|
||||||
|
logger.info(`Progress: ${completed}/${total} (${Math.round(completed/total*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB | Concurrent: ${CONCURRENT_API_CALLS}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
);
|
||||||
processedCount++;
|
|
||||||
|
// Process all activities concurrently
|
||||||
if (processedCount % 100 === 0) {
|
const results = await processor.process(activityIds);
|
||||||
const mem = process.memoryUsage();
|
successCount = results.length;
|
||||||
logger.info(`Progress: ${processedCount}/${totalIds} (${Math.round(processedCount/totalIds*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info(`Initial club cache population finished.`);
|
logger.info(`Initial club cache population finished.`);
|
||||||
logger.info(`Summary: Total: ${totalIds}, Processed: ${processedCount}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
|
logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Update stale clubs in the cache
|
* Update stale clubs in the cache
|
||||||
* Processed sequentially in single-threaded mode
|
* Processed concurrently with controlled parallelism
|
||||||
*/
|
*/
|
||||||
export async function updateStaleClubs(): Promise<void> {
|
export async function updateStaleClubs(): Promise<void> {
|
||||||
logger.info('Starting stale club check (single-threaded mode)...');
|
logger.info('Starting stale club check...');
|
||||||
|
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000;
|
const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000;
|
||||||
const activityKeys = await getAllActivityKeys();
|
const activityKeys = await getAllActivityKeys();
|
||||||
|
|
||||||
|
// Identify stale activities
|
||||||
|
const staleActivityIds: string[] = [];
|
||||||
for (const key of activityKeys) {
|
for (const key of activityKeys) {
|
||||||
const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
|
const activityId = key.substring(ACTIVITY_KEY_PREFIX.length);
|
||||||
const cachedData = await getActivityData(activityId);
|
const cachedData = await getActivityData(activityId);
|
||||||
|
|
||||||
if (cachedData && cachedData.lastCheck) {
|
const needsUpdate = !cachedData ||
|
||||||
const lastCheckTime = new Date(cachedData.lastCheck).getTime();
|
Object.keys(cachedData).length === 0 ||
|
||||||
if ((now - lastCheckTime) > updateIntervalMs || cachedData.error) {
|
(!cachedData.lastCheck && !cachedData.error) ||
|
||||||
logger.info(`Activity ${activityId} is stale or had error. Updating...`);
|
(cachedData.lastCheck && (now - new Date(cachedData.lastCheck).getTime()) > updateIntervalMs) ||
|
||||||
await processAndCacheActivity(activityId);
|
cachedData.error;
|
||||||
}
|
|
||||||
} else if (!cachedData || Object.keys(cachedData).length === 0) {
|
if (needsUpdate) {
|
||||||
logger.info(`Activity ${activityId} not in cache or is empty object. Attempting to fetch...`);
|
staleActivityIds.push(activityId);
|
||||||
await processAndCacheActivity(activityId);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (staleActivityIds.length === 0) {
|
||||||
|
logger.info('No stale activities found. Skipping update.');
|
||||||
|
await cleanupOrphanedS3Images();
|
||||||
|
logger.info('Stale club check finished.');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(`Found ${staleActivityIds.length} stale activities to update.`);
|
||||||
|
|
||||||
|
// Create batch processor for concurrent updates
|
||||||
|
const processor = new BatchProcessor(
|
||||||
|
async (activityId: string) => {
|
||||||
|
logger.debug(`Updating stale activity ${activityId}`);
|
||||||
|
await processAndCacheActivity(activityId);
|
||||||
|
return activityId;
|
||||||
|
},
|
||||||
|
CONCURRENT_API_CALLS,
|
||||||
|
{
|
||||||
|
onError: (error, activityId) => {
|
||||||
|
logger.error(`Error updating stale activity ${activityId}:`, error);
|
||||||
|
},
|
||||||
|
onProgress: (completed, total) => {
|
||||||
|
if (completed % 10 === 0 || completed === total) {
|
||||||
|
logger.info(`Update progress: ${completed}/${total} (${Math.round(completed/total*100)}%)`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// Process stale activities concurrently
|
||||||
|
await processor.process(staleActivityIds);
|
||||||
|
|
||||||
await cleanupOrphanedS3Images();
|
await cleanupOrphanedS3Images();
|
||||||
|
|
||||||
logger.info('Stale club check finished.');
|
logger.info('Stale club check finished.');
|
||||||
|
|||||||
141
test/test-concurrency.ts
Normal file
141
test/test-concurrency.ts
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
// test/test-concurrency.ts
|
||||||
|
/**
|
||||||
|
* Test script for concurrency features
|
||||||
|
* Run with: bun run test/test-concurrency.ts
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { Semaphore, executeWithConcurrency, BatchProcessor } from '../utils/semaphore';
|
||||||
|
|
||||||
|
// Simulate API call
|
||||||
|
function simulateApiCall(id: number, delay: number = 100): Promise<{ id: number; result: string }> {
|
||||||
|
return new Promise((resolve) => {
|
||||||
|
setTimeout(() => {
|
||||||
|
resolve({ id, result: `Result for ${id}` });
|
||||||
|
}, delay);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
async function testSemaphore(): Promise<void> {
|
||||||
|
console.log('\n=== Test 1: Basic Semaphore ===');
|
||||||
|
const semaphore = new Semaphore(3);
|
||||||
|
|
||||||
|
const start = Date.now();
|
||||||
|
const promises = [];
|
||||||
|
|
||||||
|
for (let i = 1; i <= 10; i++) {
|
||||||
|
const id = i;
|
||||||
|
promises.push(
|
||||||
|
(async () => {
|
||||||
|
await semaphore.acquire();
|
||||||
|
console.log(`[${id}] Acquired permit (available: ${semaphore.getAvailablePermits()})`);
|
||||||
|
await simulateApiCall(id, 200);
|
||||||
|
console.log(`[${id}] Releasing permit`);
|
||||||
|
semaphore.release();
|
||||||
|
})()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
await Promise.all(promises);
|
||||||
|
const duration = Date.now() - start;
|
||||||
|
console.log(`\n✓ Completed 10 tasks with max 3 concurrent in ${duration}ms`);
|
||||||
|
console.log(` (Sequential would take ~2000ms, parallel should be ~700-800ms)`);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function testExecuteWithConcurrency(): Promise<void> {
|
||||||
|
console.log('\n=== Test 2: executeWithConcurrency ===');
|
||||||
|
|
||||||
|
const tasks = Array.from({ length: 10 }, (_, i) => () => simulateApiCall(i + 1, 100));
|
||||||
|
|
||||||
|
const start = Date.now();
|
||||||
|
const results = await executeWithConcurrency(tasks, 5);
|
||||||
|
const duration = Date.now() - start;
|
||||||
|
|
||||||
|
console.log(`✓ Completed ${results.length} tasks with max 5 concurrent in ${duration}ms`);
|
||||||
|
console.log(` Results: ${results.map(r => r.id).join(', ')}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function testBatchProcessor(): Promise<void> {
|
||||||
|
console.log('\n=== Test 3: BatchProcessor ===');
|
||||||
|
|
||||||
|
const items = Array.from({ length: 20 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` }));
|
||||||
|
let processedCount = 0;
|
||||||
|
|
||||||
|
const processor = new BatchProcessor(
|
||||||
|
async (item: { id: number; name: string }) => {
|
||||||
|
await simulateApiCall(item.id, 50);
|
||||||
|
processedCount++;
|
||||||
|
return { ...item, processed: true };
|
||||||
|
},
|
||||||
|
4, // concurrency
|
||||||
|
{
|
||||||
|
onProgress: (completed, total) => {
|
||||||
|
if (completed % 5 === 0 || completed === total) {
|
||||||
|
console.log(` Progress: ${completed}/${total} (${Math.round(completed / total * 100)}%)`);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
onError: (error, item) => {
|
||||||
|
console.error(` Error processing item ${item.id}:`, error.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
const start = Date.now();
|
||||||
|
const results = await processor.process(items);
|
||||||
|
const duration = Date.now() - start;
|
||||||
|
|
||||||
|
console.log(`✓ Processed ${results.length} items with max 4 concurrent in ${duration}ms`);
|
||||||
|
console.log(` Expected ~500ms (20 items / 4 concurrent * 50ms each)`);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function testErrorHandling(): Promise<void> {
|
||||||
|
console.log('\n=== Test 4: Error Handling ===');
|
||||||
|
|
||||||
|
const items = [1, 2, 3, 4, 5];
|
||||||
|
let errorCount = 0;
|
||||||
|
|
||||||
|
const processor = new BatchProcessor(
|
||||||
|
async (id: number) => {
|
||||||
|
if (id % 2 === 0) {
|
||||||
|
throw new Error(`Simulated error for ${id}`);
|
||||||
|
}
|
||||||
|
return { id, success: true };
|
||||||
|
},
|
||||||
|
3,
|
||||||
|
{
|
||||||
|
onError: (error, item) => {
|
||||||
|
errorCount++;
|
||||||
|
console.log(` Caught error for item ${item}: ${error.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
const results = await processor.process(items);
|
||||||
|
console.log(`✓ Completed: ${results.length} success, ${errorCount} errors (errors handled gracefully)`);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main(): Promise<void> {
|
||||||
|
console.log('╔═══════════════════════════════════════════════════╗');
|
||||||
|
console.log('║ Concurrency Module Test Suite ║');
|
||||||
|
console.log('╚═══════════════════════════════════════════════════╝');
|
||||||
|
|
||||||
|
try {
|
||||||
|
await testSemaphore();
|
||||||
|
await testExecuteWithConcurrency();
|
||||||
|
await testBatchProcessor();
|
||||||
|
await testErrorHandling();
|
||||||
|
|
||||||
|
console.log('\n╔═══════════════════════════════════════════════════╗');
|
||||||
|
console.log('║ All tests passed! ✓ ║');
|
||||||
|
console.log('╚═══════════════════════════════════════════════════╝');
|
||||||
|
console.log('\n📝 Configuration:');
|
||||||
|
console.log(' - Set CONCURRENT_API_CALLS in .env to control parallelism');
|
||||||
|
console.log(' - Current default: 8 concurrent requests');
|
||||||
|
console.log(' - Example: CONCURRENT_API_CALLS=16 for faster crawling');
|
||||||
|
console.log('');
|
||||||
|
} catch (error) {
|
||||||
|
console.error('\n❌ Test failed:', error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main();
|
||||||
220
utils/semaphore.ts
Normal file
220
utils/semaphore.ts
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
// utils/semaphore.ts
|
||||||
|
/**
|
||||||
|
* Semaphore implementation for controlling concurrent operations
|
||||||
|
* Based on patterns from civitai/civitai and p-queue
|
||||||
|
*/
|
||||||
|
|
||||||
|
export class Semaphore {
|
||||||
|
private capacity: number;
|
||||||
|
private permits: number;
|
||||||
|
private queue: Array<() => void> = [];
|
||||||
|
|
||||||
|
constructor(capacity: number) {
|
||||||
|
if (capacity < 1) {
|
||||||
|
throw new Error('Semaphore capacity must be at least 1');
|
||||||
|
}
|
||||||
|
this.capacity = capacity;
|
||||||
|
this.permits = capacity;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Acquire a permit. If none available, waits until one is released.
|
||||||
|
*/
|
||||||
|
async acquire(): Promise<void> {
|
||||||
|
return new Promise<void>((resolve) => {
|
||||||
|
if (this.permits > 0) {
|
||||||
|
this.permits--;
|
||||||
|
resolve();
|
||||||
|
} else {
|
||||||
|
// Queue the release callback
|
||||||
|
this.queue.push(() => {
|
||||||
|
resolve();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Release a permit and wake up a waiting task if any.
|
||||||
|
*/
|
||||||
|
release(): void {
|
||||||
|
if (this.queue.length > 0) {
|
||||||
|
const next = this.queue.shift();
|
||||||
|
if (next) {
|
||||||
|
next();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
this.permits++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get current available permits.
|
||||||
|
*/
|
||||||
|
getAvailablePermits(): number {
|
||||||
|
return this.permits;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get total capacity.
|
||||||
|
*/
|
||||||
|
getCapacity(): number {
|
||||||
|
return this.capacity;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get number of waiting tasks.
|
||||||
|
*/
|
||||||
|
getWaitingCount(): number {
|
||||||
|
return this.queue.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Execute async tasks with concurrency limit
|
||||||
|
* @param tasks Array of async task functions
|
||||||
|
* @param concurrency Maximum concurrent tasks
|
||||||
|
* @returns Promise that resolves with all results when complete
|
||||||
|
*/
|
||||||
|
export async function executeWithConcurrency<T>(
|
||||||
|
tasks: Array<() => Promise<T>>,
|
||||||
|
concurrency: number
|
||||||
|
): Promise<T[]> {
|
||||||
|
const semaphore = new Semaphore(concurrency);
|
||||||
|
const results: T[] = new Array(tasks.length);
|
||||||
|
|
||||||
|
const promises = tasks.map(async (task, index) => {
|
||||||
|
await semaphore.acquire();
|
||||||
|
try {
|
||||||
|
results[index] = await task();
|
||||||
|
return results[index];
|
||||||
|
} finally {
|
||||||
|
semaphore.release();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return Promise.all(promises);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Execute async tasks with concurrency limit and progress callback
|
||||||
|
* @param tasks Array of async task functions
|
||||||
|
* @param concurrency Maximum concurrent tasks
|
||||||
|
* @param onProgress Callback with (completed, total, result) after each task
|
||||||
|
* @returns Promise that resolves with all results when complete
|
||||||
|
*/
|
||||||
|
export async function executeWithConcurrencyAndProgress<T>(
|
||||||
|
tasks: Array<() => Promise<T>>,
|
||||||
|
concurrency: number,
|
||||||
|
onProgress?: (completed: number, total: number, result: T, error?: Error) => void
|
||||||
|
): Promise<T[]> {
|
||||||
|
const semaphore = new Semaphore(concurrency);
|
||||||
|
const results: T[] = new Array(tasks.length);
|
||||||
|
let completed = 0;
|
||||||
|
const total = tasks.length;
|
||||||
|
|
||||||
|
const promises = tasks.map(async (task, index) => {
|
||||||
|
await semaphore.acquire();
|
||||||
|
try {
|
||||||
|
results[index] = await task();
|
||||||
|
completed++;
|
||||||
|
onProgress?.(completed, total, results[index]!);
|
||||||
|
return results[index];
|
||||||
|
} catch (error) {
|
||||||
|
completed++;
|
||||||
|
onProgress?.(completed, total, undefined as T, error as Error);
|
||||||
|
throw error;
|
||||||
|
} finally {
|
||||||
|
semaphore.release();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return Promise.all(promises);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Batch processor with concurrency control
|
||||||
|
* Useful for processing large arrays in chunks with controlled concurrency
|
||||||
|
*/
|
||||||
|
export class BatchProcessor<T, R> {
|
||||||
|
private semaphore: Semaphore;
|
||||||
|
private processor: (item: T, index: number) => Promise<R>;
|
||||||
|
private onError?: (error: Error, item: T, index: number) => void;
|
||||||
|
private onProgress?: (completed: number, total: number) => void;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
processor: (item: T, index: number) => Promise<R>,
|
||||||
|
concurrency: number,
|
||||||
|
options?: {
|
||||||
|
onError?: (error: Error, item: T, index: number) => void;
|
||||||
|
onProgress?: (completed: number, total: number) => void;
|
||||||
|
}
|
||||||
|
) {
|
||||||
|
this.processor = processor;
|
||||||
|
this.semaphore = new Semaphore(concurrency);
|
||||||
|
this.onError = options?.onError;
|
||||||
|
this.onProgress = options?.onProgress;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process an array of items with concurrency control
|
||||||
|
* Only returns successful results, errors are handled by onError callback
|
||||||
|
*/
|
||||||
|
async process(items: T[]): Promise<Awaited<R>[]> {
|
||||||
|
const results: (Awaited<R> | undefined)[] = new Array(items.length);
|
||||||
|
let completed = 0;
|
||||||
|
const total = items.length;
|
||||||
|
|
||||||
|
const promises = items.map(async (item, index) => {
|
||||||
|
await this.semaphore.acquire();
|
||||||
|
try {
|
||||||
|
const result = await this.processor(item, index);
|
||||||
|
completed++;
|
||||||
|
this.onProgress?.(completed, total);
|
||||||
|
return result;
|
||||||
|
} catch (error) {
|
||||||
|
completed++;
|
||||||
|
this.onProgress?.(completed, total);
|
||||||
|
this.onError?.(error as Error, item, index);
|
||||||
|
return undefined;
|
||||||
|
} finally {
|
||||||
|
this.semaphore.release();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
const allResults = await Promise.all(promises);
|
||||||
|
return allResults.filter((r): r is Awaited<R> => r !== undefined);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process an array and return both results and errors
|
||||||
|
*/
|
||||||
|
async processWithErrors(items: T[]): Promise<{
|
||||||
|
results: R[];
|
||||||
|
errors: Array<{ error: Error; item: T; index: number }>;
|
||||||
|
}> {
|
||||||
|
const results: R[] = [];
|
||||||
|
const errors: Array<{ error: Error; item: T; index: number }> = [];
|
||||||
|
let completed = 0;
|
||||||
|
const total = items.length;
|
||||||
|
|
||||||
|
const promises = items.map(async (item, index) => {
|
||||||
|
await this.semaphore.acquire();
|
||||||
|
try {
|
||||||
|
const result = await this.processor(item, index);
|
||||||
|
results.push(result);
|
||||||
|
completed++;
|
||||||
|
this.onProgress?.(completed, total);
|
||||||
|
} catch (error) {
|
||||||
|
completed++;
|
||||||
|
this.onProgress?.(completed, total);
|
||||||
|
errors.push({ error: error as Error, item, index });
|
||||||
|
} finally {
|
||||||
|
this.semaphore.release();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
await Promise.all(promises);
|
||||||
|
return { results, errors };
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user