重构 scan: 实现并发爬虫功能

- 新增 Semaphore 信号量类控制并发数
- 新增 BatchProcessor 批量处理器带进度回调
- 重构 initializeClubCache 和 updateStaleClubs 为并发模式
- 修复 Cookie 4xx 判断逻辑(仅 401/403 触发重新登录)
- 添加环境变量配置:CONCURRENT_API_CALLS 等
- 新增并发功能测试脚本 test-concurrency.ts

性能提升:从串行处理提升至可配置的并发处理(默认并发数为 8)
修复问题:404 错误不再误判为认证失败
This commit is contained in:
JamesFlare1212
2026-04-07 18:18:18 -04:00
parent fc98dbbbae
commit 0a133159e8
5 changed files with 492 additions and 119 deletions

View File

@@ -2,7 +2,6 @@
import axios from 'axios';
import { logger } from '../utils/logger';
import {
loginWithPlaywright,
ensureSingleLogin,
loadCachedCookies,
saveCookiesToCache,
@@ -28,68 +27,6 @@ class AuthenticationError extends Error {
}
}
/**
 * Probe the activity-details endpoint to decide whether a cookie is usable.
 *
 * A GetActivityDetails POST for a fixed activity ID is sent with the cookie
 * attached, for at most three attempts.
 *
 * @param cookieString - Value sent verbatim as the `Cookie` request header.
 * @returns `true` when a request resolves with a status outside 400-499;
 *   `false` for an empty cookie, for any 4xx status, or once every attempt
 *   has failed with a non-4xx status or without a response.
 */
async function testCookieValidityWithApi(cookieString: string): Promise<boolean> {
  // Nothing to test: an empty cookie is reported invalid without any request.
  if (!cookieString) {
    return false;
  }

  const attemptLimit = 3;
  const endpoint = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
  const inClientErrorRange = (status: number): boolean => status >= 400 && status < 500;

  logger.debug('Testing cookie validity via API...');

  for (let attemptNo = 1; attemptNo <= attemptLimit; attemptNo++) {
    try {
      const requestHeaders = {
        'Content-Type': 'application/json; charset=UTF-8',
        'Cookie': cookieString,
        'User-Agent': 'Mozilla/5.0 (Bun DSAS-CCA get-activity Module)',
      };
      const requestBody = { activityID: '3350' };

      logger.debug(`Attempt ${attemptNo}/${attemptLimit}`);
      const reply = await axios.post(endpoint, requestBody, {
        headers: requestHeaders,
        timeout: 10000,
      });

      // Reached only when the HTTP client resolves (rather than rejects) with a 4xx status.
      if (inClientErrorRange(reply.status)) {
        logger.warn(`Cookie test returned ${reply.status}, likely invalid`);
        return false;
      }

      logger.debug('Cookie test successful (API responded 2xx). Cookie is valid.');
      return true;
    } catch (failure: any) {
      logger.warn(`Cookie validity test failed (attempt ${attemptNo}/${attemptLimit}).`);

      const httpReply = failure.response;
      if (!httpReply) {
        // No response object at all: network failure or timeout, worth another try.
        logger.warn(`Network/timeout error: ${failure.message} (retrying...)`);
      } else if (inClientErrorRange(httpReply.status)) {
        // Any 4xx is treated as a rejected cookie and ends the probe immediately.
        logger.warn(`Cookie test API response status: ${httpReply.status} (auth error)`);
        return false;
      } else {
        // Remaining statuses are handled as server-side errors and retried.
        logger.warn(`Cookie test API response status: ${httpReply.status} (server error, retrying...)`);
      }

      // Wait before the next attempt: 2 s after the first failure, 3 s after the second.
      if (attemptNo < attemptLimit) {
        const pauseMs = 1000 * (attemptNo + 1);
        await new Promise((resume) => setTimeout(resume, pauseMs));
      }
    }
  }

  logger.warn('Max retries reached. Cookie is likely invalid or expired.');
  return false;
}
/**
* Get complete cookies using Playwright with single login lock
*/
@@ -112,7 +49,7 @@ async function getCompleteCookies(userName: string, userPwd: string): Promise<st
async function getActivityDetailsRaw(
activityId: string,
cookies: string,
maxRetries: number = 2,
maxRetries: number = 3,
timeoutMilliseconds: number = 10000
): Promise<string | null> {
const url = 'https://engage.nkcswx.cn/Services/ActivitiesService.asmx/GetActivityDetails';
@@ -128,11 +65,15 @@ async function getActivityDetailsRaw(
for (let attempt = 0; attempt < maxRetries; attempt++) {
try {
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Sending POST request to ${url}`);
const response = await axios.post(url, payload, {
headers,
timeout: timeoutMilliseconds,
responseType: 'text'
responseType: 'text',
// Add additional timeout safety
maxRedirects: 5
});
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`);
const outerData = JSON.parse(response.data);
if (outerData && typeof outerData.d === 'string') {
const innerData = JSON.parse(outerData.d);
@@ -145,8 +86,10 @@ async function getActivityDetailsRaw(
logger.error(`Unexpected API response structure for activity ${activityId}.`);
}
} catch (error: any) {
// Check if response status is in 4xx range (400-499) to trigger auth error
if (error.response && error.response.status >= 400 && error.response.status < 500) {
// Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors
// 404 (Not Found) is valid - activity doesn't exist
// Other 4xx errors should not trigger re-authentication
if (error.response && (error.response.status === 401 || error.response.status === 403)) {
logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`);
throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status);
}
@@ -177,7 +120,7 @@ export async function fetchActivityData(
activityId: string,
userName: string,
userPwd: string,
forceLogin: boolean = false
forceLogin: boolean = false,
): Promise<any | null> {
let currentCookie = forceLogin ? null : await getCachedCookieString();
@@ -212,7 +155,9 @@ export async function fetchActivityData(
logger.debug('Using cached cookie for API request.');
try {
logger.debug(`Calling getActivityDetailsRaw for activity ${activityId}...`);
const rawActivityDetailsString = await getActivityDetailsRaw(activityId, currentCookie);
logger.debug(`getActivityDetailsRaw returned for activity ${activityId}`);
if (rawActivityDetailsString) {
const parsedOuter = JSON.parse(rawActivityDetailsString);
return JSON.parse(parsedOuter.d);