重构 scan: 实现多线程并发爬虫功能

- 新增 Semaphore 信号量类控制并发数
- 新增 BatchProcessor 批量处理器带进度回调
- 重构 initializeClubCache 和 updateStaleClubs 为并发模式
- 修复 Cookie 4xx 判断逻辑(仅 401/403 触发重新登录)
- 添加环境变量配置:CONCURRENT_API_CALLS 等
- 新增并发功能测试脚本 test-concurrency.ts

性能提升:从串行处理提升至可配置的并发处理(默认并发数为 8)
修复问题:404 错误不再误判为认证失败
This commit is contained in:
JamesFlare1212
2026-04-07 18:18:18 -04:00
parent fc98dbbbae
commit 0a133159e8
5 changed files with 492 additions and 119 deletions

View File

@@ -11,26 +11,35 @@ S3_SECRET_ACCESS_KEY=
S3_REGION=
S3_PUBLIC_URL_PREFIX=files
REDIS_URL=redis://:dsas-cca@redis:6379
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
# ============================================================================
# CRAWLER CONCURRENCY CONFIGURATION
# ============================================================================
MIN_ACTIVITY_ID_SCAN=3000
MAX_ACTIVITY_ID_SCAN=8000
CONCURRENT_API_CALLS=16
# Maximum concurrent API calls during crawling (default: 8)
# Higher values = faster crawling but more server load
# Set to 1 for sequential processing (slow but safe)
CONCURRENT_API_CALLS=8
# Request timeout in milliseconds (default: 25000 = 25 seconds)
CRAWLER_REQUEST_TIMEOUT_MS=25000
# Maximum retries per request on transient errors (default: 3)
CRAWLER_MAX_RETRIES=3
# Delay between retries in milliseconds (default: 1000 = 1 second)
CRAWLER_RETRY_DELAY_MS=1000
# Rate limit: maximum requests per minute (default: unlimited)
# A value of 0 means no limit
CRAWLER_REQUESTS_PER_MINUTE=0
STAFF_UPDATE_INTERVAL_MINS=360
CLUB_UPDATE_INTERVAL_MINS=360
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
# Cache TTL Configuration (in seconds)
ACTIVITY_CACHE_TTL=86400 # 24 hours for normal activity data
STAFF_CACHE_TTL=86400 # 24 hours for staff data
ERROR_CACHE_TTL=3600 # 1 hour for error states (allows retry)
# Proxy Configuration (Optional)
# Set USE_PROXY=true to enable proxy for Playwright requests
USE_PROXY=false
# Custom proxy server (default: socks5://warp-proxy:9091 when using warp-proxy service)
# Examples:
# HTTP: http://proxy.example.com:8080
# SOCKS5: socks5://proxy.example.com:1080
# Warp: socks5://warp-proxy:9091
ALL_PROXY=
HTTP_PROXY=
HTTPS_PROXY=