重构 scan: 实现多线程并发爬虫功能
- 新增 Semaphore 信号量类控制并发数
- 新增 BatchProcessor 批量处理器带进度回调
- 重构 initializeClubCache 和 updateStaleClubs 为并发模式
- 修复 Cookie 4xx 判断逻辑(仅 401/403 触发重新登录)
- 添加环境变量配置:CONCURRENT_API_CALLS 等
- 新增并发功能测试脚本 test-concurrency.ts

性能提升:从串行处理提升至可配置的并发处理(默认 8 线程)
修复问题:404 错误不再误判为认证失败
This commit is contained in:
37
example.env
37
example.env
@@ -11,26 +11,35 @@ S3_SECRET_ACCESS_KEY=
|
||||
S3_REGION=
|
||||
S3_PUBLIC_URL_PREFIX=files
|
||||
REDIS_URL=redis://:dsas-cca@redis:6379
|
||||
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
|
||||
|
||||
# ============================================================================
|
||||
# CRAWLER CONCURRENCY CONFIGURATION
|
||||
# ============================================================================
|
||||
MIN_ACTIVITY_ID_SCAN=3000
|
||||
MAX_ACTIVITY_ID_SCAN=8000
|
||||
CONCURRENT_API_CALLS=16
|
||||
|
||||
# Maximum concurrent API calls during crawling (default: 8)
|
||||
# Higher values = faster crawling but more server load
|
||||
# Set to 1 for sequential processing (slow but safe)
|
||||
CONCURRENT_API_CALLS=8
|
||||
|
||||
# Request timeout in milliseconds (default: 25000 = 25 seconds)
|
||||
CRAWLER_REQUEST_TIMEOUT_MS=25000
|
||||
|
||||
# Maximum retries per request on transient errors (default: 3)
|
||||
CRAWLER_MAX_RETRIES=3
|
||||
|
||||
# Delay between retries in milliseconds (default: 1000 = 1 second)
|
||||
CRAWLER_RETRY_DELAY_MS=1000
|
||||
|
||||
# Rate limit: maximum requests per minute (default: 0 = unlimited)
|
||||
# Set to 0 for no limit
|
||||
CRAWLER_REQUESTS_PER_MINUTE=0
|
||||
STAFF_UPDATE_INTERVAL_MINS=360
|
||||
CLUB_UPDATE_INTERVAL_MINS=360
|
||||
LOG_LEVEL=info # Example: 'debug', 'info', 'warn', 'error'
|
||||
|
||||
# Cache TTL Configuration (in seconds)
|
||||
ACTIVITY_CACHE_TTL=86400 # 24 hours for normal activity data
|
||||
STAFF_CACHE_TTL=86400 # 24 hours for staff data
|
||||
ERROR_CACHE_TTL=3600 # 1 hour for error states (allows retry)
|
||||
|
||||
# Proxy Configuration (Optional)
|
||||
# Set USE_PROXY=true to route Playwright requests through a proxy
|
||||
USE_PROXY=false
|
||||
# Custom proxy server (default: socks5://warp-proxy:9091 when using the warp-proxy service)
|
||||
# Examples:
|
||||
# HTTP: http://proxy.example.com:8080
|
||||
# SOCKS5: socks5://proxy.example.com:1080
|
||||
# Warp: socks5://warp-proxy:9091
|
||||
ALL_PROXY=
|
||||
HTTP_PROXY=
|
||||
HTTPS_PROXY=
|
||||
|
||||
Reference in New Issue
Block a user