diff --git a/example.env b/example.env index a03dc22..e655d71 100644 --- a/example.env +++ b/example.env @@ -21,7 +21,6 @@ MAX_ACTIVITY_ID_SCAN=8000 # Maximum concurrent API calls during crawling (default: 8) # Higher values = faster crawling but more server load -# Set to 1 for sequential processing (slow but safe) CONCURRENT_API_CALLS=8 # Request timeout in milliseconds (default: 25000 = 25 seconds) diff --git a/services/cache-manager.ts b/services/cache-manager.ts index 27a7728..f6e85ca 100644 --- a/services/cache-manager.ts +++ b/services/cache-manager.ts @@ -14,7 +14,7 @@ import { import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service'; import { extractBase64Image } from '../utils/image-processor'; import { logger } from '../utils/logger'; -import { BatchProcessor, executeWithConcurrencyAndProgress } from '../utils/semaphore'; + import type { ActivityData } from '../models/activity'; @@ -61,8 +61,7 @@ function extractObjectKeyFromUrl(url: string): string | null { } } -// Crawler concurrency configuration -const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10); +// Crawler configuration const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10); const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10); const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10); @@ -155,11 +154,10 @@ async function processSingleActivity(activityId: string): Promise { /** * Initialize the club cache by scanning through all activity IDs - * Processed concurrently with controlled parallelism + * Processed sequentially */ export async function initializeClubCache(): Promise { logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`); - logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`); const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1; let successCount = 0; @@ -172,30 +170,23 @@ export async function initializeClubCache(): Promise { (_, i) => String(MIN_ACTIVITY_ID_SCAN + i) ); - // Create batch processor with concurrency control - const processor = new BatchProcessor( - async (activityId: string) => { + // Process all activities sequentially + for (let i = 0; i < activityIds.length; i++) { + const activityId = activityIds[i]!; + try { await processSingleActivity(activityId); - return activityId; - }, - CONCURRENT_API_CALLS, - { - onError: (error, activityId) => { - errorCount++; - logger.error(`Error processing activity ID ${activityId}:`, error); - }, - onProgress: (completed, total) => { - if (completed % 100 === 0 || completed === total) { - const mem = process.memoryUsage(); - logger.info(`Progress: ${completed}/${total} (${Math.round(completed/total*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB | Concurrent: ${CONCURRENT_API_CALLS}`); - } - } + successCount++; + } catch (error) { + errorCount++; + logger.error(`Error processing activity ID ${activityId}:`, error); } - ); - - // Process all activities concurrently - const results = await processor.process(activityIds); - successCount = results.length; + + // Log progress every 100 activities or at completion + if ((i + 1) % 100 === 0 || i === activityIds.length - 1) { + const mem = process.memoryUsage(); + logger.info(`Progress: ${i + 1}/${totalIds} (${Math.round((i + 1) / totalIds * 100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed / 1024 / 1024)}MB`); + } + } logger.info(`Initial club cache population finished.`); logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`); @@ -203,11 +194,10 @@ export async function initializeClubCache(): Promise { /** * Update stale clubs in the cache - * Processed concurrently with controlled parallelism + * Processed sequentially */ export async function updateStaleClubs(): Promise { logger.info('Starting stale club check...'); - logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`); const now = Date.now(); const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000; const activityKeys = await getAllActivityKeys(); @@ -237,28 +227,21 @@ export async function updateStaleClubs(): Promise { logger.info(`Found ${staleActivityIds.length} stale activities to update.`); - // Create batch processor for concurrent updates - const processor = new BatchProcessor( - async (activityId: string) => { + // Process stale activities sequentially + for (let i = 0; i < staleActivityIds.length; i++) { + const activityId = staleActivityIds[i]!; + try { logger.debug(`Updating stale activity ${activityId}`); await processAndCacheActivity(activityId); - return activityId; - }, - CONCURRENT_API_CALLS, - { - onError: (error, activityId) => { - logger.error(`Error updating stale activity ${activityId}:`, error); - }, - onProgress: (completed, total) => { - if (completed % 10 === 0 || completed === total) { - logger.info(`Update progress: ${completed}/${total} (${Math.round(completed/total*100)}%)`); - } - } + } catch (error) { + logger.error(`Error updating stale activity ${activityId}:`, error); } - ); - - // Process stale activities concurrently - await processor.process(staleActivityIds); + + // Log progress every 10 activities or at completion + if ((i + 1) % 10 === 0 || i === staleActivityIds.length - 1) { + logger.info(`Update progress: ${i + 1}/${staleActivityIds.length} (${Math.round((i + 1) / staleActivityIds.length * 100)}%)`); + } + } logger.info('Stale club check finished.'); } diff --git a/test/test-concurrency.ts b/test/test-concurrency.ts deleted file mode 100644 index 1787e78..0000000 --- a/test/test-concurrency.ts +++ /dev/null @@ -1,141 +0,0 @@ -// test/test-concurrency.ts -/** - * Test script for concurrency features - * Run with: bun run test/test-concurrency.ts - */ - -import { Semaphore, executeWithConcurrency, BatchProcessor } from '../utils/semaphore'; - -// Simulate API call -function simulateApiCall(id: number, delay: number = 100): Promise<{ id: number; result: string }> { - return new Promise((resolve) => { - setTimeout(() => { - resolve({ id, result: `Result for ${id}` }); - }, delay); - }); -} - -async function testSemaphore(): Promise { - console.log('\n=== Test 1: Basic Semaphore ==='); - const semaphore = new Semaphore(3); - - const start = Date.now(); - const promises = []; - - for (let i = 1; i <= 10; i++) { - const id = i; - promises.push( - (async () => { - await semaphore.acquire(); - console.log(`[${id}] Acquired permit (available: ${semaphore.getAvailablePermits()})`); - await simulateApiCall(id, 200); - console.log(`[${id}] Releasing permit`); - semaphore.release(); - })() - ); - } - - await Promise.all(promises); - const duration = Date.now() - start; - console.log(`\n✓ Completed 10 tasks with max 3 concurrent in ${duration}ms`); - console.log(` (Sequential would take ~2000ms, parallel should be ~700-800ms)`); -} - -async function testExecuteWithConcurrency(): Promise { - console.log('\n=== Test 2: executeWithConcurrency ==='); - - const tasks = Array.from({ length: 10 }, (_, i) => () => simulateApiCall(i + 1, 100)); - - const start = Date.now(); - const results = await executeWithConcurrency(tasks, 5); - const duration = Date.now() - start; - - console.log(`✓ Completed ${results.length} tasks with max 5 concurrent in ${duration}ms`); - console.log(` Results: ${results.map(r => r.id).join(', ')}`); -} - -async function testBatchProcessor(): Promise { - console.log('\n=== Test 3: BatchProcessor ==='); - - const items = Array.from({ length: 20 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` })); - let processedCount = 0; - - const processor = new BatchProcessor( - async (item: { id: number; name: string }) => { - await simulateApiCall(item.id, 50); - processedCount++; - return { ...item, processed: true }; - }, - 4, // concurrency - { - onProgress: (completed, total) => { - if (completed % 5 === 0 || completed === total) { - console.log(` Progress: ${completed}/${total} (${Math.round(completed / total * 100)}%)`); - } - }, - onError: (error, item) => { - console.error(` Error processing item ${item.id}:`, error.message); - } - } - ); - - const start = Date.now(); - const results = await processor.process(items); - const duration = Date.now() - start; - - console.log(`✓ Processed ${results.length} items with max 4 concurrent in ${duration}ms`); - console.log(` Expected ~500ms (20 items / 4 concurrent * 50ms each)`); -} - -async function testErrorHandling(): Promise { - console.log('\n=== Test 4: Error Handling ==='); - - const items = [1, 2, 3, 4, 5]; - let errorCount = 0; - - const processor = new BatchProcessor( - async (id: number) => { - if (id % 2 === 0) { - throw new Error(`Simulated error for ${id}`); - } - return { id, success: true }; - }, - 3, - { - onError: (error, item) => { - errorCount++; - console.log(` Caught error for item ${item}: ${error.message}`); - } - } - ); - - const results = await processor.process(items); - console.log(`✓ Completed: ${results.length} success, ${errorCount} errors (errors handled gracefully)`); -} - -async function main(): Promise { - console.log('╔═══════════════════════════════════════════════════╗'); - console.log('║ Concurrency Module Test Suite ║'); - console.log('╚═══════════════════════════════════════════════════╝'); - - try { - await testSemaphore(); - await testExecuteWithConcurrency(); - await testBatchProcessor(); - await testErrorHandling(); - - console.log('\n╔═══════════════════════════════════════════════════╗'); - console.log('║ All tests passed! ✓ ║'); - console.log('╚═══════════════════════════════════════════════════╝'); - console.log('\n📝 Configuration:'); - console.log(' - Set CONCURRENT_API_CALLS in .env to control parallelism'); - console.log(' - Current default: 8 concurrent requests'); - console.log(' - Example: CONCURRENT_API_CALLS=16 for faster crawling'); - console.log(''); - } catch (error) { - console.error('\n❌ Test failed:', error); - process.exit(1); - } -} - -main(); diff --git a/utils/semaphore.ts b/utils/semaphore.ts deleted file mode 100644 index 2f05305..0000000 --- a/utils/semaphore.ts +++ /dev/null @@ -1,220 +0,0 @@ -// utils/semaphore.ts -/** - * Semaphore implementation for controlling concurrent operations - * Based on patterns from civitai/civitai and p-queue - */ - -export class Semaphore { - private capacity: number; - private permits: number; - private queue: Array<() => void> = []; - - constructor(capacity: number) { - if (capacity < 1) { - throw new Error('Semaphore capacity must be at least 1'); - } - this.capacity = capacity; - this.permits = capacity; - } - - /** - * Acquire a permit. If none available, waits until one is released. - */ - async acquire(): Promise { - return new Promise((resolve) => { - if (this.permits > 0) { - this.permits--; - resolve(); - } else { - // Queue the release callback - this.queue.push(() => { - resolve(); - }); - } - }); - } - - /** - * Release a permit and wake up a waiting task if any. - */ - release(): void { - if (this.queue.length > 0) { - const next = this.queue.shift(); - if (next) { - next(); - } - } else { - this.permits++; - } - } - - /** - * Get current available permits. - */ - getAvailablePermits(): number { - return this.permits; - } - - /** - * Get total capacity. - */ - getCapacity(): number { - return this.capacity; - } - - /** - * Get number of waiting tasks. - */ - getWaitingCount(): number { - return this.queue.length; - } -} - -/** - * Execute async tasks with concurrency limit - * @param tasks Array of async task functions - * @param concurrency Maximum concurrent tasks - * @returns Promise that resolves with all results when complete - */ -export async function executeWithConcurrency( - tasks: Array<() => Promise>, - concurrency: number -): Promise { - const semaphore = new Semaphore(concurrency); - const results: T[] = new Array(tasks.length); - - const promises = tasks.map(async (task, index) => { - await semaphore.acquire(); - try { - results[index] = await task(); - return results[index]; - } finally { - semaphore.release(); - } - }); - - return Promise.all(promises); -} - -/** - * Execute async tasks with concurrency limit and progress callback - * @param tasks Array of async task functions - * @param concurrency Maximum concurrent tasks - * @param onProgress Callback with (completed, total, result) after each task - * @returns Promise that resolves with all results when complete - */ -export async function executeWithConcurrencyAndProgress( - tasks: Array<() => Promise>, - concurrency: number, - onProgress?: (completed: number, total: number, result: T, error?: Error) => void -): Promise { - const semaphore = new Semaphore(concurrency); - const results: T[] = new Array(tasks.length); - let completed = 0; - const total = tasks.length; - - const promises = tasks.map(async (task, index) => { - await semaphore.acquire(); - try { - results[index] = await task(); - completed++; - onProgress?.(completed, total, results[index]!); - return results[index]; - } catch (error) { - completed++; - onProgress?.(completed, total, undefined as T, error as Error); - throw error; - } finally { - semaphore.release(); - } - }); - - return Promise.all(promises); -} - -/** - * Batch processor with concurrency control - * Useful for processing large arrays in chunks with controlled concurrency - */ -export class BatchProcessor { - private semaphore: Semaphore; - private processor: (item: T, index: number) => Promise; - private onError?: (error: Error, item: T, index: number) => void; - private onProgress?: (completed: number, total: number) => void; - - constructor( - processor: (item: T, index: number) => Promise, - concurrency: number, - options?: { - onError?: (error: Error, item: T, index: number) => void; - onProgress?: (completed: number, total: number) => void; - } - ) { - this.processor = processor; - this.semaphore = new Semaphore(concurrency); - this.onError = options?.onError; - this.onProgress = options?.onProgress; - } - - /** - * Process an array of items with concurrency control - * Only returns successful results, errors are handled by onError callback - */ - async process(items: T[]): Promise[]> { - const results: (Awaited | undefined)[] = new Array(items.length); - let completed = 0; - const total = items.length; - - const promises = items.map(async (item, index) => { - await this.semaphore.acquire(); - try { - const result = await this.processor(item, index); - completed++; - this.onProgress?.(completed, total); - return result; - } catch (error) { - completed++; - this.onProgress?.(completed, total); - this.onError?.(error as Error, item, index); - return undefined; - } finally { - this.semaphore.release(); - } - }); - - const allResults = await Promise.all(promises); - return allResults.filter((r): r is Awaited => r !== undefined); - } - - /** - * Process an array and return both results and errors - */ - async processWithErrors(items: T[]): Promise<{ - results: R[]; - errors: Array<{ error: Error; item: T; index: number }>; - }> { - const results: R[] = []; - const errors: Array<{ error: Error; item: T; index: number }> = []; - let completed = 0; - const total = items.length; - - const promises = items.map(async (item, index) => { - await this.semaphore.acquire(); - try { - const result = await this.processor(item, index); - results.push(result); - completed++; - this.onProgress?.(completed, total); - } catch (error) { - completed++; - this.onProgress?.(completed, total); - errors.push({ error: error as Error, item, index }); - } finally { - this.semaphore.release(); - } - }); - - await Promise.all(promises); - return { results, errors }; - } -}