refactor(scan): remove multi-thread scan logic, use sequential processing
This commit is contained in:
@@ -21,7 +21,6 @@ MAX_ACTIVITY_ID_SCAN=8000
|
||||
|
||||
# Maximum concurrent API calls during crawling (default: 8)
|
||||
# Higher values = faster crawling but more server load
|
||||
# Set to 1 for sequential processing (slow but safe)
|
||||
CONCURRENT_API_CALLS=8
|
||||
|
||||
# Request timeout in milliseconds (default: 25000 = 25 seconds)
|
||||
|
||||
@@ -14,7 +14,7 @@ import {
|
||||
import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service';
|
||||
import { extractBase64Image } from '../utils/image-processor';
|
||||
import { logger } from '../utils/logger';
|
||||
import { BatchProcessor, executeWithConcurrencyAndProgress } from '../utils/semaphore';
|
||||
|
||||
|
||||
import type { ActivityData } from '../models/activity';
|
||||
|
||||
@@ -61,8 +61,7 @@ function extractObjectKeyFromUrl(url: string): string | null {
|
||||
}
|
||||
}
|
||||
|
||||
// Crawler concurrency configuration
|
||||
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10);
|
||||
// Crawler configuration
|
||||
const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10);
|
||||
const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10);
|
||||
const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10);
|
||||
@@ -155,11 +154,10 @@ async function processSingleActivity(activityId: string): Promise<void> {
|
||||
|
||||
/**
|
||||
* Initialize the club cache by scanning through all activity IDs
|
||||
* Processed concurrently with controlled parallelism
|
||||
* Processed sequentially
|
||||
*/
|
||||
export async function initializeClubCache(): Promise<void> {
|
||||
logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`);
|
||||
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
|
||||
|
||||
const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1;
|
||||
let successCount = 0;
|
||||
@@ -172,30 +170,23 @@ export async function initializeClubCache(): Promise<void> {
|
||||
(_, i) => String(MIN_ACTIVITY_ID_SCAN + i)
|
||||
);
|
||||
|
||||
// Create batch processor with concurrency control
|
||||
const processor = new BatchProcessor(
|
||||
async (activityId: string) => {
|
||||
// Process all activities sequentially
|
||||
for (let i = 0; i < activityIds.length; i++) {
|
||||
const activityId = activityIds[i]!;
|
||||
try {
|
||||
await processSingleActivity(activityId);
|
||||
return activityId;
|
||||
},
|
||||
CONCURRENT_API_CALLS,
|
||||
{
|
||||
onError: (error, activityId) => {
|
||||
successCount++;
|
||||
} catch (error) {
|
||||
errorCount++;
|
||||
logger.error(`Error processing activity ID ${activityId}:`, error);
|
||||
},
|
||||
onProgress: (completed, total) => {
|
||||
if (completed % 100 === 0 || completed === total) {
|
||||
const mem = process.memoryUsage();
|
||||
logger.info(`Progress: ${completed}/${total} (${Math.round(completed/total*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB | Concurrent: ${CONCURRENT_API_CALLS}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// Process all activities concurrently
|
||||
const results = await processor.process(activityIds);
|
||||
successCount = results.length;
|
||||
// Log progress every 100 activities or at completion
|
||||
if ((i + 1) % 100 === 0 || i === activityIds.length - 1) {
|
||||
const mem = process.memoryUsage();
|
||||
logger.info(`Progress: ${i + 1}/${totalIds} (${Math.round((i + 1) / totalIds * 100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed / 1024 / 1024)}MB`);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(`Initial club cache population finished.`);
|
||||
logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
|
||||
@@ -203,11 +194,10 @@ export async function initializeClubCache(): Promise<void> {
|
||||
|
||||
/**
|
||||
* Update stale clubs in the cache
|
||||
* Processed concurrently with controlled parallelism
|
||||
* Processed sequentially
|
||||
*/
|
||||
export async function updateStaleClubs(): Promise<void> {
|
||||
logger.info('Starting stale club check...');
|
||||
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
|
||||
const now = Date.now();
|
||||
const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000;
|
||||
const activityKeys = await getAllActivityKeys();
|
||||
@@ -237,28 +227,21 @@ export async function updateStaleClubs(): Promise<void> {
|
||||
|
||||
logger.info(`Found ${staleActivityIds.length} stale activities to update.`);
|
||||
|
||||
// Create batch processor for concurrent updates
|
||||
const processor = new BatchProcessor(
|
||||
async (activityId: string) => {
|
||||
// Process stale activities sequentially
|
||||
for (let i = 0; i < staleActivityIds.length; i++) {
|
||||
const activityId = staleActivityIds[i]!;
|
||||
try {
|
||||
logger.debug(`Updating stale activity ${activityId}`);
|
||||
await processAndCacheActivity(activityId);
|
||||
return activityId;
|
||||
},
|
||||
CONCURRENT_API_CALLS,
|
||||
{
|
||||
onError: (error, activityId) => {
|
||||
} catch (error) {
|
||||
logger.error(`Error updating stale activity ${activityId}:`, error);
|
||||
},
|
||||
onProgress: (completed, total) => {
|
||||
if (completed % 10 === 0 || completed === total) {
|
||||
logger.info(`Update progress: ${completed}/${total} (${Math.round(completed/total*100)}%)`);
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// Process stale activities concurrently
|
||||
await processor.process(staleActivityIds);
|
||||
// Log progress every 10 activities or at completion
|
||||
if ((i + 1) % 10 === 0 || i === staleActivityIds.length - 1) {
|
||||
logger.info(`Update progress: ${i + 1}/${staleActivityIds.length} (${Math.round((i + 1) / staleActivityIds.length * 100)}%)`);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info('Stale club check finished.');
|
||||
}
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
// test/test-concurrency.ts
|
||||
/**
|
||||
* Test script for concurrency features
|
||||
* Run with: bun run test/test-concurrency.ts
|
||||
*/
|
||||
|
||||
import { Semaphore, executeWithConcurrency, BatchProcessor } from '../utils/semaphore';
|
||||
|
||||
// Simulate API call
|
||||
function simulateApiCall(id: number, delay: number = 100): Promise<{ id: number; result: string }> {
|
||||
return new Promise((resolve) => {
|
||||
setTimeout(() => {
|
||||
resolve({ id, result: `Result for ${id}` });
|
||||
}, delay);
|
||||
});
|
||||
}
|
||||
|
||||
async function testSemaphore(): Promise<void> {
|
||||
console.log('\n=== Test 1: Basic Semaphore ===');
|
||||
const semaphore = new Semaphore(3);
|
||||
|
||||
const start = Date.now();
|
||||
const promises = [];
|
||||
|
||||
for (let i = 1; i <= 10; i++) {
|
||||
const id = i;
|
||||
promises.push(
|
||||
(async () => {
|
||||
await semaphore.acquire();
|
||||
console.log(`[${id}] Acquired permit (available: ${semaphore.getAvailablePermits()})`);
|
||||
await simulateApiCall(id, 200);
|
||||
console.log(`[${id}] Releasing permit`);
|
||||
semaphore.release();
|
||||
})()
|
||||
);
|
||||
}
|
||||
|
||||
await Promise.all(promises);
|
||||
const duration = Date.now() - start;
|
||||
console.log(`\n✓ Completed 10 tasks with max 3 concurrent in ${duration}ms`);
|
||||
console.log(` (Sequential would take ~2000ms, parallel should be ~700-800ms)`);
|
||||
}
|
||||
|
||||
async function testExecuteWithConcurrency(): Promise<void> {
|
||||
console.log('\n=== Test 2: executeWithConcurrency ===');
|
||||
|
||||
const tasks = Array.from({ length: 10 }, (_, i) => () => simulateApiCall(i + 1, 100));
|
||||
|
||||
const start = Date.now();
|
||||
const results = await executeWithConcurrency(tasks, 5);
|
||||
const duration = Date.now() - start;
|
||||
|
||||
console.log(`✓ Completed ${results.length} tasks with max 5 concurrent in ${duration}ms`);
|
||||
console.log(` Results: ${results.map(r => r.id).join(', ')}`);
|
||||
}
|
||||
|
||||
async function testBatchProcessor(): Promise<void> {
|
||||
console.log('\n=== Test 3: BatchProcessor ===');
|
||||
|
||||
const items = Array.from({ length: 20 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` }));
|
||||
let processedCount = 0;
|
||||
|
||||
const processor = new BatchProcessor(
|
||||
async (item: { id: number; name: string }) => {
|
||||
await simulateApiCall(item.id, 50);
|
||||
processedCount++;
|
||||
return { ...item, processed: true };
|
||||
},
|
||||
4, // concurrency
|
||||
{
|
||||
onProgress: (completed, total) => {
|
||||
if (completed % 5 === 0 || completed === total) {
|
||||
console.log(` Progress: ${completed}/${total} (${Math.round(completed / total * 100)}%)`);
|
||||
}
|
||||
},
|
||||
onError: (error, item) => {
|
||||
console.error(` Error processing item ${item.id}:`, error.message);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
const start = Date.now();
|
||||
const results = await processor.process(items);
|
||||
const duration = Date.now() - start;
|
||||
|
||||
console.log(`✓ Processed ${results.length} items with max 4 concurrent in ${duration}ms`);
|
||||
console.log(` Expected ~500ms (20 items / 4 concurrent * 50ms each)`);
|
||||
}
|
||||
|
||||
async function testErrorHandling(): Promise<void> {
|
||||
console.log('\n=== Test 4: Error Handling ===');
|
||||
|
||||
const items = [1, 2, 3, 4, 5];
|
||||
let errorCount = 0;
|
||||
|
||||
const processor = new BatchProcessor(
|
||||
async (id: number) => {
|
||||
if (id % 2 === 0) {
|
||||
throw new Error(`Simulated error for ${id}`);
|
||||
}
|
||||
return { id, success: true };
|
||||
},
|
||||
3,
|
||||
{
|
||||
onError: (error, item) => {
|
||||
errorCount++;
|
||||
console.log(` Caught error for item ${item}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
const results = await processor.process(items);
|
||||
console.log(`✓ Completed: ${results.length} success, ${errorCount} errors (errors handled gracefully)`);
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
console.log('╔═══════════════════════════════════════════════════╗');
|
||||
console.log('║ Concurrency Module Test Suite ║');
|
||||
console.log('╚═══════════════════════════════════════════════════╝');
|
||||
|
||||
try {
|
||||
await testSemaphore();
|
||||
await testExecuteWithConcurrency();
|
||||
await testBatchProcessor();
|
||||
await testErrorHandling();
|
||||
|
||||
console.log('\n╔═══════════════════════════════════════════════════╗');
|
||||
console.log('║ All tests passed! ✓ ║');
|
||||
console.log('╚═══════════════════════════════════════════════════╝');
|
||||
console.log('\n📝 Configuration:');
|
||||
console.log(' - Set CONCURRENT_API_CALLS in .env to control parallelism');
|
||||
console.log(' - Current default: 8 concurrent requests');
|
||||
console.log(' - Example: CONCURRENT_API_CALLS=16 for faster crawling');
|
||||
console.log('');
|
||||
} catch (error) {
|
||||
console.error('\n❌ Test failed:', error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
main();
|
||||
@@ -1,220 +0,0 @@
|
||||
// utils/semaphore.ts
|
||||
/**
|
||||
* Semaphore implementation for controlling concurrent operations
|
||||
* Based on patterns from civitai/civitai and p-queue
|
||||
*/
|
||||
|
||||
export class Semaphore {
|
||||
private capacity: number;
|
||||
private permits: number;
|
||||
private queue: Array<() => void> = [];
|
||||
|
||||
constructor(capacity: number) {
|
||||
if (capacity < 1) {
|
||||
throw new Error('Semaphore capacity must be at least 1');
|
||||
}
|
||||
this.capacity = capacity;
|
||||
this.permits = capacity;
|
||||
}
|
||||
|
||||
/**
|
||||
* Acquire a permit. If none available, waits until one is released.
|
||||
*/
|
||||
async acquire(): Promise<void> {
|
||||
return new Promise<void>((resolve) => {
|
||||
if (this.permits > 0) {
|
||||
this.permits--;
|
||||
resolve();
|
||||
} else {
|
||||
// Queue the release callback
|
||||
this.queue.push(() => {
|
||||
resolve();
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Release a permit and wake up a waiting task if any.
|
||||
*/
|
||||
release(): void {
|
||||
if (this.queue.length > 0) {
|
||||
const next = this.queue.shift();
|
||||
if (next) {
|
||||
next();
|
||||
}
|
||||
} else {
|
||||
this.permits++;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current available permits.
|
||||
*/
|
||||
getAvailablePermits(): number {
|
||||
return this.permits;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get total capacity.
|
||||
*/
|
||||
getCapacity(): number {
|
||||
return this.capacity;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get number of waiting tasks.
|
||||
*/
|
||||
getWaitingCount(): number {
|
||||
return this.queue.length;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute async tasks with concurrency limit
|
||||
* @param tasks Array of async task functions
|
||||
* @param concurrency Maximum concurrent tasks
|
||||
* @returns Promise that resolves with all results when complete
|
||||
*/
|
||||
export async function executeWithConcurrency<T>(
|
||||
tasks: Array<() => Promise<T>>,
|
||||
concurrency: number
|
||||
): Promise<T[]> {
|
||||
const semaphore = new Semaphore(concurrency);
|
||||
const results: T[] = new Array(tasks.length);
|
||||
|
||||
const promises = tasks.map(async (task, index) => {
|
||||
await semaphore.acquire();
|
||||
try {
|
||||
results[index] = await task();
|
||||
return results[index];
|
||||
} finally {
|
||||
semaphore.release();
|
||||
}
|
||||
});
|
||||
|
||||
return Promise.all(promises);
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute async tasks with concurrency limit and progress callback
|
||||
* @param tasks Array of async task functions
|
||||
* @param concurrency Maximum concurrent tasks
|
||||
* @param onProgress Callback with (completed, total, result) after each task
|
||||
* @returns Promise that resolves with all results when complete
|
||||
*/
|
||||
export async function executeWithConcurrencyAndProgress<T>(
|
||||
tasks: Array<() => Promise<T>>,
|
||||
concurrency: number,
|
||||
onProgress?: (completed: number, total: number, result: T, error?: Error) => void
|
||||
): Promise<T[]> {
|
||||
const semaphore = new Semaphore(concurrency);
|
||||
const results: T[] = new Array(tasks.length);
|
||||
let completed = 0;
|
||||
const total = tasks.length;
|
||||
|
||||
const promises = tasks.map(async (task, index) => {
|
||||
await semaphore.acquire();
|
||||
try {
|
||||
results[index] = await task();
|
||||
completed++;
|
||||
onProgress?.(completed, total, results[index]!);
|
||||
return results[index];
|
||||
} catch (error) {
|
||||
completed++;
|
||||
onProgress?.(completed, total, undefined as T, error as Error);
|
||||
throw error;
|
||||
} finally {
|
||||
semaphore.release();
|
||||
}
|
||||
});
|
||||
|
||||
return Promise.all(promises);
|
||||
}
|
||||
|
||||
/**
|
||||
* Batch processor with concurrency control
|
||||
* Useful for processing large arrays in chunks with controlled concurrency
|
||||
*/
|
||||
export class BatchProcessor<T, R> {
|
||||
private semaphore: Semaphore;
|
||||
private processor: (item: T, index: number) => Promise<R>;
|
||||
private onError?: (error: Error, item: T, index: number) => void;
|
||||
private onProgress?: (completed: number, total: number) => void;
|
||||
|
||||
constructor(
|
||||
processor: (item: T, index: number) => Promise<R>,
|
||||
concurrency: number,
|
||||
options?: {
|
||||
onError?: (error: Error, item: T, index: number) => void;
|
||||
onProgress?: (completed: number, total: number) => void;
|
||||
}
|
||||
) {
|
||||
this.processor = processor;
|
||||
this.semaphore = new Semaphore(concurrency);
|
||||
this.onError = options?.onError;
|
||||
this.onProgress = options?.onProgress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process an array of items with concurrency control
|
||||
* Only returns successful results, errors are handled by onError callback
|
||||
*/
|
||||
async process(items: T[]): Promise<Awaited<R>[]> {
|
||||
const results: (Awaited<R> | undefined)[] = new Array(items.length);
|
||||
let completed = 0;
|
||||
const total = items.length;
|
||||
|
||||
const promises = items.map(async (item, index) => {
|
||||
await this.semaphore.acquire();
|
||||
try {
|
||||
const result = await this.processor(item, index);
|
||||
completed++;
|
||||
this.onProgress?.(completed, total);
|
||||
return result;
|
||||
} catch (error) {
|
||||
completed++;
|
||||
this.onProgress?.(completed, total);
|
||||
this.onError?.(error as Error, item, index);
|
||||
return undefined;
|
||||
} finally {
|
||||
this.semaphore.release();
|
||||
}
|
||||
});
|
||||
|
||||
const allResults = await Promise.all(promises);
|
||||
return allResults.filter((r): r is Awaited<R> => r !== undefined);
|
||||
}
|
||||
|
||||
/**
|
||||
* Process an array and return both results and errors
|
||||
*/
|
||||
async processWithErrors(items: T[]): Promise<{
|
||||
results: R[];
|
||||
errors: Array<{ error: Error; item: T; index: number }>;
|
||||
}> {
|
||||
const results: R[] = [];
|
||||
const errors: Array<{ error: Error; item: T; index: number }> = [];
|
||||
let completed = 0;
|
||||
const total = items.length;
|
||||
|
||||
const promises = items.map(async (item, index) => {
|
||||
await this.semaphore.acquire();
|
||||
try {
|
||||
const result = await this.processor(item, index);
|
||||
results.push(result);
|
||||
completed++;
|
||||
this.onProgress?.(completed, total);
|
||||
} catch (error) {
|
||||
completed++;
|
||||
this.onProgress?.(completed, total);
|
||||
errors.push({ error: error as Error, item, index });
|
||||
} finally {
|
||||
this.semaphore.release();
|
||||
}
|
||||
});
|
||||
|
||||
await Promise.all(promises);
|
||||
return { results, errors };
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user