Compare commits

...

2 Commits

Author SHA1 Message Date
JamesFlare1212
5fb60b069f fix(cache): preserve local data on remote 5xx errors
Only update cache when remote returns HTTP 200. On 5xx errors or timeouts,
preserve existing local cache instead of overwriting with empty/error data.
2026-04-08 12:07:51 -04:00
JamesFlare1212
fb68c1ad5d refactor(scan): remove multi-thread scan logic, use sequential processing 2026-04-08 12:04:27 -04:00
5 changed files with 77 additions and 423 deletions

View File

@@ -45,6 +45,7 @@ async function getCompleteCookies(userName: string, userPwd: string): Promise<st
/** /**
* Get activity details from API * Get activity details from API
* Only returns data on HTTP 200. Returns null on any error (5xx, timeout, etc.)
*/ */
async function getActivityDetailsRaw( async function getActivityDetailsRaw(
activityId: string, activityId: string,
@@ -73,6 +74,16 @@ async function getActivityDetailsRaw(
// Add additional timeout safety // Add additional timeout safety
maxRedirects: 5 maxRedirects: 5
}); });
// CRITICAL: Only accept HTTP 200. Reject all other status codes including 5xx
if (response.status !== 200) {
logger.error(`Non-200 status ${response.status} for activity ${activityId}. NOT updating cache to preserve local data.`);
if (attempt === maxRetries - 1) {
logger.error(`All ${maxRetries} retries failed with non-200 status for activity ${activityId}.`);
}
return null;
}
logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`); logger.debug(`Attempt ${attempt + 1}/${maxRetries} for activity ${activityId} - Received response status ${response.status}`);
const outerData = JSON.parse(response.data); const outerData = JSON.parse(response.data);
if (outerData && typeof outerData.d === 'string') { if (outerData && typeof outerData.d === 'string') {
@@ -88,7 +99,7 @@ async function getActivityDetailsRaw(
} catch (error: any) { } catch (error: any) {
// Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors // Only treat 401 (Unauthorized) and 403 (Forbidden) as authentication errors
// 404 (Not Found) is valid - activity doesn't exist // 404 (Not Found) is valid - activity doesn't exist
// Other 4xx errors should not trigger re-authentication // Other 4xx/5xx errors should not trigger re-authentication
if (error.response && (error.response.status === 401 || error.response.status === 403)) { if (error.response && (error.response.status === 401 || error.response.status === 403)) {
logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`); logger.warn(`Authentication error (${error.response.status}) while fetching activity ${activityId}. Cookie may be invalid.`);
throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status); throw new AuthenticationError(`Received ${error.response.status} for activity ${activityId}`, error.response.status);
@@ -97,6 +108,10 @@ async function getActivityDetailsRaw(
if (error.response) { if (error.response) {
logger.error(`Status: ${error.response.status}, Data (getActivityDetailsRaw): ${ String(error.response.data).slice(0,100)}...`); logger.error(`Status: ${error.response.status}, Data (getActivityDetailsRaw): ${ String(error.response.data).slice(0,100)}...`);
// CRITICAL: 5xx errors should NOT update cache
if (error.response.status >= 500 && error.response.status < 600) {
logger.error(`Server error ${error.response.status} - preserving local cache, not updating.`);
}
} }
if (attempt === maxRetries - 1) { if (attempt === maxRetries - 1) {
logger.error(`All ${maxRetries} retries failed for activity ${activityId}.`); logger.error(`All ${maxRetries} retries failed for activity ${activityId}.`);

View File

@@ -21,7 +21,6 @@ MAX_ACTIVITY_ID_SCAN=8000
# Maximum concurrent API calls during crawling (default: 8) # Maximum concurrent API calls during crawling (default: 8)
# Higher values = faster crawling but more server load # Higher values = faster crawling but more server load
# Set to 1 for sequential processing (slow but safe)
CONCURRENT_API_CALLS=8 CONCURRENT_API_CALLS=8
# Request timeout in milliseconds (default: 25000 = 25 seconds) # Request timeout in milliseconds (default: 25000 = 25 seconds)

View File

@@ -14,7 +14,7 @@ import {
import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service'; import { uploadImageFromBase64, listS3Objects, constructS3Url } from './s3-service';
import { extractBase64Image } from '../utils/image-processor'; import { extractBase64Image } from '../utils/image-processor';
import { logger } from '../utils/logger'; import { logger } from '../utils/logger';
import { BatchProcessor, executeWithConcurrencyAndProgress } from '../utils/semaphore';
import type { ActivityData } from '../models/activity'; import type { ActivityData } from '../models/activity';
@@ -61,8 +61,7 @@ function extractObjectKeyFromUrl(url: string): string | null {
} }
} }
// Crawler concurrency configuration // Crawler configuration
const CONCURRENT_API_CALLS = parseInt(process.env.CONCURRENT_API_CALLS || '8', 10);
const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10); const CRAWLER_REQUEST_TIMEOUT_MS = parseInt(process.env.CRAWLER_REQUEST_TIMEOUT_MS || '25000', 10);
const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10); const CRAWLER_MAX_RETRIES = parseInt(process.env.CRAWLER_MAX_RETRIES || '3', 10);
const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10); const CRAWLER_RETRY_DELAY_MS = parseInt(process.env.CRAWLER_RETRY_DELAY_MS || '1000', 10);
@@ -73,9 +72,10 @@ let skippedCount = 0;
/** /**
* Process and cache a single activity * Process and cache a single activity
* @param activityId - The activity ID to process * @param activityId - The activity ID to process
* @param forceUpdate - If true, update cache even on fetch failure (default: false)
* @returns The processed activity data * @returns The processed activity data
*/ */
async function processAndCacheActivity(activityId: string): Promise<ActivityData> { async function processAndCacheActivity(activityId: string, forceUpdate: boolean = false): Promise<ActivityData> {
logger.debug(`Processing activity ID: ${activityId}`); logger.debug(`Processing activity ID: ${activityId}`);
try { try {
if (!USERNAME || !PASSWORD) { if (!USERNAME || !PASSWORD) {
@@ -93,11 +93,21 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
let structuredActivity: ActivityData; let structuredActivity: ActivityData;
if (!activityJson) { if (!activityJson) {
logger.info(`No data found for activity ID ${activityId} from engage API. Caching as empty.`); // CRITICAL: Only cache empty data if forceUpdate is true
// This prevents 5xx errors from overwriting valid local data
if (forceUpdate) {
logger.info(`No data found for activity ID ${activityId} from engage API. Force updating cache.`);
structuredActivity = { structuredActivity = {
lastCheck: new Date().toISOString(), lastCheck: new Date().toISOString(),
source: 'api-fetch-empty' source: 'api-fetch-empty'
}; };
await setActivityData(activityId, structuredActivity);
return structuredActivity;
} else {
logger.warn(`No data for activity ${activityId}. Preserving existing cache - NOT updating.`);
const existingData = await getActivityData(activityId);
return existingData || { lastCheck: new Date().toISOString(), source: 'cache-preserved' };
}
} else { } else {
structuredActivity = await structActivityData(activityJson); structuredActivity = await structActivityData(activityJson);
if (structuredActivity && structuredActivity.photo && if (structuredActivity && structuredActivity.photo &&
@@ -125,12 +135,19 @@ async function processAndCacheActivity(activityId: string): Promise<ActivityData
return structuredActivity; return structuredActivity;
} catch (error) { } catch (error) {
logger.error(`Error processing activity ID ${activityId}:`, error); logger.error(`Error processing activity ID ${activityId}:`, error);
// CRITICAL: On error, preserve existing cache instead of overwriting with error data
if (forceUpdate) {
const errorData: ActivityData = { const errorData: ActivityData = {
lastCheck: new Date().toISOString(), lastCheck: new Date().toISOString(),
error: "Failed to fetch or process" error: "Failed to fetch or process"
}; };
await setActivityData(activityId, errorData); await setActivityData(activityId, errorData);
return errorData; return errorData;
} else {
logger.warn(`Error fetching activity ${activityId}. Preserving existing cache.`);
const existingData = await getActivityData(activityId);
return existingData || { lastCheck: new Date().toISOString(), error: (error as Error).message };
}
} }
} }
@@ -155,11 +172,10 @@ async function processSingleActivity(activityId: string): Promise<void> {
/** /**
* Initialize the club cache by scanning through all activity IDs * Initialize the club cache by scanning through all activity IDs
* Processed concurrently with controlled parallelism * Processed sequentially
*/ */
export async function initializeClubCache(): Promise<void> { export async function initializeClubCache(): Promise<void> {
logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`); logger.info(`Starting initial club cache population from ID ${MIN_ACTIVITY_ID_SCAN} to ${MAX_ACTIVITY_ID_SCAN}`);
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1; const totalIds = MAX_ACTIVITY_ID_SCAN - MIN_ACTIVITY_ID_SCAN + 1;
let successCount = 0; let successCount = 0;
@@ -172,30 +188,23 @@ export async function initializeClubCache(): Promise<void> {
(_, i) => String(MIN_ACTIVITY_ID_SCAN + i) (_, i) => String(MIN_ACTIVITY_ID_SCAN + i)
); );
// Create batch processor with concurrency control // Process all activities sequentially
const processor = new BatchProcessor( for (let i = 0; i < activityIds.length; i++) {
async (activityId: string) => { const activityId = activityIds[i]!;
try {
await processSingleActivity(activityId); await processSingleActivity(activityId);
return activityId; successCount++;
}, } catch (error) {
CONCURRENT_API_CALLS,
{
onError: (error, activityId) => {
errorCount++; errorCount++;
logger.error(`Error processing activity ID ${activityId}:`, error); logger.error(`Error processing activity ID ${activityId}:`, error);
},
onProgress: (completed, total) => {
if (completed % 100 === 0 || completed === total) {
const mem = process.memoryUsage();
logger.info(`Progress: ${completed}/${total} (${Math.round(completed/total*100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed/1024/1024)}MB | Concurrent: ${CONCURRENT_API_CALLS}`);
} }
}
}
);
// Process all activities concurrently // Log progress every 100 activities or at completion
const results = await processor.process(activityIds); if ((i + 1) % 100 === 0 || i === activityIds.length - 1) {
successCount = results.length; const mem = process.memoryUsage();
logger.info(`Progress: ${i + 1}/${totalIds} (${Math.round((i + 1) / totalIds * 100)}%) - Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount} | Heap: ${Math.round(mem.heapUsed / 1024 / 1024)}MB`);
}
}
logger.info(`Initial club cache population finished.`); logger.info(`Initial club cache population finished.`);
logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`); logger.info(`Summary: Total: ${totalIds}, Processed: ${activityIds.length}, Success: ${successCount}, Skipped: ${skippedCount}, Errors: ${errorCount}`);
@@ -203,11 +212,10 @@ export async function initializeClubCache(): Promise<void> {
/** /**
* Update stale clubs in the cache * Update stale clubs in the cache
* Processed concurrently with controlled parallelism * Processed sequentially
*/ */
export async function updateStaleClubs(): Promise<void> { export async function updateStaleClubs(): Promise<void> {
logger.info('Starting stale club check...'); logger.info('Starting stale club check...');
logger.info(`Concurrency: ${CONCURRENT_API_CALLS} parallel requests`);
const now = Date.now(); const now = Date.now();
const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000; const updateIntervalMs = CLUB_UPDATE_INTERVAL_MINS * 60 * 1000;
const activityKeys = await getAllActivityKeys(); const activityKeys = await getAllActivityKeys();
@@ -237,28 +245,21 @@ export async function updateStaleClubs(): Promise<void> {
logger.info(`Found ${staleActivityIds.length} stale activities to update.`); logger.info(`Found ${staleActivityIds.length} stale activities to update.`);
// Create batch processor for concurrent updates // Process stale activities sequentially
const processor = new BatchProcessor( for (let i = 0; i < staleActivityIds.length; i++) {
async (activityId: string) => { const activityId = staleActivityIds[i]!;
try {
logger.debug(`Updating stale activity ${activityId}`); logger.debug(`Updating stale activity ${activityId}`);
await processAndCacheActivity(activityId); await processAndCacheActivity(activityId);
return activityId; } catch (error) {
},
CONCURRENT_API_CALLS,
{
onError: (error, activityId) => {
logger.error(`Error updating stale activity ${activityId}:`, error); logger.error(`Error updating stale activity ${activityId}:`, error);
},
onProgress: (completed, total) => {
if (completed % 10 === 0 || completed === total) {
logger.info(`Update progress: ${completed}/${total} (${Math.round(completed/total*100)}%)`);
} }
}
}
);
// Process stale activities concurrently // Log progress every 10 activities or at completion
await processor.process(staleActivityIds); if ((i + 1) % 10 === 0 || i === staleActivityIds.length - 1) {
logger.info(`Update progress: ${i + 1}/${staleActivityIds.length} (${Math.round((i + 1) / staleActivityIds.length * 100)}%)`);
}
}
logger.info('Stale club check finished.'); logger.info('Stale club check finished.');
} }

View File

@@ -1,141 +0,0 @@
// test/test-concurrency.ts
/**
* Test script for concurrency features
* Run with: bun run test/test-concurrency.ts
*/
import { Semaphore, executeWithConcurrency, BatchProcessor } from '../utils/semaphore';
// Simulate an API call: resolves with a canned payload after `delay` ms.
function simulateApiCall(id: number, delay: number = 100): Promise<{ id: number; result: string }> {
  return new Promise((resolve) => {
    setTimeout(() => resolve({ id, result: `Result for ${id}` }), delay);
  });
}
/**
 * Test 1: basic Semaphore behavior.
 * Launches 10 tasks, each holding a permit across a 200ms simulated call;
 * a capacity of 3 should bound the observed concurrency, so total wall time
 * lands near ceil(10/3) * 200ms rather than the 2000ms sequential cost.
 */
async function testSemaphore(): Promise<void> {
  console.log('\n=== Test 1: Basic Semaphore ===');
  const semaphore = new Semaphore(3);
  const start = Date.now();
  const promises = [];
  for (let i = 1; i <= 10; i++) {
    // Capture the loop value so each IIFE logs its own id.
    const id = i;
    promises.push(
      (async () => {
        await semaphore.acquire();
        console.log(`[${id}] Acquired permit (available: ${semaphore.getAvailablePermits()})`);
        await simulateApiCall(id, 200);
        console.log(`[${id}] Releasing permit`);
        semaphore.release();
      })()
    );
  }
  await Promise.all(promises);
  const duration = Date.now() - start;
  console.log(`\n✓ Completed 10 tasks with max 3 concurrent in ${duration}ms`);
  console.log(` (Sequential would take ~2000ms, parallel should be ~700-800ms)`);
}
/**
 * Test 2: executeWithConcurrency helper.
 * Runs 10 simulated 100ms calls with a limit of 5 and prints the ordered
 * result ids (results are returned in input order, not completion order).
 */
async function testExecuteWithConcurrency(): Promise<void> {
  console.log('\n=== Test 2: executeWithConcurrency ===');
  const tasks = Array.from({ length: 10 }, (_, i) => () => simulateApiCall(i + 1, 100));
  const start = Date.now();
  const results = await executeWithConcurrency(tasks, 5);
  const duration = Date.now() - start;
  console.log(`✓ Completed ${results.length} tasks with max 5 concurrent in ${duration}ms`);
  console.log(` Results: ${results.map(r => r.id).join(', ')}`);
}
/**
 * Test 3: BatchProcessor over 20 items with concurrency 4.
 * Exercises the onProgress callback (logged every 5 completions and at the
 * end) and checks total wall time is roughly (20 / 4) * 50ms.
 */
async function testBatchProcessor(): Promise<void> {
  console.log('\n=== Test 3: BatchProcessor ===');
  const items = Array.from({ length: 20 }, (_, i) => ({ id: i + 1, name: `Item ${i + 1}` }));
  // Incremented per processed item; retained as a side-effect counter even
  // though this test only reports results.length.
  let processedCount = 0;
  const processor = new BatchProcessor(
    async (item: { id: number; name: string }) => {
      await simulateApiCall(item.id, 50);
      processedCount++;
      return { ...item, processed: true };
    },
    4, // concurrency
    {
      onProgress: (completed, total) => {
        if (completed % 5 === 0 || completed === total) {
          console.log(` Progress: ${completed}/${total} (${Math.round(completed / total * 100)}%)`);
        }
      },
      onError: (error, item) => {
        console.error(` Error processing item ${item.id}:`, error.message);
      }
    }
  );
  const start = Date.now();
  const results = await processor.process(items);
  const duration = Date.now() - start;
  console.log(`✓ Processed ${results.length} items with max 4 concurrent in ${duration}ms`);
  console.log(` Expected ~500ms (20 items / 4 concurrent * 50ms each)`);
}
/**
 * Test 4: BatchProcessor error handling.
 * Even-numbered items deliberately throw; the onError callback should
 * absorb them so process() still resolves with only the odd (successful)
 * results instead of rejecting.
 */
async function testErrorHandling(): Promise<void> {
  console.log('\n=== Test 4: Error Handling ===');
  const items = [1, 2, 3, 4, 5];
  let errorCount = 0;
  const processor = new BatchProcessor(
    async (id: number) => {
      if (id % 2 === 0) {
        throw new Error(`Simulated error for ${id}`);
      }
      return { id, success: true };
    },
    3,
    {
      onError: (error, item) => {
        errorCount++;
        console.log(` Caught error for item ${item}: ${error.message}`);
      }
    }
  );
  const results = await processor.process(items);
  console.log(`✓ Completed: ${results.length} success, ${errorCount} errors (errors handled gracefully)`);
}
/**
 * Entry point: runs the four concurrency test scenarios in sequence and
 * prints configuration hints. Exits with code 1 if any scenario throws.
 */
async function main(): Promise<void> {
  console.log('╔═══════════════════════════════════════════════════╗');
  console.log('║ Concurrency Module Test Suite ║');
  console.log('╚═══════════════════════════════════════════════════╝');
  try {
    await testSemaphore();
    await testExecuteWithConcurrency();
    await testBatchProcessor();
    await testErrorHandling();
    console.log('\n╔═══════════════════════════════════════════════════╗');
    console.log('║ All tests passed! ✓ ║');
    console.log('╚═══════════════════════════════════════════════════╝');
    console.log('\n📝 Configuration:');
    console.log(' - Set CONCURRENT_API_CALLS in .env to control parallelism');
    console.log(' - Current default: 8 concurrent requests');
    console.log(' - Example: CONCURRENT_API_CALLS=16 for faster crawling');
    console.log('');
  } catch (error) {
    // Non-zero exit so CI/scripts can detect a failed suite.
    console.error('\n❌ Test failed:', error);
    process.exit(1);
  }
}
main();

View File

@@ -1,220 +0,0 @@
// utils/semaphore.ts
/**
 * Semaphore implementation for controlling concurrent operations.
 * At most `capacity` callers may hold a permit at once; further acquirers
 * wait (FIFO) until a permit is released.
 * Based on patterns from civitai/civitai and p-queue.
 */
export class Semaphore {
  private readonly capacity: number;
  private permits: number;
  // FIFO queue of resolvers for callers waiting on acquire().
  private queue: Array<() => void> = [];

  /**
   * @param capacity Maximum number of concurrent permit holders.
   * @throws Error if capacity is less than 1.
   */
  constructor(capacity: number) {
    if (capacity < 1) {
      throw new Error('Semaphore capacity must be at least 1');
    }
    this.capacity = capacity;
    this.permits = capacity;
  }

  /**
   * Acquire a permit. If none available, waits until one is released.
   */
  async acquire(): Promise<void> {
    return new Promise<void>((resolve) => {
      if (this.permits > 0) {
        this.permits--;
        resolve();
      } else {
        // No free permit: park the resolver until release() hands one over.
        this.queue.push(() => {
          resolve();
        });
      }
    });
  }

  /**
   * Release a permit. If a task is waiting, the permit is handed directly
   * to it (the permit count stays at 0); otherwise the permit is returned
   * to the pool.
   * @throws Error on an unbalanced release (more release() than acquire()
   *         calls). Previously this silently grew `permits` past `capacity`,
   *         quietly raising the effective concurrency limit.
   */
  release(): void {
    if (this.queue.length > 0) {
      const next = this.queue.shift();
      if (next) {
        next();
      }
    } else {
      // FIX: guard against over-release inflating the permit pool.
      if (this.permits >= this.capacity) {
        throw new Error('Semaphore released more times than acquired');
      }
      this.permits++;
    }
  }

  /**
   * Get current available permits.
   */
  getAvailablePermits(): number {
    return this.permits;
  }

  /**
   * Get total capacity.
   */
  getCapacity(): number {
    return this.capacity;
  }

  /**
   * Get number of waiting tasks.
   */
  getWaitingCount(): number {
    return this.queue.length;
  }
}
/**
* Execute async tasks with concurrency limit
* @param tasks Array of async task functions
* @param concurrency Maximum concurrent tasks
* @returns Promise that resolves with all results when complete
*/
export async function executeWithConcurrency<T>(
tasks: Array<() => Promise<T>>,
concurrency: number
): Promise<T[]> {
const semaphore = new Semaphore(concurrency);
const results: T[] = new Array(tasks.length);
const promises = tasks.map(async (task, index) => {
await semaphore.acquire();
try {
results[index] = await task();
return results[index];
} finally {
semaphore.release();
}
});
return Promise.all(promises);
}
/**
* Execute async tasks with concurrency limit and progress callback
* @param tasks Array of async task functions
* @param concurrency Maximum concurrent tasks
* @param onProgress Callback with (completed, total, result) after each task
* @returns Promise that resolves with all results when complete
*/
export async function executeWithConcurrencyAndProgress<T>(
tasks: Array<() => Promise<T>>,
concurrency: number,
onProgress?: (completed: number, total: number, result: T, error?: Error) => void
): Promise<T[]> {
const semaphore = new Semaphore(concurrency);
const results: T[] = new Array(tasks.length);
let completed = 0;
const total = tasks.length;
const promises = tasks.map(async (task, index) => {
await semaphore.acquire();
try {
results[index] = await task();
completed++;
onProgress?.(completed, total, results[index]!);
return results[index];
} catch (error) {
completed++;
onProgress?.(completed, total, undefined as T, error as Error);
throw error;
} finally {
semaphore.release();
}
});
return Promise.all(promises);
}
/**
* Batch processor with concurrency control
* Useful for processing large arrays in chunks with controlled concurrency
*/
export class BatchProcessor<T, R> {
private semaphore: Semaphore;
private processor: (item: T, index: number) => Promise<R>;
private onError?: (error: Error, item: T, index: number) => void;
private onProgress?: (completed: number, total: number) => void;
constructor(
processor: (item: T, index: number) => Promise<R>,
concurrency: number,
options?: {
onError?: (error: Error, item: T, index: number) => void;
onProgress?: (completed: number, total: number) => void;
}
) {
this.processor = processor;
this.semaphore = new Semaphore(concurrency);
this.onError = options?.onError;
this.onProgress = options?.onProgress;
}
/**
* Process an array of items with concurrency control
* Only returns successful results, errors are handled by onError callback
*/
async process(items: T[]): Promise<Awaited<R>[]> {
const results: (Awaited<R> | undefined)[] = new Array(items.length);
let completed = 0;
const total = items.length;
const promises = items.map(async (item, index) => {
await this.semaphore.acquire();
try {
const result = await this.processor(item, index);
completed++;
this.onProgress?.(completed, total);
return result;
} catch (error) {
completed++;
this.onProgress?.(completed, total);
this.onError?.(error as Error, item, index);
return undefined;
} finally {
this.semaphore.release();
}
});
const allResults = await Promise.all(promises);
return allResults.filter((r): r is Awaited<R> => r !== undefined);
}
/**
* Process an array and return both results and errors
*/
async processWithErrors(items: T[]): Promise<{
results: R[];
errors: Array<{ error: Error; item: T; index: number }>;
}> {
const results: R[] = [];
const errors: Array<{ error: Error; item: T; index: number }> = [];
let completed = 0;
const total = items.length;
const promises = items.map(async (item, index) => {
await this.semaphore.acquire();
try {
const result = await this.processor(item, index);
results.push(result);
completed++;
this.onProgress?.(completed, total);
} catch (error) {
completed++;
this.onProgress?.(completed, total);
errors.push({ error: error as Error, item, index });
} finally {
this.semaphore.release();
}
});
await Promise.all(promises);
return { results, errors };
}
}