diff --git a/src/app.js b/src/app.js
index f80347c8..a1f8020b 100644
--- a/src/app.js
+++ b/src/app.js
@@ -507,7 +507,8 @@ class Application {
 
         const [expiredKeys, errorAccounts] = await Promise.all([
           apiKeyService.cleanupExpiredKeys(),
-          claudeAccountService.cleanupErrorAccounts()
+          claudeAccountService.cleanupErrorAccounts(),
+          claudeAccountService.cleanupTempErrorAccounts() // New: reset temp_error accounts
         ])
 
         await redis.cleanup()
diff --git a/src/services/claudeAccountService.js b/src/services/claudeAccountService.js
index ffd390bd..97748393 100644
--- a/src/services/claudeAccountService.js
+++ b/src/services/claudeAccountService.js
@@ -1734,6 +1734,196 @@ class ClaudeAccountService {
       throw error
     }
   }
+
+  /**
+   * 🧹 Restore accounts whose `temp_error` status has outlived the recovery window.
+   *
+   * An account is switched back to `active` / schedulable once its `tempErrorAt` is more
+   * than 60 minutes old; its error fields are dropped and its 5xx counter is cleared.
+   *
+   * @returns {Promise<number>} Number of accounts reset; 0 if the cleanup itself fails
+   *   (the failure is logged, not thrown).
+   */
+  async cleanupTempErrorAccounts() {
+    try {
+      const accounts = await redis.getAllClaudeAccounts()
+      let cleanedCount = 0
+      const TEMP_ERROR_RECOVERY_MINUTES = 60 // How long an account stays in temp_error (minutes)
+
+      for (const account of accounts) {
+        if (account.status === 'temp_error' && account.tempErrorAt) {
+          const tempErrorAt = new Date(account.tempErrorAt)
+          const now = new Date()
+          const minutesSinceTempError = (now - tempErrorAt) / (1000 * 60)
+
+          // Past the recovery window: give the account another chance
+          if (minutesSinceTempError > TEMP_ERROR_RECOVERY_MINUTES) {
+            account.status = 'active' // Back to the active state
+            account.schedulable = 'true' // Schedulable again
+            delete account.errorMessage
+            delete account.tempErrorAt
+            await redis.setClaudeAccount(account.id, account)
+            // Also reset the 5xx error counter
+            await this.clearInternalErrors(account.id)
+            cleanedCount++
+            logger.success(`🧹 Reset temp_error status for account ${account.name} (${account.id})`)
+          }
+        }
+      }
+
+      if (cleanedCount > 0) {
+        logger.success(`🧹 Reset ${cleanedCount} temp_error accounts`)
+      }
+
+      return cleanedCount
+    } catch (error) {
+      logger.error('❌ Failed to cleanup temp_error accounts:', error)
+      return 0
+    }
+  }
+
+  /**
+   * Count one 5xx upstream response against an account.
+   *
+   * Increments `claude_account:{accountId}:5xx_errors` and resets its TTL to 5 minutes, so
+   * the counter expires once no further 5xx has been recorded for 5 minutes. Redis
+   * failures are logged and swallowed.
+   *
+   * @param {string} accountId - Account the error is attributed to.
+   * @param {number} statusCode - HTTP status received; only used in log messages.
+   * @returns {Promise<void>}
+   */
+  async recordServerError(accountId, statusCode) {
+    try {
+      const key = `claude_account:${accountId}:5xx_errors`
+
+      // Bump the counter and (re)start its 5-minute expiry
+      await redis.client.incr(key)
+      await redis.client.expire(key, 300) // 5 minutes
+
+      logger.info(`📝 Recorded ${statusCode} error for account ${accountId}`)
+    } catch (error) {
+      logger.error(`❌ Failed to record ${statusCode} error for account ${accountId}:`, error)
+    }
+  }
+
+  /**
+   * Record a 500 error. Kept for backward compatibility; delegates to recordServerError.
+   *
+   * @param {string} accountId - Account the error is attributed to.
+   * @returns {Promise<void>}
+   */
+  async recordInternalError(accountId) {
+    return this.recordServerError(accountId, 500)
+  }
+
+  /**
+   * Read the current 5xx error counter of an account.
+   *
+   * @param {string} accountId - Account to look up.
+   * @returns {Promise<number>} Counter value; 0 when the key is missing, not numeric, or
+   *   the Redis read fails.
+   */
+  async getServerErrorCount(accountId) {
+    try {
+      const key = `claude_account:${accountId}:5xx_errors`
+
+      const count = await redis.client.get(key)
+      return parseInt(count, 10) || 0
+    } catch (error) {
+      logger.error(`❌ Failed to get 5xx error count for account ${accountId}:`, error)
+      return 0
+    }
+  }
+
+  /**
+   * Backward-compatible alias of getServerErrorCount.
+   *
+   * @param {string} accountId - Account to look up.
+   * @returns {Promise<number>} Current 5xx error count.
+   */
+  async getInternalErrorCount(accountId) {
+    return this.getServerErrorCount(accountId)
+  }
+
+  /**
+   * Delete the 5xx error counter of an account. Redis failures are logged and swallowed.
+   *
+   * @param {string} accountId - Account whose counter is removed.
+   * @returns {Promise<void>}
+   */
+  async clearInternalErrors(accountId) {
+    try {
+      const key = `claude_account:${accountId}:5xx_errors`
+
+      await redis.client.del(key)
+      logger.info(`✅ Cleared 5xx error count for account ${accountId}`)
+    } catch (error) {
+      logger.error(`❌ Failed to clear 5xx errors for account ${accountId}:`, error)
+    }
+  }
+
+  /**
+   * Put an account into the `temp_error` state and stop scheduling it.
+   *
+   * Saves the account with status `temp_error`, `schedulable: 'false'`, an error message
+   * and a `tempErrorAt` timestamp, optionally deletes the sticky-session mapping, then
+   * sends a webhook notification (webhook failures are only logged).
+   *
+   * @param {string} accountId - Account to disable.
+   * @param {string|null} [sessionHash=null] - Sticky-session hash to unmap, if any.
+   * @returns {Promise<{success: boolean}>} Always `{ success: true }` on completion.
+   * @throws {Error} When the account is not found or a Redis operation fails.
+   */
+  async markAccountTempError(accountId, sessionHash = null) {
+    try {
+      const accountData = await redis.getClaudeAccount(accountId)
+      if (!accountData || Object.keys(accountData).length === 0) {
+        throw new Error('Account not found')
+      }
+
+      // Build the updated account record
+      const updatedAccountData = { ...accountData }
+      updatedAccountData.status = 'temp_error' // The new temporary-error status
+      updatedAccountData.schedulable = 'false' // Exclude from scheduling
+      updatedAccountData.errorMessage = 'Account temporarily disabled due to consecutive 5xx errors'
+      updatedAccountData.tempErrorAt = new Date().toISOString()
+
+      // Persist the updated account record
+      await redis.setClaudeAccount(accountId, updatedAccountData)
+
+      // Remove the sticky-session mapping when a session hash was supplied
+      // NOTE(review): confirm `sticky_session:` is the key prefix the scheduler writes
+      if (sessionHash) {
+        await redis.client.del(`sticky_session:${sessionHash}`)
+        logger.info(`🗑️ Deleted sticky session mapping for hash: ${sessionHash}`)
+      }
+
+      logger.warn(
+        `⚠️ Account ${accountData.name} (${accountId}) marked as temp_error and disabled for scheduling`
+      )
+
+      // Send the webhook notification (best effort)
+      try {
+        const webhookNotifier = require('../utils/webhookNotifier')
+        await webhookNotifier.sendAccountAnomalyNotification({
+          accountId,
+          accountName: accountData.name,
+          platform: 'claude-oauth',
+          status: 'temp_error',
+          errorCode: 'CLAUDE_OAUTH_TEMP_ERROR',
+          reason: 'Account temporarily disabled due to consecutive 5xx errors'
+        })
+      } catch (webhookError) {
+        logger.error('Failed to send webhook notification:', webhookError)
+      }
+
+      return { success: true }
+    } catch (error) {
+      logger.error(`❌ Failed to mark account ${accountId} as temp_error:`, error)
+      throw error
+    }
+  }
 }
 
 module.exports = new ClaudeAccountService()
diff --git a/src/services/claudeRelayService.js b/src/services/claudeRelayService.js
index 49a9192a..0ca60f1b 100644
--- a/src/services/claudeRelayService.js
+++ b/src/services/claudeRelayService.js
@@ -197,6 +197,23 @@ class ClaudeRelayService {
             )
           }
         }
+        // Handle 5xx (server error) status codes
+        else if (response.statusCode >= 500 && response.statusCode < 600) {
+          logger.warn(`🔥 Server error (${response.statusCode}) detected for account ${accountId}`)
+          // Count this 5xx error
+          await claudeAccountService.recordServerError(accountId, response.statusCode)
+          // Mark the account temp_error once the counter has reached 3
+          const errorCount = await claudeAccountService.getServerErrorCount(accountId)
+          logger.info(
+            `🔥 Account ${accountId} has ${errorCount} consecutive 5xx errors in the last 5 minutes`
+          )
+          if (errorCount >= 3) {
+            logger.error(
+              `❌ Account ${accountId} exceeded 5xx error threshold (${errorCount} errors), marking as temp_error`
+            )
+            await claudeAccountService.markAccountTempError(accountId, sessionHash)
+          }
+        }
         // 检查是否为429状态码
         else if (response.statusCode === 429) {
           isRateLimited = true
@@ -247,8 +264,9 @@ class ClaudeRelayService {
           )
         }
       } else if (response.statusCode === 200 || response.statusCode === 201) {
-        // 请求成功,清除401错误计数
+        // Success: reset the 401 and 5xx error counters
         await this.clearUnauthorizedErrors(accountId)
+        await claudeAccountService.clearInternalErrors(accountId)
         // 如果请求成功,检查并移除限流状态
         const isRateLimited = await unifiedClaudeScheduler.isAccountRateLimited(
           accountId,
@@ -883,6 +901,34 @@ class ClaudeRelayService {
 
         // 错误响应处理
         if (res.statusCode !== 200) {
+          // Async helper for the 5xx bookkeeping (invoked below without awaiting)
+          const handleErrorResponse = async () => {
+            // Handle 5xx server errors
+            if (res.statusCode >= 500 && res.statusCode < 600) {
+              logger.warn(
+                `🔥 [Stream] Server error (${res.statusCode}) detected for account ${accountId}`
+              )
+              // Count this 5xx error
+              await claudeAccountService.recordServerError(accountId, res.statusCode)
+              // Mark the account temp_error once the counter has reached 3
+              const errorCount = await claudeAccountService.getServerErrorCount(accountId)
+              logger.info(
+                `🔥 [Stream] Account ${accountId} has ${errorCount} consecutive 5xx errors in the last 5 minutes`
+              )
+              if (errorCount >= 3) {
+                logger.error(
+                  `❌ [Stream] Account ${accountId} exceeded 5xx error threshold (${errorCount} errors), marking as temp_error`
+                )
+                await claudeAccountService.markAccountTempError(accountId, sessionHash)
+              }
+            }
+          }
+
+          // Fire and forget; a failure in the handler is only logged
+          handleErrorResponse().catch((err) => {
+            logger.error('❌ Error in stream error handler:', err)
+          })
+
           logger.error(`❌ Claude API returned error status: ${res.statusCode}`)
           let errorData = ''
 
@@ -1162,6 +1208,9 @@ class ClaudeRelayService {
                 rateLimitResetTimestamp
               )
             } else if (res.statusCode === 200) {
+              // Success: reset the 401 and 5xx error counters
+              await this.clearUnauthorizedErrors(accountId)
+              await claudeAccountService.clearInternalErrors(accountId)
               // 如果请求成功,检查并移除限流状态
               const isRateLimited = await unifiedClaudeScheduler.isAccountRateLimited(
                 accountId,
diff --git a/src/services/unifiedClaudeScheduler.js b/src/services/unifiedClaudeScheduler.js
index 287bb465..4e6535bd 100644
--- a/src/services/unifiedClaudeScheduler.js
+++ b/src/services/unifiedClaudeScheduler.js
@@ -176,7 +176,8 @@ class UnifiedClaudeScheduler {
         boundAccount &&
         boundAccount.isActive === 'true' &&
         boundAccount.status !== 'error' &&
-        boundAccount.status !== 'blocked'
+        boundAccount.status !== 'blocked' &&
+        boundAccount.status !== 'temp_error'
       ) {
         const isRateLimited = await claudeAccountService.isAccountRateLimited(boundAccount.id)
         if (!isRateLimited) {
@@ -262,6 +263,7 @@ class UnifiedClaudeScheduler {
         account.isActive === 'true' &&
         account.status !== 'error' &&
         account.status !== 'blocked' &&
+        account.status !== 'temp_error' &&
         (account.accountType === 'shared' || !account.accountType) && // 兼容旧数据
         this._isSchedulable(account.schedulable)
       ) {
@@ -441,7 +443,12 @@ class UnifiedClaudeScheduler {
     try {
       if (accountType === 'claude-official') {
         const account = await redis.getClaudeAccount(accountId)
-        if (!account || account.isActive !== 'true' || account.status === 'error') {
+        if (
+          !account ||
+          account.isActive !== 'true' ||
+          account.status === 'error' ||
+          account.status === 'temp_error'
+        ) {
           return false
         }
         // 检查是否可调度