feat: 扩展熔断机制支持所有5xx错误码

- 扩展错误检测从单一500错误到所有5xx错误码(500-599)
- 新增temp_error状态,连续3次5xx错误触发临时熔断
- 支持流式和非流式请求的统一5xx错误处理
- 添加定时清理机制,60分钟后自动恢复temp_error状态
- 完善错误计数和清理逻辑,提高系统可靠性

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
sczheng189
2025-08-25 15:17:31 +08:00
parent 81ad8a787f
commit f4f88091c1
4 changed files with 200 additions and 4 deletions

View File

@@ -507,7 +507,8 @@ class Application {
const [expiredKeys, errorAccounts] = await Promise.all([
apiKeyService.cleanupExpiredKeys(),
claudeAccountService.cleanupErrorAccounts()
claudeAccountService.cleanupErrorAccounts(),
claudeAccountService.cleanupTempErrorAccounts() // 新增:清理临时错误账户
])
await redis.cleanup()

View File

@@ -1734,6 +1734,145 @@ class ClaudeAccountService {
throw error
}
}
// 🧹 清理临时错误账户
async cleanupTempErrorAccounts() {
try {
const accounts = await redis.getAllClaudeAccounts()
let cleanedCount = 0
const TEMP_ERROR_RECOVERY_MINUTES = 60 // 临时错误状态恢复时间(分钟)
for (const account of accounts) {
if (account.status === 'temp_error' && account.tempErrorAt) {
const tempErrorAt = new Date(account.tempErrorAt)
const now = new Date()
const minutesSinceTempError = (now - tempErrorAt) / (1000 * 60)
// 如果临时错误状态超过指定时间,尝试重新激活
if (minutesSinceTempError > TEMP_ERROR_RECOVERY_MINUTES) {
account.status = 'active' // 恢复为 active 状态
account.schedulable = 'true' // 恢复为可调度
delete account.errorMessage
delete account.tempErrorAt
await redis.setClaudeAccount(account.id, account)
// 同时清除500错误计数
await this.clearInternalErrors(account.id)
cleanedCount++
logger.success(`🧹 Reset temp_error status for account ${account.name} (${account.id})`)
}
}
}
if (cleanedCount > 0) {
logger.success(`🧹 Reset ${cleanedCount} temp_error accounts`)
}
return cleanedCount
} catch (error) {
logger.error('❌ Failed to cleanup temp_error accounts:', error)
return 0
}
}
// 记录5xx服务器错误
async recordServerError(accountId, statusCode) {
try {
const key = `claude_account:${accountId}:5xx_errors`
// 增加错误计数设置5分钟过期时间
await redis.client.incr(key)
await redis.client.expire(key, 300) // 5分钟
logger.info(`📝 Recorded ${statusCode} error for account ${accountId}`)
} catch (error) {
logger.error(`❌ Failed to record ${statusCode} error for account ${accountId}:`, error)
}
}
// 记录500内部错误(保留以便向后兼容)
async recordInternalError(accountId) {
return this.recordServerError(accountId, 500)
}
// 获取5xx错误计数
async getServerErrorCount(accountId) {
try {
const key = `claude_account:${accountId}:5xx_errors`
const count = await redis.client.get(key)
return parseInt(count) || 0
} catch (error) {
logger.error(`❌ Failed to get 5xx error count for account ${accountId}:`, error)
return 0
}
}
// 获取500错误计数(保留以便向后兼容)
async getInternalErrorCount(accountId) {
return this.getServerErrorCount(accountId)
}
// 清除500错误计数
async clearInternalErrors(accountId) {
try {
const key = `claude_account:${accountId}:5xx_errors`
await redis.client.del(key)
logger.info(`✅ Cleared 5xx error count for account ${accountId}`)
} catch (error) {
logger.error(`❌ Failed to clear 5xx errors for account ${accountId}:`, error)
}
}
// 标记账号为临时错误状态
async markAccountTempError(accountId, sessionHash = null) {
try {
const accountData = await redis.getClaudeAccount(accountId)
if (!accountData || Object.keys(accountData).length === 0) {
throw new Error('Account not found')
}
// 更新账户状态
const updatedAccountData = { ...accountData }
updatedAccountData.status = 'temp_error' // 新增的临时错误状态
updatedAccountData.schedulable = 'false' // 设置为不可调度
updatedAccountData.errorMessage = 'Account temporarily disabled due to consecutive 500 errors'
updatedAccountData.tempErrorAt = new Date().toISOString()
// 保存更新后的账户数据
await redis.setClaudeAccount(accountId, updatedAccountData)
// 如果有sessionHash删除粘性会话映射
if (sessionHash) {
await redis.client.del(`sticky_session:${sessionHash}`)
logger.info(`🗑️ Deleted sticky session mapping for hash: ${sessionHash}`)
}
logger.warn(
`⚠️ Account ${accountData.name} (${accountId}) marked as temp_error and disabled for scheduling`
)
// 发送Webhook通知
try {
const webhookNotifier = require('../utils/webhookNotifier')
await webhookNotifier.sendAccountAnomalyNotification({
accountId,
accountName: accountData.name,
platform: 'claude-oauth',
status: 'temp_error',
errorCode: 'CLAUDE_OAUTH_TEMP_ERROR',
reason: 'Account temporarily disabled due to consecutive 500 errors'
})
} catch (webhookError) {
logger.error('Failed to send webhook notification:', webhookError)
}
return { success: true }
} catch (error) {
logger.error(`❌ Failed to mark account ${accountId} as temp_error:`, error)
throw error
}
}
}
module.exports = new ClaudeAccountService()

View File

@@ -197,6 +197,23 @@ class ClaudeRelayService {
)
}
}
// 检查是否为5xx状态码
else if (response.statusCode >= 500 && response.statusCode < 600) {
logger.warn(`🔥 Server error (${response.statusCode}) detected for account ${accountId}`)
// 记录5xx错误
await claudeAccountService.recordServerError(accountId, response.statusCode)
// 检查是否需要标记为临时错误状态连续3次500
const errorCount = await claudeAccountService.getServerErrorCount(accountId)
logger.info(
`🔥 Account ${accountId} has ${errorCount} consecutive 5xx errors in the last 5 minutes`
)
if (errorCount >= 3) {
logger.error(
`❌ Account ${accountId} exceeded 5xx error threshold (${errorCount} errors), marking as temp_error`
)
await claudeAccountService.markAccountTempError(accountId, sessionHash)
}
}
// 检查是否为429状态码
else if (response.statusCode === 429) {
isRateLimited = true
@@ -247,8 +264,9 @@ class ClaudeRelayService {
)
}
} else if (response.statusCode === 200 || response.statusCode === 201) {
// 请求成功清除401错误计数
// 请求成功清除401和500错误计数
await this.clearUnauthorizedErrors(accountId)
await claudeAccountService.clearInternalErrors(accountId)
// 如果请求成功,检查并移除限流状态
const isRateLimited = await unifiedClaudeScheduler.isAccountRateLimited(
accountId,
@@ -883,6 +901,34 @@ class ClaudeRelayService {
// 错误响应处理
if (res.statusCode !== 200) {
// 将错误处理逻辑封装在一个异步函数中
const handleErrorResponse = async () => {
// 增加对5xx错误的处理
if (res.statusCode >= 500 && res.statusCode < 600) {
logger.warn(
`🔥 [Stream] Server error (${res.statusCode}) detected for account ${accountId}`
)
// 记录5xx错误
await claudeAccountService.recordServerError(accountId, res.statusCode)
// 检查是否需要标记为临时错误状态连续3次500
const errorCount = await claudeAccountService.getServerErrorCount(accountId)
logger.info(
`🔥 [Stream] Account ${accountId} has ${errorCount} consecutive 5xx errors in the last 5 minutes`
)
if (errorCount >= 3) {
logger.error(
`❌ [Stream] Account ${accountId} exceeded 5xx error threshold (${errorCount} errors), marking as temp_error`
)
await claudeAccountService.markAccountTempError(accountId, sessionHash)
}
}
}
// 调用异步错误处理函数
handleErrorResponse().catch((err) => {
logger.error('❌ Error in stream error handler:', err)
})
logger.error(`❌ Claude API returned error status: ${res.statusCode}`)
let errorData = ''
@@ -1162,6 +1208,9 @@ class ClaudeRelayService {
rateLimitResetTimestamp
)
} else if (res.statusCode === 200) {
// 请求成功清除401和500错误计数
await this.clearUnauthorizedErrors(accountId)
await claudeAccountService.clearInternalErrors(accountId)
// 如果请求成功,检查并移除限流状态
const isRateLimited = await unifiedClaudeScheduler.isAccountRateLimited(
accountId,

View File

@@ -176,7 +176,8 @@ class UnifiedClaudeScheduler {
boundAccount &&
boundAccount.isActive === 'true' &&
boundAccount.status !== 'error' &&
boundAccount.status !== 'blocked'
boundAccount.status !== 'blocked' &&
boundAccount.status !== 'temp_error'
) {
const isRateLimited = await claudeAccountService.isAccountRateLimited(boundAccount.id)
if (!isRateLimited) {
@@ -262,6 +263,7 @@ class UnifiedClaudeScheduler {
account.isActive === 'true' &&
account.status !== 'error' &&
account.status !== 'blocked' &&
account.status !== 'temp_error' &&
(account.accountType === 'shared' || !account.accountType) && // 兼容旧数据
this._isSchedulable(account.schedulable)
) {
@@ -441,7 +443,12 @@ class UnifiedClaudeScheduler {
try {
if (accountType === 'claude-official') {
const account = await redis.getClaudeAccount(accountId)
if (!account || account.isActive !== 'true' || account.status === 'error') {
if (
!account ||
account.isActive !== 'true' ||
account.status === 'error' ||
account.status === 'temp_error'
) {
return false
}
// 检查是否可调度