mirror of
https://github.com/Wei-Shaw/claude-relay-service.git
synced 2026-01-22 16:43:35 +00:00
feat: 扩展熔断机制支持所有5xx错误码
- 扩展错误检测从单一500错误到所有5xx错误码(500-599)
- 新增temp_error状态,连续3次5xx错误触发临时熔断
- 支持流式和非流式请求的统一5xx错误处理
- 添加定时清理机制,60分钟后自动恢复temp_error状态
- 完善错误计数和清理逻辑,提高系统可靠性

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -507,7 +507,8 @@ class Application {
|
||||
|
||||
const [expiredKeys, errorAccounts] = await Promise.all([
|
||||
apiKeyService.cleanupExpiredKeys(),
|
||||
claudeAccountService.cleanupErrorAccounts()
|
||||
claudeAccountService.cleanupErrorAccounts(),
|
||||
claudeAccountService.cleanupTempErrorAccounts() // 新增:清理临时错误账户
|
||||
])
|
||||
|
||||
await redis.cleanup()
|
||||
|
||||
@@ -1734,6 +1734,145 @@ class ClaudeAccountService {
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
// 🧹 清理临时错误账户
|
||||
async cleanupTempErrorAccounts() {
|
||||
try {
|
||||
const accounts = await redis.getAllClaudeAccounts()
|
||||
let cleanedCount = 0
|
||||
const TEMP_ERROR_RECOVERY_MINUTES = 60 // 临时错误状态恢复时间(分钟)
|
||||
|
||||
for (const account of accounts) {
|
||||
if (account.status === 'temp_error' && account.tempErrorAt) {
|
||||
const tempErrorAt = new Date(account.tempErrorAt)
|
||||
const now = new Date()
|
||||
const minutesSinceTempError = (now - tempErrorAt) / (1000 * 60)
|
||||
|
||||
// 如果临时错误状态超过指定时间,尝试重新激活
|
||||
if (minutesSinceTempError > TEMP_ERROR_RECOVERY_MINUTES) {
|
||||
account.status = 'active' // 恢复为 active 状态
|
||||
account.schedulable = 'true' // 恢复为可调度
|
||||
delete account.errorMessage
|
||||
delete account.tempErrorAt
|
||||
await redis.setClaudeAccount(account.id, account)
|
||||
// 同时清除500错误计数
|
||||
await this.clearInternalErrors(account.id)
|
||||
cleanedCount++
|
||||
logger.success(`🧹 Reset temp_error status for account ${account.name} (${account.id})`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cleanedCount > 0) {
|
||||
logger.success(`🧹 Reset ${cleanedCount} temp_error accounts`)
|
||||
}
|
||||
|
||||
return cleanedCount
|
||||
} catch (error) {
|
||||
logger.error('❌ Failed to cleanup temp_error accounts:', error)
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// 记录5xx服务器错误
|
||||
async recordServerError(accountId, statusCode) {
|
||||
try {
|
||||
const key = `claude_account:${accountId}:5xx_errors`
|
||||
|
||||
// 增加错误计数,设置5分钟过期时间
|
||||
await redis.client.incr(key)
|
||||
await redis.client.expire(key, 300) // 5分钟
|
||||
|
||||
logger.info(`📝 Recorded ${statusCode} error for account ${accountId}`)
|
||||
} catch (error) {
|
||||
logger.error(`❌ Failed to record ${statusCode} error for account ${accountId}:`, error)
|
||||
}
|
||||
}
|
||||
|
||||
// 记录500内部错误(保留以便向后兼容)
|
||||
async recordInternalError(accountId) {
|
||||
return this.recordServerError(accountId, 500)
|
||||
}
|
||||
|
||||
// 获取5xx错误计数
|
||||
async getServerErrorCount(accountId) {
|
||||
try {
|
||||
const key = `claude_account:${accountId}:5xx_errors`
|
||||
|
||||
const count = await redis.client.get(key)
|
||||
return parseInt(count) || 0
|
||||
} catch (error) {
|
||||
logger.error(`❌ Failed to get 5xx error count for account ${accountId}:`, error)
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// 获取500错误计数(保留以便向后兼容)
|
||||
async getInternalErrorCount(accountId) {
|
||||
return this.getServerErrorCount(accountId)
|
||||
}
|
||||
|
||||
// 清除500错误计数
|
||||
async clearInternalErrors(accountId) {
|
||||
try {
|
||||
const key = `claude_account:${accountId}:5xx_errors`
|
||||
|
||||
await redis.client.del(key)
|
||||
logger.info(`✅ Cleared 5xx error count for account ${accountId}`)
|
||||
} catch (error) {
|
||||
logger.error(`❌ Failed to clear 5xx errors for account ${accountId}:`, error)
|
||||
}
|
||||
}
|
||||
|
||||
// 标记账号为临时错误状态
|
||||
async markAccountTempError(accountId, sessionHash = null) {
|
||||
try {
|
||||
const accountData = await redis.getClaudeAccount(accountId)
|
||||
if (!accountData || Object.keys(accountData).length === 0) {
|
||||
throw new Error('Account not found')
|
||||
}
|
||||
|
||||
// 更新账户状态
|
||||
const updatedAccountData = { ...accountData }
|
||||
updatedAccountData.status = 'temp_error' // 新增的临时错误状态
|
||||
updatedAccountData.schedulable = 'false' // 设置为不可调度
|
||||
updatedAccountData.errorMessage = 'Account temporarily disabled due to consecutive 500 errors'
|
||||
updatedAccountData.tempErrorAt = new Date().toISOString()
|
||||
|
||||
// 保存更新后的账户数据
|
||||
await redis.setClaudeAccount(accountId, updatedAccountData)
|
||||
|
||||
// 如果有sessionHash,删除粘性会话映射
|
||||
if (sessionHash) {
|
||||
await redis.client.del(`sticky_session:${sessionHash}`)
|
||||
logger.info(`🗑️ Deleted sticky session mapping for hash: ${sessionHash}`)
|
||||
}
|
||||
|
||||
logger.warn(
|
||||
`⚠️ Account ${accountData.name} (${accountId}) marked as temp_error and disabled for scheduling`
|
||||
)
|
||||
|
||||
// 发送Webhook通知
|
||||
try {
|
||||
const webhookNotifier = require('../utils/webhookNotifier')
|
||||
await webhookNotifier.sendAccountAnomalyNotification({
|
||||
accountId,
|
||||
accountName: accountData.name,
|
||||
platform: 'claude-oauth',
|
||||
status: 'temp_error',
|
||||
errorCode: 'CLAUDE_OAUTH_TEMP_ERROR',
|
||||
reason: 'Account temporarily disabled due to consecutive 500 errors'
|
||||
})
|
||||
} catch (webhookError) {
|
||||
logger.error('Failed to send webhook notification:', webhookError)
|
||||
}
|
||||
|
||||
return { success: true }
|
||||
} catch (error) {
|
||||
logger.error(`❌ Failed to mark account ${accountId} as temp_error:`, error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = new ClaudeAccountService()
|
||||
|
||||
@@ -197,6 +197,23 @@ class ClaudeRelayService {
|
||||
)
|
||||
}
|
||||
}
|
||||
// 检查是否为5xx状态码
|
||||
else if (response.statusCode >= 500 && response.statusCode < 600) {
|
||||
logger.warn(`🔥 Server error (${response.statusCode}) detected for account ${accountId}`)
|
||||
// 记录5xx错误
|
||||
await claudeAccountService.recordServerError(accountId, response.statusCode)
|
||||
// 检查是否需要标记为临时错误状态(连续3次500)
|
||||
const errorCount = await claudeAccountService.getServerErrorCount(accountId)
|
||||
logger.info(
|
||||
`🔥 Account ${accountId} has ${errorCount} consecutive 5xx errors in the last 5 minutes`
|
||||
)
|
||||
if (errorCount >= 3) {
|
||||
logger.error(
|
||||
`❌ Account ${accountId} exceeded 5xx error threshold (${errorCount} errors), marking as temp_error`
|
||||
)
|
||||
await claudeAccountService.markAccountTempError(accountId, sessionHash)
|
||||
}
|
||||
}
|
||||
// 检查是否为429状态码
|
||||
else if (response.statusCode === 429) {
|
||||
isRateLimited = true
|
||||
@@ -247,8 +264,9 @@ class ClaudeRelayService {
|
||||
)
|
||||
}
|
||||
} else if (response.statusCode === 200 || response.statusCode === 201) {
|
||||
// 请求成功,清除401错误计数
|
||||
// 请求成功,清除401和500错误计数
|
||||
await this.clearUnauthorizedErrors(accountId)
|
||||
await claudeAccountService.clearInternalErrors(accountId)
|
||||
// 如果请求成功,检查并移除限流状态
|
||||
const isRateLimited = await unifiedClaudeScheduler.isAccountRateLimited(
|
||||
accountId,
|
||||
@@ -883,6 +901,34 @@ class ClaudeRelayService {
|
||||
|
||||
// 错误响应处理
|
||||
if (res.statusCode !== 200) {
|
||||
// 将错误处理逻辑封装在一个异步函数中
|
||||
// Async handler for a non-200 streaming response: counts 5xx failures for the
// account and marks it temp_error once the counter reaches 3. Any other status
// is ignored here.
const handleErrorResponse = async () => {
  const upstreamStatus = res.statusCode
  const isServerError = upstreamStatus >= 500 && upstreamStatus < 600
  if (!isServerError) {
    return
  }

  logger.warn(
    `🔥 [Stream] Server error (${res.statusCode}) detected for account ${accountId}`
  )

  // Record the failure first, then read the counter back.
  await claudeAccountService.recordServerError(accountId, res.statusCode)
  const errorCount = await claudeAccountService.getServerErrorCount(accountId)
  logger.info(
    `🔥 [Stream] Account ${accountId} has ${errorCount} consecutive 5xx errors in the last 5 minutes`
  )

  // Fewer than three recorded errors: nothing more to do.
  if (errorCount < 3) {
    return
  }

  logger.error(
    `❌ [Stream] Account ${accountId} exceeded 5xx error threshold (${errorCount} errors), marking as temp_error`
  )
  await claudeAccountService.markAccountTempError(accountId, sessionHash)
}
|
||||
|
||||
// 调用异步错误处理函数
|
||||
handleErrorResponse().catch((err) => {
|
||||
logger.error('❌ Error in stream error handler:', err)
|
||||
})
|
||||
|
||||
logger.error(`❌ Claude API returned error status: ${res.statusCode}`)
|
||||
let errorData = ''
|
||||
|
||||
@@ -1162,6 +1208,9 @@ class ClaudeRelayService {
|
||||
rateLimitResetTimestamp
|
||||
)
|
||||
} else if (res.statusCode === 200) {
|
||||
// 请求成功,清除401和500错误计数
|
||||
await this.clearUnauthorizedErrors(accountId)
|
||||
await claudeAccountService.clearInternalErrors(accountId)
|
||||
// 如果请求成功,检查并移除限流状态
|
||||
const isRateLimited = await unifiedClaudeScheduler.isAccountRateLimited(
|
||||
accountId,
|
||||
|
||||
@@ -176,7 +176,8 @@ class UnifiedClaudeScheduler {
|
||||
boundAccount &&
|
||||
boundAccount.isActive === 'true' &&
|
||||
boundAccount.status !== 'error' &&
|
||||
boundAccount.status !== 'blocked'
|
||||
boundAccount.status !== 'blocked' &&
|
||||
boundAccount.status !== 'temp_error'
|
||||
) {
|
||||
const isRateLimited = await claudeAccountService.isAccountRateLimited(boundAccount.id)
|
||||
if (!isRateLimited) {
|
||||
@@ -262,6 +263,7 @@ class UnifiedClaudeScheduler {
|
||||
account.isActive === 'true' &&
|
||||
account.status !== 'error' &&
|
||||
account.status !== 'blocked' &&
|
||||
account.status !== 'temp_error' &&
|
||||
(account.accountType === 'shared' || !account.accountType) && // 兼容旧数据
|
||||
this._isSchedulable(account.schedulable)
|
||||
) {
|
||||
@@ -441,7 +443,12 @@ class UnifiedClaudeScheduler {
|
||||
try {
|
||||
if (accountType === 'claude-official') {
|
||||
const account = await redis.getClaudeAccount(accountId)
|
||||
if (!account || account.isActive !== 'true' || account.status === 'error') {
|
||||
if (
|
||||
!account ||
|
||||
account.isActive !== 'true' ||
|
||||
account.status === 'error' ||
|
||||
account.status === 'temp_error'
|
||||
) {
|
||||
return false
|
||||
}
|
||||
// 检查是否可调度
|
||||
|
||||
Reference in New Issue
Block a user