From e611f97dae464327c63e79fad9cbc72302501d26 Mon Sep 17 00:00:00 2001 From: X-Zero-L Date: Mon, 2 Mar 2026 21:53:18 +0800 Subject: [PATCH] fix: expose cooldown details and make 503 backoff configurable --- .env.example | 3 +- config/config.example.js | 2 + src/services/relay/claudeRelayService.js | 5 +- .../scheduler/unifiedClaudeScheduler.js | 5 +- src/utils/upstreamErrorHelper.js | 69 ++++++- web/admin-spa/src/views/AccountsView.vue | 170 +++++++++++++++--- 6 files changed, 217 insertions(+), 37 deletions(-) diff --git a/.env.example b/.env.example index c67b80a0..ef8a870b 100644 --- a/.env.example +++ b/.env.example @@ -156,7 +156,8 @@ ENABLE_CORS=true TRUST_PROXY=true # ⏱️ 上游错误自动暂停配置(秒) -# UPSTREAM_ERROR_5XX_TTL_SECONDS=300 # 5xx错误暂停时间(默认5分钟) +# UPSTREAM_ERROR_503_TTL_SECONDS=60 # 503错误暂停时间(默认60秒) +# UPSTREAM_ERROR_5XX_TTL_SECONDS=300 # 500/502等5xx错误暂停时间(默认5分钟) # UPSTREAM_ERROR_OVERLOAD_TTL_SECONDS=600 # 529过载暂停时间(默认10分钟) # UPSTREAM_ERROR_AUTH_TTL_SECONDS=1800 # 401/403认证错误暂停时间(默认30分钟) # UPSTREAM_ERROR_TIMEOUT_TTL_SECONDS=300 # 504超时暂停时间(默认5分钟) diff --git a/config/config.example.js b/config/config.example.js index dda050bc..57cdce1f 100644 --- a/config/config.example.js +++ b/config/config.example.js @@ -232,6 +232,8 @@ const config = { // ⏱️ 上游错误自动暂停配置 upstreamError: { + serviceUnavailableTtlSeconds: + parseInt(process.env.UPSTREAM_ERROR_503_TTL_SECONDS) || 60, // 503错误暂停秒数 serverErrorTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_5XX_TTL_SECONDS) || 300, // 5xx错误暂停秒数 overloadTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_OVERLOAD_TTL_SECONDS) || 600, // 529过载暂停秒数 authErrorTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_AUTH_TTL_SECONDS) || 1800, // 401/403认证错误暂停秒数 diff --git a/src/services/relay/claudeRelayService.js b/src/services/relay/claudeRelayService.js index 7f336344..1dc6bc64 100644 --- a/src/services/relay/claudeRelayService.js +++ b/src/services/relay/claudeRelayService.js @@ -2886,13 +2886,14 @@ class ClaudeRelayService { `⏱️ ${prefix}${isTimeout ? 'Timeout' : 'Server'} error for account ${accountId}, error count: ${errorCount}/${threshold}` ) - // 标记账户为临时不可用(5分钟) + // 标记账户为临时不可用(TTL 由 upstreamError 配置决定) try { await unifiedClaudeScheduler.markAccountTemporarilyUnavailable( accountId, accountType, sessionHash, - 300 + null, + statusCode ) } catch (markError) { logger.error(`❌ Failed to mark account temporarily unavailable: ${accountId}`, markError) diff --git a/src/services/scheduler/unifiedClaudeScheduler.js b/src/services/scheduler/unifiedClaudeScheduler.js index 93d0820b..a92998c2 100644 --- a/src/services/scheduler/unifiedClaudeScheduler.js +++ b/src/services/scheduler/unifiedClaudeScheduler.js @@ -1299,10 +1299,11 @@ class UnifiedClaudeScheduler { accountId, accountType, sessionHash = null, - ttlSeconds = 300 + ttlSeconds = null, + statusCode = 500 ) { try { - await upstreamErrorHelper.markTempUnavailable(accountId, accountType, 500, ttlSeconds) + await upstreamErrorHelper.markTempUnavailable(accountId, accountType, statusCode, ttlSeconds) if (sessionHash) { await this._deleteSessionMapping(sessionHash) } diff --git a/src/utils/upstreamErrorHelper.js b/src/utils/upstreamErrorHelper.js index 57a2f271..838ab1fa 100644 --- a/src/utils/upstreamErrorHelper.js +++ b/src/utils/upstreamErrorHelper.js @@ -8,6 +8,7 @@ const ERROR_HISTORY_TTL = 3 * 24 * 60 * 60 // 3天 // 默认 TTL(秒) const DEFAULT_TTL = { server_error: 300, // 5xx: 5分钟 + service_unavailable: 60, // 503: 1分钟(默认更短,避免短暂抖动导致长时间不可路由) overload: 600, // 529: 10分钟 auth_error: 1800, // 401/403: 30分钟 timeout: 300, // 504/网络超时: 5分钟 @@ -29,7 +30,16 @@ const getConfig = () => { const getTtlConfig = () => { const config = getConfig() + const parseEnvPositiveInt = (name) => { + const value = parseInt(process.env[name], 10) + return Number.isFinite(value) && value > 0 ? value : null + } + return { + service_unavailable: + config.upstreamError?.serviceUnavailableTtlSeconds ?? + parseEnvPositiveInt('UPSTREAM_ERROR_503_TTL_SECONDS') ?? + DEFAULT_TTL.service_unavailable, server_error: config.upstreamError?.serverErrorTtlSeconds ?? DEFAULT_TTL.server_error, overload: config.upstreamError?.overloadTtlSeconds ?? DEFAULT_TTL.overload, auth_error: config.upstreamError?.authErrorTtlSeconds ?? DEFAULT_TTL.auth_error, @@ -52,6 +62,9 @@ const classifyError = (statusCode) => { if (statusCode === 529) { return 'overload' } + if (statusCode === 503) { + return 'service_unavailable' + } if (statusCode === 504) { return 'timeout' } @@ -204,7 +217,13 @@ const markTempUnavailable = async ( } const ttlConfig = getTtlConfig() - const ttlSeconds = customTtl ?? ttlConfig[errorType] + const parsedCustomTtl = Number(customTtl) + const ttlSeconds = + Number.isFinite(parsedCustomTtl) && parsedCustomTtl > 0 + ? Math.ceil(parsedCustomTtl) + : ttlConfig[errorType] + const markedAtIso = new Date().toISOString() + const expiresAtIso = new Date(Date.now() + ttlSeconds * 1000).toISOString() const redis = getRedis() const client = redis.getClientSafe() @@ -215,18 +234,21 @@ const markTempUnavailable = async ( JSON.stringify({ statusCode, errorType, - markedAt: new Date().toISOString() + markedAt: markedAtIso, + ttlSeconds, + cooldownSeconds: ttlSeconds, + expiresAt: expiresAtIso }) ) logger.warn( - `⏱️ [UpstreamError] Account ${accountId} (${accountType}) marked temporarily unavailable for ${ttlSeconds}s (${statusCode} ${errorType})` + `⏱️ [UpstreamError] Account ${accountId} (${accountType}) marked temporarily unavailable for ${ttlSeconds}s (${statusCode} ${errorType}), recovers at ${expiresAtIso}` ) // 异步记录错误历史,不阻塞主流程 recordErrorHistory(accountId, accountType, statusCode, errorType, context).catch(() => {}) - return { success: true, ttlSeconds, errorType } + return { success: true, ttlSeconds, errorType, expiresAt: expiresAtIso } } catch (error) { logger.error( `❌ [UpstreamError] Failed to mark account ${accountId} temporarily unavailable:`, @@ -242,7 +264,22 @@ const isTempUnavailable = async (accountId, accountType) => { const redis = getRedis() const client = redis.getClientSafe() const key = `${TEMP_UNAVAILABLE_PREFIX}:${accountType}:${accountId}` - return (await client.exists(key)) === 1 + const ttl = await client.ttl(key) + + if (ttl === -2) { + return false + } + + if (ttl === -1) { + // 理论上该 key 必须带 TTL;如果无 TTL,自动清理以避免“永久不可用” + logger.warn( + `⚠️ [UpstreamError] Found temp_unavailable key without TTL for account ${accountId} (${accountType}), auto-clearing` + ) + await client.del(key) + return false + } + + return ttl > 0 } catch (error) { logger.error( `❌ [UpstreamError] Failed to check temp unavailable status for ${accountId}:`, @@ -281,6 +318,7 @@ const getAllTempUnavailable = async () => { pipeline.ttl(key) } const results = await pipeline.exec() + const cleanupPipeline = client.pipeline() const statuses = {} for (let i = 0; i < keys.length; i++) { @@ -295,21 +333,40 @@ const getAllTempUnavailable = async () => { continue } + if (ttl === -1) { + // 自愈:清理无 TTL 的异常键,避免账户被永久阻塞 + cleanupPipeline.del(key) + continue + } + try { const data = JSON.parse(value) const compositeKey = `${accountType}:${accountId}` + const cooldownSecondsRaw = Number(data.cooldownSeconds) + const ttlSecondsRaw = Number(data.ttlSeconds) + const configuredCooldownSeconds = Number.isFinite(cooldownSecondsRaw) + ? Math.max(0, Math.floor(cooldownSecondsRaw)) + : Number.isFinite(ttlSecondsRaw) + ? Math.max(0, Math.floor(ttlSecondsRaw)) + : null + statuses[compositeKey] = { accountId, accountType, statusCode: data.statusCode, errorType: data.errorType, markedAt: data.markedAt, - ttl: ttl > 0 ? ttl : 0 + ttl: ttl > 0 ? ttl : 0, + remainingSeconds: ttl > 0 ? ttl : 0, + cooldownSeconds: configuredCooldownSeconds, + expiresAt: data.expiresAt || null } } catch { // ignore parse errors } } + + await cleanupPipeline.exec().catch(() => {}) return statuses } catch (error) { logger.error('❌ [UpstreamError] Failed to get all temp unavailable statuses:', error) diff --git a/web/admin-spa/src/views/AccountsView.vue b/web/admin-spa/src/views/AccountsView.vue index dc61685b..8c8c1867 100644 --- a/web/admin-spa/src/views/AccountsView.vue +++ b/web/admin-spa/src/views/AccountsView.vue @@ -756,11 +756,23 @@ > 临时暂停 - ({{ formatTempUnavailableTime(account.tempUnavailable.ttl) }}) + + ({{ + formatTempUnavailableTime( + getTempUnavailableRemainingSeconds(account.tempUnavailable) + ) + }} + / + {{ + formatTempUnavailableTime( + getTempUnavailableCooldownSeconds(account.tempUnavailable) + ) + }}) + @@ -2328,6 +2340,38 @@ const platformToAccountType = (platform) => { if (platform === 'azure_openai') return 'azure-openai' return platform } + +const TEMP_UNAVAILABLE_ACCOUNT_TYPE_ALIASES = { + claude: ['claude-official', 'claude'], + 'claude-console': ['claude-console'], + bedrock: ['bedrock'], + gemini: ['gemini'], + 'gemini-api': ['gemini-api'], + openai: ['openai'], + 'openai-responses': ['openai-responses'], + ccr: ['ccr'], + droid: ['droid'], + azure_openai: ['azure-openai'], + 'azure-openai': ['azure-openai'] +} + +const resolveTempUnavailableStatusForAccount = (tempStatuses, account) => { + if (!tempStatuses || !account) return null + + const accountTypeAliases = TEMP_UNAVAILABLE_ACCOUNT_TYPE_ALIASES[account.platform] || [ + account.platform + ] + + for (const accountType of accountTypeAliases) { + const key = `${accountType}:${account.id}` + if (tempStatuses[key]) { + return tempStatuses[key] + } + } + + return null +} + const openErrorHistory = (account) => { errorHistoryTarget.value = { accountType: platformToAccountType(account.platform), @@ -3437,23 +3481,7 @@ const loadAccounts = async (forceReload = false) => { if (tempRes?.success && tempRes.data) { const tempStatuses = tempRes.data filteredAccounts = filteredAccounts.map((account) => { - // 尝试匹配 accountType:accountId - const platformTypeMap = { - claude: 'claude-official', - 'claude-console': 'claude-console', - bedrock: 'bedrock', - gemini: 'gemini', - 'gemini-api': 'gemini-api', - openai: 'openai', - 'openai-responses': 'openai-responses', - ccr: 'ccr', - droid: 'droid', - azure_openai: 'azure-openai', - 'azure-openai': 'azure-openai' - } - const accountType = platformTypeMap[account.platform] || account.platform - const key = `${accountType}:${account.id}` - const tempStatus = tempStatuses[key] + const tempStatus = resolveTempUnavailableStatusForAccount(tempStatuses, account) if (tempStatus) { return { ...account, tempUnavailable: tempStatus } } @@ -3757,6 +3785,83 @@ const formatTempUnavailableTime = (seconds) => { return `${secs}s` } +const toPositiveInteger = (value) => { + const parsed = Number(value) + return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : 0 +} + +const getTempUnavailableRemainingSeconds = (tempUnavailable) => { + if (!tempUnavailable) return 0 + return toPositiveInteger(tempUnavailable.remainingSeconds || tempUnavailable.ttl) +} + +const getTempUnavailableCooldownSeconds = (tempUnavailable) => { + if (!tempUnavailable) return 0 + return toPositiveInteger(tempUnavailable.cooldownSeconds) +} + +const getTempUnavailableRecoveryAt = (tempUnavailable) => { + if (!tempUnavailable) return '' + + if (tempUnavailable.expiresAt) { + const expiresAt = new Date(tempUnavailable.expiresAt) + if (!Number.isNaN(expiresAt.getTime())) { + return tempUnavailable.expiresAt + } + } + + if (tempUnavailable.markedAt) { + const markedAt = new Date(tempUnavailable.markedAt) + const cooldownSeconds = getTempUnavailableCooldownSeconds(tempUnavailable) + if (!Number.isNaN(markedAt.getTime()) && cooldownSeconds > 0) { + return new Date(markedAt.getTime() + cooldownSeconds * 1000).toISOString() + } + } + + return '' +} + +const formatTempUnavailableRecoveryAt = (tempUnavailable) => { + const recoveryAt = getTempUnavailableRecoveryAt(tempUnavailable) + if (!recoveryAt) return '' + + const recoveryDate = new Date(recoveryAt) + if (Number.isNaN(recoveryDate.getTime())) return '' + + const month = `${recoveryDate.getMonth() + 1}`.padStart(2, '0') + const day = `${recoveryDate.getDate()}`.padStart(2, '0') + const hours = `${recoveryDate.getHours()}`.padStart(2, '0') + const minutes = `${recoveryDate.getMinutes()}`.padStart(2, '0') + const seconds = `${recoveryDate.getSeconds()}`.padStart(2, '0') + return `${month}-${day} ${hours}:${minutes}:${seconds}` +} + +const getTempUnavailableTooltipContent = (tempUnavailable) => { + if (!tempUnavailable) return '' + + const details = [] + const statusCodeText = tempUnavailable.statusCode ? `HTTP ${tempUnavailable.statusCode}` : '' + const errorTypeText = tempUnavailable.errorType || 'upstream_error' + details.push(`${errorTypeText}${statusCodeText ? ` (${statusCodeText})` : ''}`) + + const cooldownSeconds = getTempUnavailableCooldownSeconds(tempUnavailable) + if (cooldownSeconds > 0) { + details.push(`内部冷却 ${formatTempUnavailableTime(cooldownSeconds)}`) + } + + const remainingSeconds = getTempUnavailableRemainingSeconds(tempUnavailable) + if (remainingSeconds > 0) { + details.push(`剩余 ${formatTempUnavailableTime(remainingSeconds)}`) + } + + const recoveryAtText = formatTempUnavailableRecoveryAt(tempUnavailable) + if (recoveryAtText) { + details.push(`预计恢复 ${recoveryAtText}`) + } + + return details.join(',') +} + // 检查账户是否被限流 const isAccountRateLimited = (account) => { if (!account) return false @@ -4488,12 +4593,25 @@ const getRoutingBlockReasons = (account) => { } if (account.tempUnavailable) { - const ttl = Number.isFinite(account.tempUnavailable.ttl) - ? formatTempUnavailableTime(account.tempUnavailable.ttl) - : '' + const cooldownSeconds = getTempUnavailableCooldownSeconds(account.tempUnavailable) + const remainingSeconds = getTempUnavailableRemainingSeconds(account.tempUnavailable) + const recoveryAtText = formatTempUnavailableRecoveryAt(account.tempUnavailable) + + const detailParts = [] + if (cooldownSeconds > 0) { + detailParts.push(`内部冷却 ${formatTempUnavailableTime(cooldownSeconds)}`) + } + if (remainingSeconds > 0) { + detailParts.push(`剩余 ${formatTempUnavailableTime(remainingSeconds)}`) + } + if (recoveryAtText) { + detailParts.push(`预计恢复 ${recoveryAtText}`) + } + + const detailText = detailParts.length > 0 ? `,${detailParts.join(',')}` : '' const tempReason = account.tempUnavailable.errorType - ? `临时暂停(${account.tempUnavailable.errorType}${account.tempUnavailable.statusCode ? ` / HTTP ${account.tempUnavailable.statusCode}` : ''}${ttl ? `,剩余 ${ttl}` : ''})` - : `临时暂停${ttl ? `(剩余 ${ttl})` : ''}` + ? `临时暂停(${account.tempUnavailable.errorType}${account.tempUnavailable.statusCode ? ` / HTTP ${account.tempUnavailable.statusCode}` : ''}${detailText})` + : `临时暂停${detailParts.length > 0 ? `(${detailParts.join(',')})` : ''}` reasons.push(tempReason) }