fix: expose cooldown details and make 503 backoff configurable

This commit is contained in:
X-Zero-L
2026-03-02 21:53:18 +08:00
parent 96706c27f6
commit e611f97dae
6 changed files with 217 additions and 37 deletions

View File

@@ -156,7 +156,8 @@ ENABLE_CORS=true
TRUST_PROXY=true
# ⏱️ 上游错误自动暂停配置(秒)
# UPSTREAM_ERROR_5XX_TTL_SECONDS=300 # 5xx错误暂停时间(默认5分钟)
# UPSTREAM_ERROR_503_TTL_SECONDS=60 # 503错误暂停时间(默认60秒)
# UPSTREAM_ERROR_5XX_TTL_SECONDS=300 # 500/502等5xx错误暂停时间(默认5分钟)
# UPSTREAM_ERROR_OVERLOAD_TTL_SECONDS=600 # 529过载暂停时间(默认10分钟)
# UPSTREAM_ERROR_AUTH_TTL_SECONDS=1800 # 401/403认证错误暂停时间(默认30分钟)
# UPSTREAM_ERROR_TIMEOUT_TTL_SECONDS=300 # 504超时暂停时间(默认5分钟)

View File

@@ -232,6 +232,8 @@ const config = {
// ⏱️ 上游错误自动暂停配置
upstreamError: {
serviceUnavailableTtlSeconds:
parseInt(process.env.UPSTREAM_ERROR_503_TTL_SECONDS) || 60, // 503错误暂停秒数
serverErrorTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_5XX_TTL_SECONDS) || 300, // 5xx错误暂停秒数
overloadTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_OVERLOAD_TTL_SECONDS) || 600, // 529过载暂停秒数
authErrorTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_AUTH_TTL_SECONDS) || 1800, // 401/403认证错误暂停秒数

View File

@@ -2886,13 +2886,14 @@ class ClaudeRelayService {
`⏱️ ${prefix}${isTimeout ? 'Timeout' : 'Server'} error for account ${accountId}, error count: ${errorCount}/${threshold}`
)
// 标记账户为临时不可用(5分钟
// 标记账户为临时不可用(TTL 由 upstreamError 配置决定
try {
await unifiedClaudeScheduler.markAccountTemporarilyUnavailable(
accountId,
accountType,
sessionHash,
300
null,
statusCode
)
} catch (markError) {
logger.error(`❌ Failed to mark account temporarily unavailable: ${accountId}`, markError)

View File

@@ -1299,10 +1299,11 @@ class UnifiedClaudeScheduler {
accountId,
accountType,
sessionHash = null,
ttlSeconds = 300
ttlSeconds = null,
statusCode = 500
) {
try {
await upstreamErrorHelper.markTempUnavailable(accountId, accountType, 500, ttlSeconds)
await upstreamErrorHelper.markTempUnavailable(accountId, accountType, statusCode, ttlSeconds)
if (sessionHash) {
await this._deleteSessionMapping(sessionHash)
}

View File

@@ -8,6 +8,7 @@ const ERROR_HISTORY_TTL = 3 * 24 * 60 * 60 // 3天
// 默认 TTL
const DEFAULT_TTL = {
server_error: 300, // 5xx: 5分钟
service_unavailable: 60, // 503: 1分钟默认更短避免短暂抖动导致长时间不可路由
overload: 600, // 529: 10分钟
auth_error: 1800, // 401/403: 30分钟
timeout: 300, // 504/网络超时: 5分钟
@@ -29,7 +30,16 @@ const getConfig = () => {
const getTtlConfig = () => {
const config = getConfig()
const parseEnvPositiveInt = (name) => {
const value = parseInt(process.env[name], 10)
return Number.isFinite(value) && value > 0 ? value : null
}
return {
service_unavailable:
config.upstreamError?.serviceUnavailableTtlSeconds ??
parseEnvPositiveInt('UPSTREAM_ERROR_503_TTL_SECONDS') ??
DEFAULT_TTL.service_unavailable,
server_error: config.upstreamError?.serverErrorTtlSeconds ?? DEFAULT_TTL.server_error,
overload: config.upstreamError?.overloadTtlSeconds ?? DEFAULT_TTL.overload,
auth_error: config.upstreamError?.authErrorTtlSeconds ?? DEFAULT_TTL.auth_error,
@@ -52,6 +62,9 @@ const classifyError = (statusCode) => {
if (statusCode === 529) {
return 'overload'
}
if (statusCode === 503) {
return 'service_unavailable'
}
if (statusCode === 504) {
return 'timeout'
}
@@ -204,7 +217,13 @@ const markTempUnavailable = async (
}
const ttlConfig = getTtlConfig()
const ttlSeconds = customTtl ?? ttlConfig[errorType]
const parsedCustomTtl = Number(customTtl)
const ttlSeconds =
Number.isFinite(parsedCustomTtl) && parsedCustomTtl > 0
? Math.ceil(parsedCustomTtl)
: ttlConfig[errorType]
const markedAtIso = new Date().toISOString()
const expiresAtIso = new Date(Date.now() + ttlSeconds * 1000).toISOString()
const redis = getRedis()
const client = redis.getClientSafe()
@@ -215,18 +234,21 @@ const markTempUnavailable = async (
JSON.stringify({
statusCode,
errorType,
markedAt: new Date().toISOString()
markedAt: markedAtIso,
ttlSeconds,
cooldownSeconds: ttlSeconds,
expiresAt: expiresAtIso
})
)
logger.warn(
`⏱️ [UpstreamError] Account ${accountId} (${accountType}) marked temporarily unavailable for ${ttlSeconds}s (${statusCode} ${errorType})`
`⏱️ [UpstreamError] Account ${accountId} (${accountType}) marked temporarily unavailable for ${ttlSeconds}s (${statusCode} ${errorType}), recovers at ${expiresAtIso}`
)
// 异步记录错误历史,不阻塞主流程
recordErrorHistory(accountId, accountType, statusCode, errorType, context).catch(() => {})
return { success: true, ttlSeconds, errorType }
return { success: true, ttlSeconds, errorType, expiresAt: expiresAtIso }
} catch (error) {
logger.error(
`❌ [UpstreamError] Failed to mark account ${accountId} temporarily unavailable:`,
@@ -242,7 +264,22 @@ const isTempUnavailable = async (accountId, accountType) => {
const redis = getRedis()
const client = redis.getClientSafe()
const key = `${TEMP_UNAVAILABLE_PREFIX}:${accountType}:${accountId}`
return (await client.exists(key)) === 1
const ttl = await client.ttl(key)
if (ttl === -2) {
return false
}
if (ttl === -1) {
// 理论上该 key 必须带 TTL如果无 TTL自动清理以避免“永久不可用”
logger.warn(
`⚠️ [UpstreamError] Found temp_unavailable key without TTL for account ${accountId} (${accountType}), auto-clearing`
)
await client.del(key)
return false
}
return ttl > 0
} catch (error) {
logger.error(
`❌ [UpstreamError] Failed to check temp unavailable status for ${accountId}:`,
@@ -281,6 +318,7 @@ const getAllTempUnavailable = async () => {
pipeline.ttl(key)
}
const results = await pipeline.exec()
const cleanupPipeline = client.pipeline()
const statuses = {}
for (let i = 0; i < keys.length; i++) {
@@ -295,21 +333,40 @@ const getAllTempUnavailable = async () => {
continue
}
if (ttl === -1) {
// 自愈:清理无 TTL 的异常键,避免账户被永久阻塞
cleanupPipeline.del(key)
continue
}
try {
const data = JSON.parse(value)
const compositeKey = `${accountType}:${accountId}`
const cooldownSecondsRaw = Number(data.cooldownSeconds)
const ttlSecondsRaw = Number(data.ttlSeconds)
const configuredCooldownSeconds = Number.isFinite(cooldownSecondsRaw)
? Math.max(0, Math.floor(cooldownSecondsRaw))
: Number.isFinite(ttlSecondsRaw)
? Math.max(0, Math.floor(ttlSecondsRaw))
: null
statuses[compositeKey] = {
accountId,
accountType,
statusCode: data.statusCode,
errorType: data.errorType,
markedAt: data.markedAt,
ttl: ttl > 0 ? ttl : 0
ttl: ttl > 0 ? ttl : 0,
remainingSeconds: ttl > 0 ? ttl : 0,
cooldownSeconds: configuredCooldownSeconds,
expiresAt: data.expiresAt || null
}
} catch {
// ignore parse errors
}
}
await cleanupPipeline.exec().catch(() => {})
return statuses
} catch (error) {
logger.error('❌ [UpstreamError] Failed to get all temp unavailable statuses:', error)

View File

@@ -756,11 +756,23 @@
>
<i class="fas fa-clock mr-1" />
临时暂停
<span v-if="account.tempUnavailable.ttl > 0"
>({{ formatTempUnavailableTime(account.tempUnavailable.ttl) }})</span
>
<span v-if="getTempUnavailableRemainingSeconds(account.tempUnavailable) > 0">
({{
formatTempUnavailableTime(
getTempUnavailableRemainingSeconds(account.tempUnavailable)
)
}}
<span v-if="getTempUnavailableCooldownSeconds(account.tempUnavailable) > 0"
>/
{{
formatTempUnavailableTime(
getTempUnavailableCooldownSeconds(account.tempUnavailable)
)
}}</span
>)
</span>
<el-tooltip
:content="`${account.tempUnavailable.errorType} (HTTP ${account.tempUnavailable.statusCode})`"
:content="getTempUnavailableTooltipContent(account.tempUnavailable)"
effect="dark"
placement="top"
>
@@ -2328,6 +2340,38 @@ const platformToAccountType = (platform) => {
if (platform === 'azure_openai') return 'azure-openai'
return platform
}
// Maps an account's `platform` value to the account-type prefixes that may appear
// in a temp-unavailable status key (`<accountType>:<accountId>`). Aliases are tried
// in order and the first one with a status entry wins.
const TEMP_UNAVAILABLE_ACCOUNT_TYPE_ALIASES = {
  claude: ['claude-official', 'claude'],
  'claude-console': ['claude-console'],
  bedrock: ['bedrock'],
  gemini: ['gemini'],
  'gemini-api': ['gemini-api'],
  openai: ['openai'],
  'openai-responses': ['openai-responses'],
  ccr: ['ccr'],
  droid: ['droid'],
  azure_openai: ['azure-openai'],
  'azure-openai': ['azure-openai']
}

/**
 * Find the temp-unavailable status entry that belongs to an account.
 *
 * @param {Object|null|undefined} tempStatuses - Status entries keyed by `<accountType>:<accountId>`.
 * @param {Object|null|undefined} account - Account object; its `platform` and `id` are read.
 * @returns {*} The first truthy status found under any alias of the platform, otherwise null.
 */
const resolveTempUnavailableStatusForAccount = (tempStatuses, account) => {
  if (!tempStatuses || !account) return null

  const mappedAliases = TEMP_UNAVAILABLE_ACCOUNT_TYPE_ALIASES[account.platform]
  // Only accept a real alias list. A platform named e.g. "constructor" would resolve to
  // an inherited Object.prototype member, which is truthy but cannot be iterated.
  const accountTypeAliases = Array.isArray(mappedAliases) ? mappedAliases : [account.platform]

  for (const accountType of accountTypeAliases) {
    const key = `${accountType}:${account.id}`
    if (tempStatuses[key]) {
      return tempStatuses[key]
    }
  }
  return null
}
const openErrorHistory = (account) => {
errorHistoryTarget.value = {
accountType: platformToAccountType(account.platform),
@@ -3437,23 +3481,7 @@ const loadAccounts = async (forceReload = false) => {
if (tempRes?.success && tempRes.data) {
const tempStatuses = tempRes.data
filteredAccounts = filteredAccounts.map((account) => {
// 尝试匹配 accountType:accountId
const platformTypeMap = {
claude: 'claude-official',
'claude-console': 'claude-console',
bedrock: 'bedrock',
gemini: 'gemini',
'gemini-api': 'gemini-api',
openai: 'openai',
'openai-responses': 'openai-responses',
ccr: 'ccr',
droid: 'droid',
azure_openai: 'azure-openai',
'azure-openai': 'azure-openai'
}
const accountType = platformTypeMap[account.platform] || account.platform
const key = `${accountType}:${account.id}`
const tempStatus = tempStatuses[key]
const tempStatus = resolveTempUnavailableStatusForAccount(tempStatuses, account)
if (tempStatus) {
return { ...account, tempUnavailable: tempStatus }
}
@@ -3757,6 +3785,83 @@ const formatTempUnavailableTime = (seconds) => {
return `${secs}s`
}
/**
 * Coerce a value to a non-negative whole number.
 *
 * @param {*} value - Anything accepted by Number().
 * @returns {number} The value rounded down when it is finite and greater than zero, otherwise 0.
 */
const toPositiveInteger = (value) => {
  const numeric = Number(value)
  if (!Number.isFinite(numeric) || numeric <= 0) {
    return 0
  }
  return Math.floor(numeric)
}
/**
 * Read the remaining pause time, in seconds, from a temp-unavailable status.
 *
 * @param {Object|null|undefined} tempUnavailable - Status object; `remainingSeconds` is
 *   preferred and `ttl` is used when `remainingSeconds` is falsy.
 * @returns {number} A whole number of seconds, or 0 when the status is missing or the value is not positive.
 */
const getTempUnavailableRemainingSeconds = (tempUnavailable) => {
  if (!tempUnavailable) {
    return 0
  }
  const rawSeconds = tempUnavailable.remainingSeconds || tempUnavailable.ttl
  return toPositiveInteger(rawSeconds)
}
/**
 * Read the configured cooldown length, in seconds, from a temp-unavailable status.
 *
 * @param {Object|null|undefined} tempUnavailable - Status object; `cooldownSeconds` is read.
 * @returns {number} A whole number of seconds, or 0 when the status is missing or the value is not positive.
 */
const getTempUnavailableCooldownSeconds = (tempUnavailable) =>
  tempUnavailable ? toPositiveInteger(tempUnavailable.cooldownSeconds) : 0
/**
 * Determine when a temporarily unavailable account is expected to recover.
 *
 * A parseable `expiresAt` is returned unchanged. Otherwise the time is derived from
 * `markedAt` plus the cooldown length and returned as an ISO string.
 *
 * @param {Object|null|undefined} tempUnavailable - Status object; `expiresAt`, `markedAt`
 *   and the cooldown length are read.
 * @returns {*} The recovery timestamp, or '' when it cannot be determined.
 */
const getTempUnavailableRecoveryAt = (tempUnavailable) => {
  if (!tempUnavailable) return ''

  const isParseable = (raw) => !Number.isNaN(new Date(raw).getTime())

  const { expiresAt } = tempUnavailable
  if (expiresAt && isParseable(expiresAt)) {
    return expiresAt
  }

  const { markedAt } = tempUnavailable
  if (!markedAt) return ''

  // Fallback: reconstruct the expiry from the mark time and the cooldown length.
  const markedAtMs = new Date(markedAt).getTime()
  const cooldownSeconds = getTempUnavailableCooldownSeconds(tempUnavailable)
  if (Number.isNaN(markedAtMs) || cooldownSeconds <= 0) return ''

  return new Date(markedAtMs + cooldownSeconds * 1000).toISOString()
}
/**
 * Format the expected recovery time of a temp-unavailable status for display.
 *
 * @param {Object|null|undefined} tempUnavailable - Status object passed on to
 *   getTempUnavailableRecoveryAt.
 * @returns {string} `MM-DD HH:mm:ss` built from the Date's local-time getters, or ''
 *   when no valid recovery time is available.
 */
const formatTempUnavailableRecoveryAt = (tempUnavailable) => {
  const recoveryAt = getTempUnavailableRecoveryAt(tempUnavailable)
  if (!recoveryAt) return ''

  const when = new Date(recoveryAt)
  if (Number.isNaN(when.getTime())) return ''

  const twoDigits = (part) => String(part).padStart(2, '0')
  // getMonth() is zero-based, hence the +1.
  const datePart = [when.getMonth() + 1, when.getDate()].map((part) => twoDigits(part)).join('-')
  const timePart = [when.getHours(), when.getMinutes(), when.getSeconds()]
    .map((part) => twoDigits(part))
    .join(':')

  return `${datePart} ${timePart}`
}
/**
 * Build the tooltip text describing why and for how long an account is paused.
 *
 * The text always starts with the error type (defaulting to `upstream_error`) and the
 * HTTP status when present. Cooldown length, remaining time and expected recovery time
 * are appended only when they are available.
 *
 * @param {Object|null|undefined} tempUnavailable - Temp-unavailable status object.
 * @returns {string} The tooltip segments joined by a fullwidth comma, or '' when there is no status.
 */
const getTempUnavailableTooltipContent = (tempUnavailable) => {
  if (!tempUnavailable) return ''

  const statusCodeText = tempUnavailable.statusCode ? `HTTP ${tempUnavailable.statusCode}` : ''
  const errorTypeText = tempUnavailable.errorType || 'upstream_error'
  const details = [`${errorTypeText}${statusCodeText ? ` (${statusCodeText})` : ''}`]

  const cooldownSeconds = getTempUnavailableCooldownSeconds(tempUnavailable)
  if (cooldownSeconds > 0) {
    details.push(`内部冷却 ${formatTempUnavailableTime(cooldownSeconds)}`)
  }

  const remainingSeconds = getTempUnavailableRemainingSeconds(tempUnavailable)
  if (remainingSeconds > 0) {
    details.push(`剩余 ${formatTempUnavailableTime(remainingSeconds)}`)
  }

  const recoveryAtText = formatTempUnavailableRecoveryAt(tempUnavailable)
  if (recoveryAtText) {
    details.push(`预计恢复 ${recoveryAtText}`)
  }

  // Separate the segments; joining with an empty string ran them together unreadably.
  return details.join(',')
}
// 检查账户是否被限流
const isAccountRateLimited = (account) => {
if (!account) return false
@@ -4488,12 +4593,25 @@ const getRoutingBlockReasons = (account) => {
}
if (account.tempUnavailable) {
const ttl = Number.isFinite(account.tempUnavailable.ttl)
? formatTempUnavailableTime(account.tempUnavailable.ttl)
: ''
const cooldownSeconds = getTempUnavailableCooldownSeconds(account.tempUnavailable)
const remainingSeconds = getTempUnavailableRemainingSeconds(account.tempUnavailable)
const recoveryAtText = formatTempUnavailableRecoveryAt(account.tempUnavailable)
const detailParts = []
if (cooldownSeconds > 0) {
detailParts.push(`内部冷却 ${formatTempUnavailableTime(cooldownSeconds)}`)
}
if (remainingSeconds > 0) {
detailParts.push(`剩余 ${formatTempUnavailableTime(remainingSeconds)}`)
}
if (recoveryAtText) {
detailParts.push(`预计恢复 ${recoveryAtText}`)
}
const detailText = detailParts.length > 0 ? `${detailParts.join('')}` : ''
const tempReason = account.tempUnavailable.errorType
? `临时暂停(${account.tempUnavailable.errorType}${account.tempUnavailable.statusCode ? ` / HTTP ${account.tempUnavailable.statusCode}` : ''}${ttl ? `,剩余 ${ttl}` : ''}`
: `临时暂停${ttl ? `剩余 ${ttl}` : ''}`
? `临时暂停(${account.tempUnavailable.errorType}${account.tempUnavailable.statusCode ? ` / HTTP ${account.tempUnavailable.statusCode}` : ''}${detailText}`
: `临时暂停${detailParts.length > 0 ? `${detailParts.join('')}` : ''}`
reasons.push(tempReason)
}