mirror of
https://github.com/Wei-Shaw/claude-relay-service.git
synced 2026-03-29 23:14:57 +00:00
fix: expose cooldown details and make 503 backoff configurable
This commit is contained in:
@@ -156,7 +156,8 @@ ENABLE_CORS=true
|
||||
TRUST_PROXY=true
|
||||
|
||||
# ⏱️ 上游错误自动暂停配置(秒)
|
||||
# UPSTREAM_ERROR_5XX_TTL_SECONDS=300 # 5xx错误暂停时间(默认5分钟)
|
||||
# UPSTREAM_ERROR_503_TTL_SECONDS=60 # 503错误暂停时间(默认60秒)
|
||||
# UPSTREAM_ERROR_5XX_TTL_SECONDS=300 # 500/502等5xx错误暂停时间(默认5分钟)
|
||||
# UPSTREAM_ERROR_OVERLOAD_TTL_SECONDS=600 # 529过载暂停时间(默认10分钟)
|
||||
# UPSTREAM_ERROR_AUTH_TTL_SECONDS=1800 # 401/403认证错误暂停时间(默认30分钟)
|
||||
# UPSTREAM_ERROR_TIMEOUT_TTL_SECONDS=300 # 504超时暂停时间(默认5分钟)
|
||||
|
||||
@@ -232,6 +232,8 @@ const config = {
|
||||
|
||||
// ⏱️ 上游错误自动暂停配置
|
||||
upstreamError: {
|
||||
serviceUnavailableTtlSeconds:
|
||||
parseInt(process.env.UPSTREAM_ERROR_503_TTL_SECONDS) || 60, // 503错误暂停秒数
|
||||
serverErrorTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_5XX_TTL_SECONDS) || 300, // 5xx错误暂停秒数
|
||||
overloadTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_OVERLOAD_TTL_SECONDS) || 600, // 529过载暂停秒数
|
||||
authErrorTtlSeconds: parseInt(process.env.UPSTREAM_ERROR_AUTH_TTL_SECONDS) || 1800, // 401/403认证错误暂停秒数
|
||||
|
||||
@@ -2886,13 +2886,14 @@ class ClaudeRelayService {
|
||||
`⏱️ ${prefix}${isTimeout ? 'Timeout' : 'Server'} error for account ${accountId}, error count: ${errorCount}/${threshold}`
|
||||
)
|
||||
|
||||
// 标记账户为临时不可用(5分钟)
|
||||
// 标记账户为临时不可用(TTL 由 upstreamError 配置决定)
|
||||
try {
|
||||
await unifiedClaudeScheduler.markAccountTemporarilyUnavailable(
|
||||
accountId,
|
||||
accountType,
|
||||
sessionHash,
|
||||
300
|
||||
null,
|
||||
statusCode
|
||||
)
|
||||
} catch (markError) {
|
||||
logger.error(`❌ Failed to mark account temporarily unavailable: ${accountId}`, markError)
|
||||
|
||||
@@ -1299,10 +1299,11 @@ class UnifiedClaudeScheduler {
|
||||
accountId,
|
||||
accountType,
|
||||
sessionHash = null,
|
||||
ttlSeconds = 300
|
||||
ttlSeconds = null,
|
||||
statusCode = 500
|
||||
) {
|
||||
try {
|
||||
await upstreamErrorHelper.markTempUnavailable(accountId, accountType, 500, ttlSeconds)
|
||||
await upstreamErrorHelper.markTempUnavailable(accountId, accountType, statusCode, ttlSeconds)
|
||||
if (sessionHash) {
|
||||
await this._deleteSessionMapping(sessionHash)
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ const ERROR_HISTORY_TTL = 3 * 24 * 60 * 60 // 3天
|
||||
// 默认 TTL(秒)
|
||||
const DEFAULT_TTL = {
|
||||
server_error: 300, // 5xx: 5分钟
|
||||
service_unavailable: 60, // 503: 1分钟(默认更短,避免短暂抖动导致长时间不可路由)
|
||||
overload: 600, // 529: 10分钟
|
||||
auth_error: 1800, // 401/403: 30分钟
|
||||
timeout: 300, // 504/网络超时: 5分钟
|
||||
@@ -29,7 +30,16 @@ const getConfig = () => {
|
||||
|
||||
const getTtlConfig = () => {
|
||||
const config = getConfig()
|
||||
// Read an environment variable as a base-10 integer.
// Returns the parsed value only when it is a finite number greater than zero;
// a missing, non-numeric, zero or negative value yields null so that callers
// can chain further fallbacks with `??`.
const parseEnvPositiveInt = (name) => {
  const value = parseInt(process.env[name], 10)
  return Number.isFinite(value) && value > 0 ? value : null
}
|
||||
|
||||
return {
|
||||
service_unavailable:
|
||||
config.upstreamError?.serviceUnavailableTtlSeconds ??
|
||||
parseEnvPositiveInt('UPSTREAM_ERROR_503_TTL_SECONDS') ??
|
||||
DEFAULT_TTL.service_unavailable,
|
||||
server_error: config.upstreamError?.serverErrorTtlSeconds ?? DEFAULT_TTL.server_error,
|
||||
overload: config.upstreamError?.overloadTtlSeconds ?? DEFAULT_TTL.overload,
|
||||
auth_error: config.upstreamError?.authErrorTtlSeconds ?? DEFAULT_TTL.auth_error,
|
||||
@@ -52,6 +62,9 @@ const classifyError = (statusCode) => {
|
||||
if (statusCode === 529) {
|
||||
return 'overload'
|
||||
}
|
||||
if (statusCode === 503) {
|
||||
return 'service_unavailable'
|
||||
}
|
||||
if (statusCode === 504) {
|
||||
return 'timeout'
|
||||
}
|
||||
@@ -204,7 +217,13 @@ const markTempUnavailable = async (
|
||||
}
|
||||
|
||||
const ttlConfig = getTtlConfig()
|
||||
const ttlSeconds = customTtl ?? ttlConfig[errorType]
|
||||
const parsedCustomTtl = Number(customTtl)
|
||||
const ttlSeconds =
|
||||
Number.isFinite(parsedCustomTtl) && parsedCustomTtl > 0
|
||||
? Math.ceil(parsedCustomTtl)
|
||||
: ttlConfig[errorType]
|
||||
const markedAtIso = new Date().toISOString()
|
||||
const expiresAtIso = new Date(Date.now() + ttlSeconds * 1000).toISOString()
|
||||
|
||||
const redis = getRedis()
|
||||
const client = redis.getClientSafe()
|
||||
@@ -215,18 +234,21 @@ const markTempUnavailable = async (
|
||||
JSON.stringify({
|
||||
statusCode,
|
||||
errorType,
|
||||
markedAt: new Date().toISOString()
|
||||
markedAt: markedAtIso,
|
||||
ttlSeconds,
|
||||
cooldownSeconds: ttlSeconds,
|
||||
expiresAt: expiresAtIso
|
||||
})
|
||||
)
|
||||
|
||||
logger.warn(
|
||||
`⏱️ [UpstreamError] Account ${accountId} (${accountType}) marked temporarily unavailable for ${ttlSeconds}s (${statusCode} ${errorType})`
|
||||
`⏱️ [UpstreamError] Account ${accountId} (${accountType}) marked temporarily unavailable for ${ttlSeconds}s (${statusCode} ${errorType}), recovers at ${expiresAtIso}`
|
||||
)
|
||||
|
||||
// 异步记录错误历史,不阻塞主流程
|
||||
recordErrorHistory(accountId, accountType, statusCode, errorType, context).catch(() => {})
|
||||
|
||||
return { success: true, ttlSeconds, errorType }
|
||||
return { success: true, ttlSeconds, errorType, expiresAt: expiresAtIso }
|
||||
} catch (error) {
|
||||
logger.error(
|
||||
`❌ [UpstreamError] Failed to mark account ${accountId} temporarily unavailable:`,
|
||||
@@ -242,7 +264,22 @@ const isTempUnavailable = async (accountId, accountType) => {
|
||||
const redis = getRedis()
|
||||
const client = redis.getClientSafe()
|
||||
const key = `${TEMP_UNAVAILABLE_PREFIX}:${accountType}:${accountId}`
|
||||
return (await client.exists(key)) === 1
|
||||
const ttl = await client.ttl(key)
|
||||
|
||||
if (ttl === -2) {
|
||||
return false
|
||||
}
|
||||
|
||||
if (ttl === -1) {
|
||||
// 理论上该 key 必须带 TTL;如果无 TTL,自动清理以避免“永久不可用”
|
||||
logger.warn(
|
||||
`⚠️ [UpstreamError] Found temp_unavailable key without TTL for account ${accountId} (${accountType}), auto-clearing`
|
||||
)
|
||||
await client.del(key)
|
||||
return false
|
||||
}
|
||||
|
||||
return ttl > 0
|
||||
} catch (error) {
|
||||
logger.error(
|
||||
`❌ [UpstreamError] Failed to check temp unavailable status for ${accountId}:`,
|
||||
@@ -281,6 +318,7 @@ const getAllTempUnavailable = async () => {
|
||||
pipeline.ttl(key)
|
||||
}
|
||||
const results = await pipeline.exec()
|
||||
const cleanupPipeline = client.pipeline()
|
||||
|
||||
const statuses = {}
|
||||
for (let i = 0; i < keys.length; i++) {
|
||||
@@ -295,21 +333,40 @@ const getAllTempUnavailable = async () => {
|
||||
continue
|
||||
}
|
||||
|
||||
if (ttl === -1) {
|
||||
// 自愈:清理无 TTL 的异常键,避免账户被永久阻塞
|
||||
cleanupPipeline.del(key)
|
||||
continue
|
||||
}
|
||||
|
||||
try {
|
||||
const data = JSON.parse(value)
|
||||
const compositeKey = `${accountType}:${accountId}`
|
||||
const cooldownSecondsRaw = Number(data.cooldownSeconds)
|
||||
const ttlSecondsRaw = Number(data.ttlSeconds)
|
||||
const configuredCooldownSeconds = Number.isFinite(cooldownSecondsRaw)
|
||||
? Math.max(0, Math.floor(cooldownSecondsRaw))
|
||||
: Number.isFinite(ttlSecondsRaw)
|
||||
? Math.max(0, Math.floor(ttlSecondsRaw))
|
||||
: null
|
||||
|
||||
statuses[compositeKey] = {
|
||||
accountId,
|
||||
accountType,
|
||||
statusCode: data.statusCode,
|
||||
errorType: data.errorType,
|
||||
markedAt: data.markedAt,
|
||||
ttl: ttl > 0 ? ttl : 0
|
||||
ttl: ttl > 0 ? ttl : 0,
|
||||
remainingSeconds: ttl > 0 ? ttl : 0,
|
||||
cooldownSeconds: configuredCooldownSeconds,
|
||||
expiresAt: data.expiresAt || null
|
||||
}
|
||||
} catch {
|
||||
// ignore parse errors
|
||||
}
|
||||
}
|
||||
|
||||
await cleanupPipeline.exec().catch(() => {})
|
||||
return statuses
|
||||
} catch (error) {
|
||||
logger.error('❌ [UpstreamError] Failed to get all temp unavailable statuses:', error)
|
||||
|
||||
@@ -756,11 +756,23 @@
|
||||
>
|
||||
<i class="fas fa-clock mr-1" />
|
||||
临时暂停
|
||||
<span v-if="account.tempUnavailable.ttl > 0"
|
||||
>({{ formatTempUnavailableTime(account.tempUnavailable.ttl) }})</span
|
||||
>
|
||||
<span v-if="getTempUnavailableRemainingSeconds(account.tempUnavailable) > 0">
|
||||
({{
|
||||
formatTempUnavailableTime(
|
||||
getTempUnavailableRemainingSeconds(account.tempUnavailable)
|
||||
)
|
||||
}}
|
||||
<span v-if="getTempUnavailableCooldownSeconds(account.tempUnavailable) > 0"
|
||||
>/
|
||||
{{
|
||||
formatTempUnavailableTime(
|
||||
getTempUnavailableCooldownSeconds(account.tempUnavailable)
|
||||
)
|
||||
}}</span
|
||||
>)
|
||||
</span>
|
||||
<el-tooltip
|
||||
:content="`${account.tempUnavailable.errorType} (HTTP ${account.tempUnavailable.statusCode})`"
|
||||
:content="getTempUnavailableTooltipContent(account.tempUnavailable)"
|
||||
effect="dark"
|
||||
placement="top"
|
||||
>
|
||||
@@ -2328,6 +2340,38 @@ const platformToAccountType = (platform) => {
|
||||
if (platform === 'azure_openai') return 'azure-openai'
|
||||
return platform
|
||||
}
|
||||
|
||||
// Maps a UI platform name to the account-type prefixes that may appear in the
// temp-unavailable status map. Candidates are tried in the listed order.
const TEMP_UNAVAILABLE_ACCOUNT_TYPE_ALIASES = {
  claude: ['claude-official', 'claude'],
  'claude-console': ['claude-console'],
  bedrock: ['bedrock'],
  gemini: ['gemini'],
  'gemini-api': ['gemini-api'],
  openai: ['openai'],
  'openai-responses': ['openai-responses'],
  ccr: ['ccr'],
  droid: ['droid'],
  azure_openai: ['azure-openai'],
  'azure-openai': ['azure-openai']
}

/**
 * Look up the temp-unavailable status entry that belongs to an account.
 *
 * @param {Object|null|undefined} tempStatuses - status map keyed by `${accountType}:${accountId}`
 * @param {Object|null|undefined} account - account record; `platform` and `id` are read
 * @returns {Object|null} the first matching status entry, or null when none matches
 */
const resolveTempUnavailableStatusForAccount = (tempStatuses, account) => {
  if (!tempStatuses || !account) return null

  // Only consult the table's own keys. A plain property read would also hit
  // inherited members (e.g. platform "constructor"), returning a non-iterable
  // value and making the loop below throw.
  const hasAliases = Object.prototype.hasOwnProperty.call(
    TEMP_UNAVAILABLE_ACCOUNT_TYPE_ALIASES,
    account.platform
  )
  const accountTypeAliases = hasAliases
    ? TEMP_UNAVAILABLE_ACCOUNT_TYPE_ALIASES[account.platform]
    : [account.platform]

  for (const accountType of accountTypeAliases) {
    const key = `${accountType}:${account.id}`
    if (tempStatuses[key]) {
      return tempStatuses[key]
    }
  }

  return null
}
|
||||
|
||||
const openErrorHistory = (account) => {
|
||||
errorHistoryTarget.value = {
|
||||
accountType: platformToAccountType(account.platform),
|
||||
@@ -3437,23 +3481,7 @@ const loadAccounts = async (forceReload = false) => {
|
||||
if (tempRes?.success && tempRes.data) {
|
||||
const tempStatuses = tempRes.data
|
||||
filteredAccounts = filteredAccounts.map((account) => {
|
||||
// 尝试匹配 accountType:accountId
|
||||
const platformTypeMap = {
|
||||
claude: 'claude-official',
|
||||
'claude-console': 'claude-console',
|
||||
bedrock: 'bedrock',
|
||||
gemini: 'gemini',
|
||||
'gemini-api': 'gemini-api',
|
||||
openai: 'openai',
|
||||
'openai-responses': 'openai-responses',
|
||||
ccr: 'ccr',
|
||||
droid: 'droid',
|
||||
azure_openai: 'azure-openai',
|
||||
'azure-openai': 'azure-openai'
|
||||
}
|
||||
const accountType = platformTypeMap[account.platform] || account.platform
|
||||
const key = `${accountType}:${account.id}`
|
||||
const tempStatus = tempStatuses[key]
|
||||
const tempStatus = resolveTempUnavailableStatusForAccount(tempStatuses, account)
|
||||
if (tempStatus) {
|
||||
return { ...account, tempUnavailable: tempStatus }
|
||||
}
|
||||
@@ -3757,6 +3785,83 @@ const formatTempUnavailableTime = (seconds) => {
|
||||
return `${secs}s`
|
||||
}
|
||||
|
||||
/**
 * Coerce a value to a whole number of seconds.
 *
 * @param {*} value - anything accepted by Number()
 * @returns {number} the value rounded down when it is finite and greater than zero, otherwise 0
 */
const toPositiveInteger = (value) => {
  const parsed = Number(value)
  return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : 0
}

/**
 * Seconds left before a temporarily unavailable account becomes routable again.
 *
 * @param {Object|null|undefined} tempUnavailable - status entry; `remainingSeconds` and `ttl` are read
 * @returns {number} `remainingSeconds` when usable, else `ttl`, else 0
 */
const getTempUnavailableRemainingSeconds = (tempUnavailable) => {
  if (!tempUnavailable) return 0
  // Normalize each candidate before falling back, so an unusable
  // `remainingSeconds` (negative, non-numeric, below one second) cannot
  // mask a valid `ttl`.
  return (
    toPositiveInteger(tempUnavailable.remainingSeconds) || toPositiveInteger(tempUnavailable.ttl)
  )
}

/**
 * Cooldown length, in seconds, recorded on the status entry.
 *
 * @param {Object|null|undefined} tempUnavailable - status entry; `cooldownSeconds` is read
 * @returns {number} normalized cooldown, or 0 when absent or invalid
 */
const getTempUnavailableCooldownSeconds = (tempUnavailable) => {
  if (!tempUnavailable) return 0
  return toPositiveInteger(tempUnavailable.cooldownSeconds)
}
|
||||
|
||||
/**
 * Determine when a temporarily unavailable account is expected to recover.
 *
 * @param {Object|null|undefined} tempUnavailable - status entry; `expiresAt`, `markedAt`
 *   and (through getTempUnavailableCooldownSeconds) `cooldownSeconds` are read
 * @returns {string} `expiresAt` unchanged when it parses as a date; otherwise an ISO
 *   timestamp computed as markedAt + cooldown; '' when neither can be determined
 */
const getTempUnavailableRecoveryAt = (tempUnavailable) => {
  if (!tempUnavailable) return ''

  // Prefer the explicit expiry carried on the entry.
  if (tempUnavailable.expiresAt) {
    const expiresAt = new Date(tempUnavailable.expiresAt)
    if (!Number.isNaN(expiresAt.getTime())) {
      return tempUnavailable.expiresAt
    }
  }

  // Fallback for entries without a usable expiresAt: derive it from markedAt + cooldown.
  if (tempUnavailable.markedAt) {
    const markedAt = new Date(tempUnavailable.markedAt)
    const cooldownSeconds = getTempUnavailableCooldownSeconds(tempUnavailable)
    if (!Number.isNaN(markedAt.getTime()) && cooldownSeconds > 0) {
      return new Date(markedAt.getTime() + cooldownSeconds * 1000).toISOString()
    }
  }

  return ''
}
|
||||
|
||||
/**
 * Format the expected recovery time of a temporarily unavailable account.
 *
 * @param {Object|null|undefined} tempUnavailable - status entry passed on to
 *   getTempUnavailableRecoveryAt
 * @returns {string} `MM-DD HH:mm:ss` in the viewer's local time zone, or '' when no
 *   valid recovery time is available
 */
const formatTempUnavailableRecoveryAt = (tempUnavailable) => {
  const recoveryAt = getTempUnavailableRecoveryAt(tempUnavailable)
  if (!recoveryAt) return ''

  const recoveryDate = new Date(recoveryAt)
  if (Number.isNaN(recoveryDate.getTime())) return ''

  // Zero-pad every component to two digits.
  const pad = (part) => `${part}`.padStart(2, '0')

  // getMonth() is zero-based, hence the +1.
  const month = pad(recoveryDate.getMonth() + 1)
  const day = pad(recoveryDate.getDate())
  const hours = pad(recoveryDate.getHours())
  const minutes = pad(recoveryDate.getMinutes())
  const seconds = pad(recoveryDate.getSeconds())
  return `${month}-${day} ${hours}:${minutes}:${seconds}`
}
|
||||
|
||||
/**
 * Build the tooltip text for a temporarily unavailable account.
 *
 * The text always starts with the error type (defaulting to 'upstream_error') and,
 * when present, the HTTP status code. Cooldown length, remaining time and expected
 * recovery time are appended only when each is available. Parts are joined with a
 * full-width comma.
 *
 * @param {Object|null|undefined} tempUnavailable - status entry
 * @returns {string} tooltip text, or '' when there is no entry
 */
const getTempUnavailableTooltipContent = (tempUnavailable) => {
  if (!tempUnavailable) return ''

  const details = []
  const statusCodeText = tempUnavailable.statusCode ? `HTTP ${tempUnavailable.statusCode}` : ''
  const errorTypeText = tempUnavailable.errorType || 'upstream_error'
  details.push(`${errorTypeText}${statusCodeText ? ` (${statusCodeText})` : ''}`)

  const cooldownSeconds = getTempUnavailableCooldownSeconds(tempUnavailable)
  if (cooldownSeconds > 0) {
    details.push(`内部冷却 ${formatTempUnavailableTime(cooldownSeconds)}`)
  }

  const remainingSeconds = getTempUnavailableRemainingSeconds(tempUnavailable)
  if (remainingSeconds > 0) {
    details.push(`剩余 ${formatTempUnavailableTime(remainingSeconds)}`)
  }

  const recoveryAtText = formatTempUnavailableRecoveryAt(tempUnavailable)
  if (recoveryAtText) {
    details.push(`预计恢复 ${recoveryAtText}`)
  }

  return details.join(',')
}
|
||||
|
||||
// 检查账户是否被限流
|
||||
const isAccountRateLimited = (account) => {
|
||||
if (!account) return false
|
||||
@@ -4488,12 +4593,25 @@ const getRoutingBlockReasons = (account) => {
|
||||
}
|
||||
|
||||
if (account.tempUnavailable) {
|
||||
const ttl = Number.isFinite(account.tempUnavailable.ttl)
|
||||
? formatTempUnavailableTime(account.tempUnavailable.ttl)
|
||||
: ''
|
||||
const cooldownSeconds = getTempUnavailableCooldownSeconds(account.tempUnavailable)
|
||||
const remainingSeconds = getTempUnavailableRemainingSeconds(account.tempUnavailable)
|
||||
const recoveryAtText = formatTempUnavailableRecoveryAt(account.tempUnavailable)
|
||||
|
||||
const detailParts = []
|
||||
if (cooldownSeconds > 0) {
|
||||
detailParts.push(`内部冷却 ${formatTempUnavailableTime(cooldownSeconds)}`)
|
||||
}
|
||||
if (remainingSeconds > 0) {
|
||||
detailParts.push(`剩余 ${formatTempUnavailableTime(remainingSeconds)}`)
|
||||
}
|
||||
if (recoveryAtText) {
|
||||
detailParts.push(`预计恢复 ${recoveryAtText}`)
|
||||
}
|
||||
|
||||
const detailText = detailParts.length > 0 ? `,${detailParts.join(',')}` : ''
|
||||
const tempReason = account.tempUnavailable.errorType
|
||||
? `临时暂停(${account.tempUnavailable.errorType}${account.tempUnavailable.statusCode ? ` / HTTP ${account.tempUnavailable.statusCode}` : ''}${ttl ? `,剩余 ${ttl}` : ''})`
|
||||
: `临时暂停${ttl ? `(剩余 ${ttl})` : ''}`
|
||||
? `临时暂停(${account.tempUnavailable.errorType}${account.tempUnavailable.statusCode ? ` / HTTP ${account.tempUnavailable.statusCode}` : ''}${detailText})`
|
||||
: `临时暂停${detailParts.length > 0 ? `(${detailParts.join(',')})` : ''}`
|
||||
reasons.push(tempReason)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user