mirror of
https://github.com/Wei-Shaw/claude-relay-service.git
synced 2026-01-23 09:06:18 +00:00
feat: enhance concurrency queue with health check and admin endpoints
- Add queue health check for fast-fail when overloaded (P90 > threshold) - Implement socket identity verification with UUID token - Add wait time statistics (P50/P90/P99) and queue stats tracking - Add admin endpoints for queue stats and cleanup - Add CLEAR_CONCURRENCY_QUEUES_ON_STARTUP config option - Update documentation with troubleshooting and proxy config guide
This commit is contained in:
@@ -50,6 +50,18 @@ function getWeekStringInTimezone(date = new Date()) {
|
||||
return `${year}-W${String(weekNumber).padStart(2, '0')}`
|
||||
}
|
||||
|
||||
// Concurrency-queue related constants
const QUEUE_STATS_TTL_SECONDS = 86400 * 7 // retain stats counters for 7 days
const WAIT_TIME_TTL_SECONDS = 86400 // keep wait-time samples for 1 day (rolling window, no long-term retention needed)
// Wait-time sample-count configuration (raises statistical confidence)
// - Per API Key raised from 100 to 500: gives a more stable P99 estimate
// - Global raised from 500 to 2000: supports higher-precision P99.9 analysis
// - Memory overhead ~12-20KB (Redis quicklist, 1-10 bytes per element) — acceptable
// See design.md Decision 5: wait-time statistics sample counts
const WAIT_TIME_SAMPLES_PER_KEY = 500 // wait-time samples retained per API Key
const WAIT_TIME_SAMPLES_GLOBAL = 2000 // wait-time samples retained globally
const QUEUE_TTL_BUFFER_SECONDS = 30 // TTL buffer for queue counters
|
||||
|
||||
class RedisClient {
|
||||
constructor() {
|
||||
this.client = null
|
||||
@@ -2769,4 +2781,380 @@ redisClient.scanUserMessageQueueLocks = async function () {
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================
|
||||
// 🚦 API Key 并发请求排队方法
|
||||
// ============================================
|
||||
|
||||
/**
 * Increment the pending-queue counter for an API key (atomic via Lua script).
 *
 * INCR and EXPIRE run in a single Lua script so a process crash between the
 * two commands cannot leave a counter without a TTL (counter leak).
 *
 * @param {string} apiKeyId - API Key ID
 * @param {number} [timeoutMs=60000] - queue wait timeout in ms, used to derive the key TTL
 * @returns {Promise<number>} the queue count after incrementing
 * @throws re-throws Redis errors so callers can fail the queued request
 */
redisClient.incrConcurrencyQueue = async function (apiKeyId, timeoutMs = 60000) {
  const key = `concurrency:queue:${apiKeyId}`
  try {
    // TTL = wait timeout + buffer, so the key cannot expire while a request is still waiting
    const ttlSeconds = Math.ceil(timeoutMs / 1000) + QUEUE_TTL_BUFFER_SECONDS
    const script = `
      local count = redis.call('INCR', KEYS[1])
      redis.call('EXPIRE', KEYS[1], ARGV[1])
      return count
    `
    const count = await this.client.eval(script, 1, key, String(ttlSeconds))
    logger.database(
      `🚦 Incremented queue count for key ${apiKeyId}: ${count} (TTL: ${ttlSeconds}s)`
    )
    // Always pass a radix; the Lua integer may round-trip as a string depending on the client
    return parseInt(count, 10)
  } catch (error) {
    logger.error(`Failed to increment concurrency queue for ${apiKeyId}:`, error)
    throw error
  }
}
|
||||
|
||||
/**
 * Decrement the pending-queue counter for an API key (atomic via Lua script).
 *
 * DECR and DEL run in a single Lua script so a crash cannot strand a stale
 * counter; when the count reaches (or underflows below) zero the key is
 * removed and 0 is returned, keeping the counter non-negative.
 *
 * @param {string} apiKeyId - API Key ID
 * @returns {Promise<number>} the queue count after decrementing (never negative)
 * @throws re-throws Redis errors to the caller
 */
redisClient.decrConcurrencyQueue = async function (apiKeyId) {
  const key = `concurrency:queue:${apiKeyId}`
  try {
    const script = `
      local count = redis.call('DECR', KEYS[1])
      if count <= 0 then
        redis.call('DEL', KEYS[1])
        return 0
      end
      return count
    `
    const count = await this.client.eval(script, 1, key)
    // Always pass a radix; the Lua integer may round-trip as a string depending on the client
    const result = parseInt(count, 10)
    if (result === 0) {
      logger.database(`🚦 Queue count for key ${apiKeyId} is 0, removed key`)
    } else {
      logger.database(`🚦 Decremented queue count for key ${apiKeyId}: ${result}`)
    }
    return result
  } catch (error) {
    logger.error(`Failed to decrement concurrency queue for ${apiKeyId}:`, error)
    throw error
  }
}
|
||||
|
||||
/**
 * Read the current pending-queue counter for an API key.
 *
 * @param {string} apiKeyId - API Key ID
 * @returns {Promise<number>} current queue count; 0 when the key is missing
 *   or on Redis failure (best-effort read, never throws)
 */
redisClient.getConcurrencyQueueCount = async function (apiKeyId) {
  const key = `concurrency:queue:${apiKeyId}`
  try {
    const count = await this.client.get(key)
    // GET returns null for a missing key; parse with an explicit radix otherwise
    return count === null ? 0 : parseInt(count, 10)
  } catch (error) {
    logger.error(`Failed to get concurrency queue count for ${apiKeyId}:`, error)
    return 0
  }
}
|
||||
|
||||
/**
 * Reset (delete) the pending-queue counter for a single API key.
 *
 * @param {string} apiKeyId - API Key ID
 * @returns {Promise<boolean>} true when the delete was issued successfully,
 *   false on Redis failure (never throws)
 */
redisClient.clearConcurrencyQueue = async function (apiKeyId) {
  try {
    await this.client.del(`concurrency:queue:${apiKeyId}`)
    logger.database(`🚦 Cleared queue count for key ${apiKeyId}`)
    return true
  } catch (error) {
    logger.error(`Failed to clear concurrency queue for ${apiKeyId}:`, error)
    return false
  }
}
|
||||
|
||||
/**
 * Scan Redis for all pending-queue counter keys and return their API Key IDs.
 *
 * Stats ('concurrency:queue:stats:*') and wait-time
 * ('concurrency:queue:wait_times:*') keys share the prefix and are skipped.
 * SCAN iterations are capped to avoid an unbounded loop against a huge keyspace.
 *
 * @returns {Promise<string[]>} API Key ID list (empty on Redis failure)
 */
redisClient.scanConcurrencyQueueKeys = async function () {
  const PREFIX = 'concurrency:queue:'
  const MAX_ITERATIONS = 1000
  const apiKeyIds = []
  let cursor = '0'

  try {
    for (let iterations = 1; ; iterations++) {
      const [nextCursor, keys] = await this.client.scan(
        cursor,
        'MATCH',
        `${PREFIX}*`,
        'COUNT',
        100
      )
      cursor = nextCursor

      for (const key of keys) {
        // Skip stats / wait-time keys that live under the same prefix
        const isStatsKey = key.startsWith(`${PREFIX}stats:`)
        const isWaitTimeKey = key.startsWith(`${PREFIX}wait_times:`)
        if (!isStatsKey && !isWaitTimeKey) {
          apiKeyIds.push(key.replace(PREFIX, ''))
        }
      }

      if (iterations >= MAX_ITERATIONS) {
        logger.warn(
          `🚦 Concurrency queue: SCAN reached max iterations (${MAX_ITERATIONS}), stopping early`,
          { foundQueues: apiKeyIds.length }
        )
        break
      }
      if (cursor === '0') {
        break
      }
    }

    return apiKeyIds
  } catch (error) {
    logger.error('Failed to scan concurrency queue keys:', error)
    return []
  }
}
|
||||
|
||||
/**
 * Delete all pending-queue counters (intended for service startup).
 *
 * Only the plain counter keys are removed; statistics
 * ('concurrency:queue:stats:*') and wait-time samples
 * ('concurrency:queue:wait_times:*') are preserved.
 *
 * @returns {Promise<number>} number of counters removed (0 on Redis failure)
 */
redisClient.clearAllConcurrencyQueues = async function () {
  let cleared = 0
  let cursor = '0'
  let iterations = 0
  const MAX_ITERATIONS = 1000

  try {
    do {
      const [newCursor, keys] = await this.client.scan(
        cursor,
        'MATCH',
        'concurrency:queue:*',
        'COUNT',
        100
      )
      cursor = newCursor
      iterations++

      // Delete only the queue counters, keep statistics and wait-time samples
      const queueKeys = keys.filter(
        (key) =>
          !key.startsWith('concurrency:queue:stats:') &&
          !key.startsWith('concurrency:queue:wait_times:')
      )

      if (queueKeys.length > 0) {
        await this.client.del(...queueKeys)
        cleared += queueKeys.length
      }

      if (iterations >= MAX_ITERATIONS) {
        // Consistent with scanConcurrencyQueueKeys: surface the early stop
        // instead of breaking silently, so an incomplete cleanup is visible
        logger.warn(
          `🚦 Concurrency queue: SCAN reached max iterations (${MAX_ITERATIONS}), stopping early`,
          { clearedQueues: cleared }
        )
        break
      }
    } while (cursor !== '0')

    if (cleared > 0) {
      logger.info(`🚦 Cleared ${cleared} concurrency queue counter(s) on startup`)
    }
    return cleared
  } catch (error) {
    logger.error('Failed to clear all concurrency queues:', error)
    return 0
  }
}
|
||||
|
||||
/**
 * Increment one queue statistics counter for an API key (atomic via Lua script).
 *
 * HINCRBY and EXPIRE run in a single Lua script so a crash between them cannot
 * leave the stats hash without a TTL (memory leak).
 *
 * @param {string} apiKeyId - API Key ID
 * @param {string} field - stats field (entered/success/timeout/cancelled)
 * @returns {Promise<number>} the counter value after incrementing; 0 on Redis
 *   failure (best-effort, never throws)
 */
redisClient.incrConcurrencyQueueStats = async function (apiKeyId, field) {
  const key = `concurrency:queue:stats:${apiKeyId}`
  try {
    const script = `
      local count = redis.call('HINCRBY', KEYS[1], ARGV[1], 1)
      redis.call('EXPIRE', KEYS[1], ARGV[2])
      return count
    `
    const count = await this.client.eval(script, 1, key, field, String(QUEUE_STATS_TTL_SECONDS))
    // Always pass a radix; the Lua integer may round-trip as a string depending on the client
    return parseInt(count, 10)
  } catch (error) {
    logger.error(`Failed to increment queue stats ${field} for ${apiKeyId}:`, error)
    return 0
  }
}
|
||||
|
||||
/**
 * Read the queue statistics counters for an API key.
 *
 * @param {string} apiKeyId - API Key ID
 * @returns {Promise<Object>} counters keyed by entered/success/timeout/
 *   cancelled/socket_changed/rejected_overload; all zeros when the hash is
 *   missing or on Redis failure (never throws)
 */
redisClient.getConcurrencyQueueStats = async function (apiKeyId) {
  // Single source of truth for the tracked fields — the same list drives both
  // the parsed result and the all-zero fallback (avoids two drifting literals)
  const FIELDS = ['entered', 'success', 'timeout', 'cancelled', 'socket_changed', 'rejected_overload']
  const key = `concurrency:queue:stats:${apiKeyId}`
  const result = Object.fromEntries(FIELDS.map((field) => [field, 0]))
  try {
    const stats = await this.client.hgetall(key)
    for (const field of FIELDS) {
      // HGETALL values are strings; parse with an explicit radix
      result[field] = parseInt(stats?.[field] || 0, 10)
    }
  } catch (error) {
    logger.error(`Failed to get queue stats for ${apiKeyId}:`, error)
  }
  return result
}
|
||||
|
||||
/**
 * Record one queue wait-time sample for a single API key.
 *
 * LPUSH + LTRIM + EXPIRE run in one Lua script: the list stays capped at
 * WAIT_TIME_SAMPLES_PER_KEY entries and always carries a TTL, so samples
 * cannot accumulate without bound.
 *
 * @param {string} apiKeyId - API Key ID
 * @param {number} waitTimeMs - wait time in milliseconds
 * @returns {Promise<void>} errors are logged, never thrown (best-effort metric)
 */
redisClient.recordQueueWaitTime = async function (apiKeyId, waitTimeMs) {
  const sampleKey = `concurrency:queue:wait_times:${apiKeyId}`
  const lastIndex = WAIT_TIME_SAMPLES_PER_KEY - 1
  const script = `
    redis.call('LPUSH', KEYS[1], ARGV[1])
    redis.call('LTRIM', KEYS[1], 0, ARGV[2])
    redis.call('EXPIRE', KEYS[1], ARGV[3])
    return 1
  `
  try {
    await this.client.eval(script, 1, sampleKey, waitTimeMs, lastIndex, WAIT_TIME_TTL_SECONDS)
  } catch (error) {
    logger.error(`Failed to record queue wait time for ${apiKeyId}:`, error)
  }
}
|
||||
|
||||
/**
 * Record one queue wait-time sample into the global rolling window.
 *
 * Same atomic LPUSH + LTRIM + EXPIRE pattern as the per-key recorder, but
 * capped at WAIT_TIME_SAMPLES_GLOBAL entries under a single shared key.
 *
 * @param {number} waitTimeMs - wait time in milliseconds
 * @returns {Promise<void>} errors are logged, never thrown (best-effort metric)
 */
redisClient.recordGlobalQueueWaitTime = async function (waitTimeMs) {
  const sampleKey = 'concurrency:queue:wait_times:global'
  const lastIndex = WAIT_TIME_SAMPLES_GLOBAL - 1
  const script = `
    redis.call('LPUSH', KEYS[1], ARGV[1])
    redis.call('LTRIM', KEYS[1], 0, ARGV[2])
    redis.call('EXPIRE', KEYS[1], ARGV[3])
    return 1
  `
  try {
    await this.client.eval(script, 1, sampleKey, waitTimeMs, lastIndex, WAIT_TIME_TTL_SECONDS)
  } catch (error) {
    logger.error('Failed to record global queue wait time:', error)
  }
}
|
||||
|
||||
/**
 * Fetch every wait-time sample in the global rolling window.
 *
 * @returns {Promise<number[]>} wait times in milliseconds (empty on Redis failure)
 */
redisClient.getGlobalQueueWaitTimes = async function () {
  try {
    const raw = await this.client.lrange('concurrency:queue:wait_times:global', 0, -1)
    return raw.map((sample) => Number(sample))
  } catch (error) {
    logger.error('Failed to get global queue wait times:', error)
    return []
  }
}
|
||||
|
||||
/**
 * Fetch every wait-time sample recorded for one API key.
 *
 * @param {string} apiKeyId - API Key ID
 * @returns {Promise<number[]>} wait times in milliseconds (empty on Redis failure)
 */
redisClient.getQueueWaitTimes = async function (apiKeyId) {
  try {
    const raw = await this.client.lrange(`concurrency:queue:wait_times:${apiKeyId}`, 0, -1)
    return raw.map((sample) => Number(sample))
  } catch (error) {
    logger.error(`Failed to get queue wait times for ${apiKeyId}:`, error)
    return []
  }
}
|
||||
|
||||
/**
 * Scan Redis for all queue-statistics hashes and return their API Key IDs.
 *
 * @returns {Promise<string[]>} API Key ID list (empty on Redis failure)
 */
redisClient.scanConcurrencyQueueStatsKeys = async function () {
  const apiKeyIds = []
  let cursor = '0'
  let iterations = 0
  const MAX_ITERATIONS = 1000

  try {
    do {
      const [newCursor, keys] = await this.client.scan(
        cursor,
        'MATCH',
        'concurrency:queue:stats:*',
        'COUNT',
        100
      )
      cursor = newCursor
      iterations++

      for (const key of keys) {
        const apiKeyId = key.replace('concurrency:queue:stats:', '')
        apiKeyIds.push(apiKeyId)
      }

      if (iterations >= MAX_ITERATIONS) {
        // Consistent with scanConcurrencyQueueKeys: log the early stop so a
        // truncated scan is visible instead of silently returning partial data
        logger.warn(
          `🚦 Concurrency queue: stats SCAN reached max iterations (${MAX_ITERATIONS}), stopping early`,
          { foundStatsKeys: apiKeyIds.length }
        )
        break
      }
    } while (cursor !== '0')

    return apiKeyIds
  } catch (error) {
    logger.error('Failed to scan concurrency queue stats keys:', error)
    return []
  }
}
|
||||
|
||||
module.exports = redisClient
|
||||
|
||||
Reference in New Issue
Block a user