feat: enhance concurrency queue with health check and admin endpoints

- Add queue health check for fast-fail when overloaded (P90 > threshold)
- Implement socket identity verification with UUID token
- Add wait time statistics (P50/P90/P99) and queue stats tracking
- Add admin endpoints for queue stats and cleanup
- Add CLEAR_CONCURRENCY_QUEUES_ON_STARTUP config option
- Update documentation with troubleshooting and proxy config guide
Author: DaydreamCoding
Date: 2025-12-12 14:08:30 +08:00
Committed by: QTom
Parent: 403f609f69
Commit: 07633ddbf8
18 changed files with 3039 additions and 86 deletions


@@ -50,6 +50,18 @@ function getWeekStringInTimezone(date = new Date()) {
return `${year}-W${String(weekNumber).padStart(2, '0')}`
}
// Concurrency queue constants
const QUEUE_STATS_TTL_SECONDS = 86400 * 7 // Stats counters retained for 7 days
const WAIT_TIME_TTL_SECONDS = 86400 // Wait-time samples retained for 1 day (rolling window, no long-term retention needed)
// Wait-time sample-size configuration (improves statistical confidence)
// - Per API Key raised from 100 to 500 for a more stable P99 estimate
// - Global raised from 500 to 2000 to support higher-precision P99.9 analysis
// - Memory overhead is roughly 12-20 KB (Redis quicklist, 1-10 bytes per element), which is acceptable
// See design.md, Decision 5: wait-time statistics sample sizes
const WAIT_TIME_SAMPLES_PER_KEY = 500 // Wait-time samples kept per API Key
const WAIT_TIME_SAMPLES_GLOBAL = 2000 // Wait-time samples kept globally
const QUEUE_TTL_BUFFER_SECONDS = 30 // TTL buffer for the queue counters
class RedisClient {
constructor() {
this.client = null
@@ -2769,4 +2781,380 @@ redisClient.scanUserMessageQueueLocks = async function () {
}
}
// ============================================
// 🚦 API Key concurrent-request queue methods
// ============================================
/**
* Increment the queue counter (a Lua script guarantees atomicity)
* @param {string} apiKeyId - API Key ID
* @param {number} [timeoutMs=60000] - Queue timeout in milliseconds, used to compute the TTL
* @returns {Promise<number>} Queue count after the increment
*/
redisClient.incrConcurrencyQueue = async function (apiKeyId, timeoutMs = 60000) {
const key = `concurrency:queue:${apiKeyId}`
try {
// Run INCR and EXPIRE atomically in a Lua script so a process crash cannot leak the counter
// TTL = timeout + buffer (ensures the key does not expire while the request is still waiting)
const ttlSeconds = Math.ceil(timeoutMs / 1000) + QUEUE_TTL_BUFFER_SECONDS
const script = `
local count = redis.call('INCR', KEYS[1])
redis.call('EXPIRE', KEYS[1], ARGV[1])
return count
`
const count = await this.client.eval(script, 1, key, String(ttlSeconds))
logger.database(
`🚦 Incremented queue count for key ${apiKeyId}: ${count} (TTL: ${ttlSeconds}s)`
)
return parseInt(count)
} catch (error) {
logger.error(`Failed to increment concurrency queue for ${apiKeyId}:`, error)
throw error
}
}
/**
* Decrement the queue counter (a Lua script guarantees atomicity)
* @param {string} apiKeyId - API Key ID
* @returns {Promise<number>} Queue count after the decrement
*/
redisClient.decrConcurrencyQueue = async function (apiKeyId) {
const key = `concurrency:queue:${apiKeyId}`
try {
// Run DECR and DEL atomically in a Lua script so a process crash cannot leave a stale counter
const script = `
local count = redis.call('DECR', KEYS[1])
if count <= 0 then
redis.call('DEL', KEYS[1])
return 0
end
return count
`
const count = await this.client.eval(script, 1, key)
const result = parseInt(count)
if (result === 0) {
logger.database(`🚦 Queue count for key ${apiKeyId} is 0, removed key`)
} else {
logger.database(`🚦 Decremented queue count for key ${apiKeyId}: ${result}`)
}
return result
} catch (error) {
logger.error(`Failed to decrement concurrency queue for ${apiKeyId}:`, error)
throw error
}
}
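// A caller-side sketch (not part of this commit) of how the counter pair is
// meant to be used: increment before waiting for a concurrency slot, and
// always decrement in `finally` so the counter stays accurate when a request
// throws. `waitForSlot` and `handleRequest` are hypothetical placeholders.
async function exampleQueuedRequest(apiKeyId, waitForSlot, handleRequest) {
  const timeoutMs = 60000
  await redisClient.incrConcurrencyQueue(apiKeyId, timeoutMs)
  try {
    await waitForSlot(apiKeyId, timeoutMs) // resolves once a slot frees up
    return await handleRequest()
  } finally {
    // Runs on success, timeout, and error alike
    await redisClient.decrConcurrencyQueue(apiKeyId)
  }
}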
/**
* Get the queue count
* @param {string} apiKeyId - API Key ID
* @returns {Promise<number>} Current queue count
*/
redisClient.getConcurrencyQueueCount = async function (apiKeyId) {
const key = `concurrency:queue:${apiKeyId}`
try {
const count = await this.client.get(key)
return parseInt(count || 0)
} catch (error) {
logger.error(`Failed to get concurrency queue count for ${apiKeyId}:`, error)
return 0
}
}
/**
* Clear the queue counter
* @param {string} apiKeyId - API Key ID
* @returns {Promise<boolean>} Whether the clear succeeded
*/
redisClient.clearConcurrencyQueue = async function (apiKeyId) {
const key = `concurrency:queue:${apiKeyId}`
try {
await this.client.del(key)
logger.database(`🚦 Cleared queue count for key ${apiKeyId}`)
return true
} catch (error) {
logger.error(`Failed to clear concurrency queue for ${apiKeyId}:`, error)
return false
}
}
/**
* Scan all queue counters
* @returns {Promise<string[]>} List of API Key IDs
*/
redisClient.scanConcurrencyQueueKeys = async function () {
const apiKeyIds = []
let cursor = '0'
let iterations = 0
const MAX_ITERATIONS = 1000
try {
do {
const [newCursor, keys] = await this.client.scan(
cursor,
'MATCH',
'concurrency:queue:*',
'COUNT',
100
)
cursor = newCursor
iterations++
for (const key of keys) {
// Skip stats and wait-time keys, which share the same prefix
if (
key.startsWith('concurrency:queue:stats:') ||
key.startsWith('concurrency:queue:wait_times:')
) {
continue
}
const apiKeyId = key.replace('concurrency:queue:', '')
apiKeyIds.push(apiKeyId)
}
if (iterations >= MAX_ITERATIONS) {
logger.warn(
`🚦 Concurrency queue: SCAN reached max iterations (${MAX_ITERATIONS}), stopping early`,
{ foundQueues: apiKeyIds.length }
)
break
}
} while (cursor !== '0')
return apiKeyIds
} catch (error) {
logger.error('Failed to scan concurrency queue keys:', error)
return []
}
}
/**
* Clear all queue counters (used on service restart)
* @returns {Promise<number>} Number of counters cleared
*/
redisClient.clearAllConcurrencyQueues = async function () {
let cleared = 0
let cursor = '0'
let iterations = 0
const MAX_ITERATIONS = 1000
try {
do {
const [newCursor, keys] = await this.client.scan(
cursor,
'MATCH',
'concurrency:queue:*',
'COUNT',
100
)
cursor = newCursor
iterations++
// Delete only the queue counters; keep the statistics data
const queueKeys = keys.filter(
(key) =>
!key.startsWith('concurrency:queue:stats:') &&
!key.startsWith('concurrency:queue:wait_times:')
)
if (queueKeys.length > 0) {
await this.client.del(...queueKeys)
cleared += queueKeys.length
}
if (iterations >= MAX_ITERATIONS) {
break
}
} while (cursor !== '0')
if (cleared > 0) {
logger.info(`🚦 Cleared ${cleared} concurrency queue counter(s) on startup`)
}
return cleared
} catch (error) {
logger.error('Failed to clear all concurrency queues:', error)
return 0
}
}
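// A sketch of how the CLEAR_CONCURRENCY_QUEUES_ON_STARTUP option added in
// this commit might gate the cleanup at boot; the `config` shape here is an
// assumption, only the option name comes from the commit message.
async function exampleStartupCleanup(config) {
  if (config.clearConcurrencyQueuesOnStartup) {
    // Counters stranded by a crash would otherwise throttle new requests
    // until their TTL (timeout + QUEUE_TTL_BUFFER_SECONDS) expires
    await redisClient.clearAllConcurrencyQueues()
  }
}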
/**
* Increment a queue stats counter (a Lua script guarantees atomicity)
* @param {string} apiKeyId - API Key ID
* @param {string} field - Stats field (entered/success/timeout/cancelled/socket_changed/rejected_overload)
* @returns {Promise<number>} Count after the increment
*/
redisClient.incrConcurrencyQueueStats = async function (apiKeyId, field) {
const key = `concurrency:queue:stats:${apiKeyId}`
try {
// Run HINCRBY and EXPIRE atomically in a Lua script so a crash between the
// two cannot leave the stats key without a TTL (a memory leak)
const script = `
local count = redis.call('HINCRBY', KEYS[1], ARGV[1], 1)
redis.call('EXPIRE', KEYS[1], ARGV[2])
return count
`
const count = await this.client.eval(script, 1, key, field, String(QUEUE_STATS_TTL_SECONDS))
return parseInt(count)
} catch (error) {
logger.error(`Failed to increment queue stats ${field} for ${apiKeyId}:`, error)
return 0
}
}
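// How the stats fields map to a request's lifecycle, based on the field names
// read back by getConcurrencyQueueStats below. This is an assumed call-site
// sketch, not code from this diff; `waitForSlot` and the TimeoutError check
// are hypothetical placeholders.
async function exampleTrackQueueOutcome(apiKeyId, waitForSlot) {
  await redisClient.incrConcurrencyQueueStats(apiKeyId, 'entered')
  try {
    await waitForSlot(apiKeyId)
    await redisClient.incrConcurrencyQueueStats(apiKeyId, 'success')
  } catch (err) {
    // Exactly one terminal outcome per request; 'socket_changed' and
    // 'rejected_overload' are the other outcomes this commit tracks
    await redisClient.incrConcurrencyQueueStats(
      apiKeyId,
      err.name === 'TimeoutError' ? 'timeout' : 'cancelled'
    )
    throw err
  }
}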
/**
* Get queue statistics
* @param {string} apiKeyId - API Key ID
* @returns {Promise<Object>} Stats object
*/
redisClient.getConcurrencyQueueStats = async function (apiKeyId) {
const key = `concurrency:queue:stats:${apiKeyId}`
try {
const stats = await this.client.hgetall(key)
return {
entered: parseInt(stats?.entered || 0),
success: parseInt(stats?.success || 0),
timeout: parseInt(stats?.timeout || 0),
cancelled: parseInt(stats?.cancelled || 0),
socket_changed: parseInt(stats?.socket_changed || 0),
rejected_overload: parseInt(stats?.rejected_overload || 0)
}
} catch (error) {
logger.error(`Failed to get queue stats for ${apiKeyId}:`, error)
return {
entered: 0,
success: 0,
timeout: 0,
cancelled: 0,
socket_changed: 0,
rejected_overload: 0
}
}
}
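// A helper sketch (assumed, not in this diff) showing how an admin endpoint
// could turn the raw counters into display-friendly rates.
async function exampleQueueStatsSummary(apiKeyId) {
  const stats = await redisClient.getConcurrencyQueueStats(apiKeyId)
  const finished =
    stats.success + stats.timeout + stats.cancelled + stats.socket_changed + stats.rejected_overload
  return {
    ...stats,
    successRate: finished > 0 ? stats.success / finished : null,
    timeoutRate: finished > 0 ? stats.timeout / finished : null
  }
}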
/**
* Record a queue wait time (stored separately per API Key)
* @param {string} apiKeyId - API Key ID
* @param {number} waitTimeMs - Wait time in milliseconds
* @returns {Promise<void>}
*/
redisClient.recordQueueWaitTime = async function (apiKeyId, waitTimeMs) {
const key = `concurrency:queue:wait_times:${apiKeyId}`
try {
// A Lua script keeps LPUSH + LTRIM + EXPIRE atomic; the TTL prevents memory leaks
const script = `
redis.call('LPUSH', KEYS[1], ARGV[1])
redis.call('LTRIM', KEYS[1], 0, ARGV[2])
redis.call('EXPIRE', KEYS[1], ARGV[3])
return 1
`
await this.client.eval(
script,
1,
key,
waitTimeMs,
WAIT_TIME_SAMPLES_PER_KEY - 1,
WAIT_TIME_TTL_SECONDS
)
} catch (error) {
logger.error(`Failed to record queue wait time for ${apiKeyId}:`, error)
}
}
/**
* Record a global queue wait time
* @param {number} waitTimeMs - Wait time in milliseconds
* @returns {Promise<void>}
*/
redisClient.recordGlobalQueueWaitTime = async function (waitTimeMs) {
const key = 'concurrency:queue:wait_times:global'
try {
// A Lua script keeps LPUSH + LTRIM + EXPIRE atomic; the TTL prevents memory leaks
const script = `
redis.call('LPUSH', KEYS[1], ARGV[1])
redis.call('LTRIM', KEYS[1], 0, ARGV[2])
redis.call('EXPIRE', KEYS[1], ARGV[3])
return 1
`
await this.client.eval(
script,
1,
key,
waitTimeMs,
WAIT_TIME_SAMPLES_GLOBAL - 1,
WAIT_TIME_TTL_SECONDS
)
} catch (error) {
logger.error('Failed to record global queue wait time:', error)
}
}
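// How the two recorders are presumably paired at dequeue time (the call site
// is outside this diff): each wait sample feeds both the per-key window and
// the global window that the overload check reads.
async function exampleRecordWait(apiKeyId, enqueuedAtMs) {
  const waitTimeMs = Date.now() - enqueuedAtMs
  // Both writes are fire-and-forget safe: the recorders swallow Redis errors
  await Promise.all([
    redisClient.recordQueueWaitTime(apiKeyId, waitTimeMs),
    redisClient.recordGlobalQueueWaitTime(waitTimeMs)
  ])
  return waitTimeMs
}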
/**
* Get the global wait-time samples
* @returns {Promise<number[]>} List of wait times
*/
redisClient.getGlobalQueueWaitTimes = async function () {
const key = 'concurrency:queue:wait_times:global'
try {
const samples = await this.client.lrange(key, 0, -1)
return samples.map(Number)
} catch (error) {
logger.error('Failed to get global queue wait times:', error)
return []
}
}
/**
* Get the wait-time samples for a given API Key
* @param {string} apiKeyId - API Key ID
* @returns {Promise<number[]>} List of wait times
*/
redisClient.getQueueWaitTimes = async function (apiKeyId) {
const key = `concurrency:queue:wait_times:${apiKeyId}`
try {
const samples = await this.client.lrange(key, 0, -1)
return samples.map(Number)
} catch (error) {
logger.error(`Failed to get queue wait times for ${apiKeyId}:`, error)
return []
}
}
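// The commit message advertises a health check that fast-fails when the P90
// wait time exceeds a threshold. A minimal sketch, assuming nearest-rank
// percentiles over the sample windows above; the 30s default threshold and
// both function names are assumptions, not values from this diff.
function examplePercentile(samples, p) {
  if (samples.length === 0) {
    return 0
  }
  const sorted = [...samples].sort((a, b) => a - b)
  // Nearest-rank: the smallest value with at least p% of samples at or below it
  const rank = Math.ceil((p / 100) * sorted.length)
  return sorted[Math.min(rank, sorted.length) - 1]
}
async function exampleQueueHealthCheck(p90ThresholdMs = 30000) {
  const samples = await redisClient.getGlobalQueueWaitTimes()
  const p50 = examplePercentile(samples, 50)
  const p90 = examplePercentile(samples, 90)
  const p99 = examplePercentile(samples, 99)
  // Fast-fail new requests instead of queueing them when waits are already long
  return { healthy: p90 <= p90ThresholdMs, p50, p90, p99 }
}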
/**
* Scan all queue stats keys
* @returns {Promise<string[]>} List of API Key IDs
*/
redisClient.scanConcurrencyQueueStatsKeys = async function () {
const apiKeyIds = []
let cursor = '0'
let iterations = 0
const MAX_ITERATIONS = 1000
try {
do {
const [newCursor, keys] = await this.client.scan(
cursor,
'MATCH',
'concurrency:queue:stats:*',
'COUNT',
100
)
cursor = newCursor
iterations++
for (const key of keys) {
const apiKeyId = key.replace('concurrency:queue:stats:', '')
apiKeyIds.push(apiKeyId)
}
if (iterations >= MAX_ITERATIONS) {
break
}
} while (cursor !== '0')
return apiKeyIds
} catch (error) {
logger.error('Failed to scan concurrency queue stats keys:', error)
return []
}
}
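// The commit also adds admin endpoints for queue stats; a rough sketch of the
// aggregation such an endpoint could perform with the helpers in this file.
// The function name and response shape are assumptions.
async function exampleCollectAllQueueStats() {
  const apiKeyIds = await redisClient.scanConcurrencyQueueStatsKeys()
  const results = []
  for (const apiKeyId of apiKeyIds) {
    const [stats, waitTimes, queued] = await Promise.all([
      redisClient.getConcurrencyQueueStats(apiKeyId),
      redisClient.getQueueWaitTimes(apiKeyId),
      redisClient.getConcurrencyQueueCount(apiKeyId)
    ])
    results.push({ apiKeyId, queued, stats, waitSampleCount: waitTimes.length })
  }
  return results
}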
module.exports = redisClient