fix(gateway): retry exec-read live tool probe

2026-04-19 09:58:38 +00:00 · 2026-03-03 03:36:37 +00:00
parent 70ab91500a
commit 92c4a2a29e
3 changed files with 155 additions and 44 deletions
--- a/src/gateway/gateway-models.profiles.live.test.ts
+++ b/src/gateway/gateway-models.profiles.live.test.ts
@@ -28,7 +28,12 @@ import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
 import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
 import { GatewayClient } from "./client.js";
 import { renderCatNoncePngBase64 } from "./live-image-probe.js";
-import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
+import {
  hasExpectedSingleNonce,
  hasExpectedToolNonce,
  shouldRetryExecReadProbe,
  shouldRetryToolReadProbe,
 } from "./live-tool-probe-utils.js";
 import { startGatewayServer } from "./server.js";
 import { extractPayloadText } from "./test-helpers.agent-results.js";
@@ -862,41 +867,77 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
            logProgress(`${progressLabel}: tool-exec`);
            const nonceC = randomUUID();
            const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
-
+            const maxExecReadAttempts = 3;
-            const execReadProbe = await client.request<AgentFinalPayload>(
+            let execReadText = "";
-              "agent",
+            for (
-              {
+              let execReadAttempt = 0;
-                sessionKey,
+              execReadAttempt < maxExecReadAttempts;
-                idempotencyKey: `idem-${runIdTool}-exec-read`,
+              execReadAttempt += 1
                message:
                  "OpenClaw live tool probe (local, safe): " +
                  "use the tool named `exec` (or `Exec`) to run this command: " +
                  `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
                  `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
                  "Finally reply including the nonce text you read back.",
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (execReadProbe?.status !== "ok") {
              throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
            }
            const execReadText = extractPayloadText(execReadProbe?.result);
            if (
              isEmptyStreamText(execReadText) &&
              (model.provider === "minimax" || model.provider === "openai-codex")
            ) {
-              logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
+              const strictReply = execReadAttempt > 0;
-              break;
+              const execReadProbe = await client.request<AgentFinalPayload>(
                "agent",
                {
                  sessionKey,
                  idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
                  message: strictReply
                    ? "OpenClaw live tool probe (local, safe): " +
                      "use the tool named `exec` (or `Exec`) to run this command: " +
                      `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
                      `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
                      `Then reply with exactly: ${nonceC}. No extra text.`
                    : "OpenClaw live tool probe (local, safe): " +
                      "use the tool named `exec` (or `Exec`) to run this command: " +
                      `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
                      `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
                      "Finally reply including the nonce text you read back.",
                  thinking: params.thinkingLevel,
                  deliver: false,
                },
                { expectFinal: true },
              );
              if (execReadProbe?.status !== "ok") {
                if (execReadAttempt + 1 < maxExecReadAttempts) {
                  logProgress(
                    `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) status=${String(execReadProbe?.status)}`,
                  );
                  continue;
                }
                throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
              }
              execReadText = extractPayloadText(execReadProbe?.result);
              if (
                isEmptyStreamText(execReadText) &&
                (model.provider === "minimax" || model.provider === "openai-codex")
              ) {
                logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
                break;
              }
              assertNoReasoningTags({
                text: execReadText,
                model: modelKey,
                phase: "tool-exec",
                label: params.label,
              });
              if (hasExpectedSingleNonce(execReadText, nonceC)) {
                break;
              }
              if (
                shouldRetryExecReadProbe({
                  text: execReadText,
                  nonce: nonceC,
                  attempt: execReadAttempt,
                  maxAttempts: maxExecReadAttempts,
                })
              ) {
                logProgress(
                  `${progressLabel}: tool-exec retry (${execReadAttempt + 2}/${maxExecReadAttempts}) malformed tool output`,
                );
                continue;
              }
              throw new Error(`exec+read probe missing nonce: ${execReadText}`);
            }
-            assertNoReasoningTags({
+            if (!hasExpectedSingleNonce(execReadText, nonceC)) {
              text: execReadText,
              model: modelKey,
              phase: "tool-exec",
              label: params.label,
            });
            if (!execReadText.includes(nonceC)) {
              throw new Error(`exec+read probe missing nonce: ${execReadText}`);
            }
--- a/src/gateway/live-tool-probe-utils.test.ts
+++ b/src/gateway/live-tool-probe-utils.test.ts
@@ -1,5 +1,10 @@
 import { describe, expect, it } from "vitest";
-import { hasExpectedToolNonce, shouldRetryToolReadProbe } from "./live-tool-probe-utils.js";
+import {
  hasExpectedSingleNonce,
  hasExpectedToolNonce,
  shouldRetryExecReadProbe,
  shouldRetryToolReadProbe,
 } from "./live-tool-probe-utils.js";
 describe("live tool probe utils", () => {
  it("matches nonce pair when both are present", () => {
@@ -7,6 +12,11 @@ describe("live tool probe utils", () => {
    expect(hasExpectedToolNonce("value a-1 only", "a-1", "b-2")).toBe(false);
  });
  it("matches single nonce when present", () => {
    expect(hasExpectedSingleNonce("value nonce-1", "nonce-1")).toBe(true);
    expect(hasExpectedSingleNonce("value nonce-2", "nonce-1")).toBe(false);
  });
  it("retries malformed tool output when attempts remain", () => {
    expect(
      shouldRetryToolReadProbe({
@@ -97,4 +107,37 @@ describe("live tool probe utils", () => {
      }),
    ).toBe(false);
  });
  it("retries malformed exec+read output when attempts remain", () => {
    expect(
      shouldRetryExecReadProbe({
        text: "read[object Object]",
        nonce: "nonce-c",
        attempt: 0,
        maxAttempts: 3,
      }),
    ).toBe(true);
  });
  it("does not retry exec+read once max attempts are exhausted", () => {
    expect(
      shouldRetryExecReadProbe({
        text: "read[object Object]",
        nonce: "nonce-c",
        attempt: 2,
        maxAttempts: 3,
      }),
    ).toBe(false);
  });
  it("does not retry exec+read when nonce is present", () => {
    expect(
      shouldRetryExecReadProbe({
        text: "nonce-c",
        nonce: "nonce-c",
        attempt: 0,
        maxAttempts: 3,
      }),
    ).toBe(false);
  });
 });
--- a/src/gateway/live-tool-probe-utils.ts
+++ b/src/gateway/live-tool-probe-utils.ts
@@ -2,6 +2,25 @@ export function hasExpectedToolNonce(text: string, nonceA: string, nonceB: strin
  return text.includes(nonceA) && text.includes(nonceB);
 }
 /**
  * Reports whether `nonce` occurs anywhere inside `text`.
  *
  * The match is a plain, case-sensitive substring search.
  *
  * @param text - Text to search.
  * @param nonce - Value to look for.
  * @returns `true` when `text` contains `nonce`, otherwise `false`.
  */
 export function hasExpectedSingleNonce(text: string, nonce: string): boolean {
  const position = text.indexOf(nonce);
  return position !== -1;
 }
 /**
  * Heuristic check for reply text that looks like broken tool output.
  *
  * After trimming surrounding whitespace, the text is flagged when it is
  * empty, contains the literal `[object Object]` (case-sensitive), or, once
  * lower-cased, contains `read` followed by optional whitespace and `[`, or
  * the standalone words `tool` or `function`.
  *
  * @param text - Reply text to inspect.
  * @returns `true` when any of the markers above is found.
  */
 function hasMalformedToolOutput(text: string): boolean {
  const body = text.trim();
  if (body.length === 0) {
    return true;
  }
  // This marker is matched against the original casing, not the lower-cased copy.
  if (body.indexOf("[object Object]") !== -1) {
    return true;
  }
  const folded = body.toLowerCase();
  const markers = [/\bread\s*\[/, /\btool\b/, /\bfunction\b/];
  return markers.some((pattern) => pattern.test(folded));
 }
 export function shouldRetryToolReadProbe(params: {
  text: string;
  nonceA: string;
@@ -16,19 +35,27 @@ export function shouldRetryToolReadProbe(params: {
  if (hasExpectedToolNonce(params.text, params.nonceA, params.nonceB)) {
    return false;
  }
-  const trimmed = params.text.trim();
+  if (hasMalformedToolOutput(params.text)) {
  if (!trimmed) {
    return true;
  }
  const lower = trimmed.toLowerCase();
  if (trimmed.includes("[object Object]")) {
    return true;
  }
  if (/\bread\s*\[/.test(lower) || /\btool\b/.test(lower) || /\bfunction\b/.test(lower)) {
    return true;
  }
  const lower = params.text.trim().toLowerCase();
  if (params.provider === "mistral" && (lower.includes("noncea=") || lower.includes("nonceb="))) {
    return true;
  }
  return false;
 }
 /**
  * Decides whether the exec+read probe should be attempted again.
  *
  * A retry is requested only when all of the following hold:
  * - another attempt is still available (`attempt + 1` is below `maxAttempts`),
  * - the reply text does not already contain the expected nonce, and
  * - the reply text is classified as malformed tool output.
  *
  * @param params.text - Reply text returned by the probe.
  * @param params.nonce - Nonce the reply is expected to contain.
  * @param params.attempt - Index of the attempt that just finished; the visible caller starts at 0.
  * @param params.maxAttempts - Total number of attempts allowed.
  * @returns `true` when the caller should retry, otherwise `false`.
  */
 export function shouldRetryExecReadProbe(params: {
  text: string;
  nonce: string;
  attempt: number;
  maxAttempts: number;
 }): boolean {
  const { text, nonce, attempt, maxAttempts } = params;
  const attemptsExhausted = attempt + 1 >= maxAttempts;
  // Exhaustion is checked before the nonce, so no text inspection happens on the last attempt.
  if (attemptsExhausted || hasExpectedSingleNonce(text, nonce)) {
    return false;
  }
  return hasMalformedToolOutput(text);
 }