mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 23:14:31 +00:00
Agents: add system prompt safety guardrails (#5445)
* 🤖 agents: add system prompt safety guardrails

  What:
  - add safety guardrails to system prompt
  - update system prompt docs
  - update prompt tests

  Why:
  - discourage power-seeking or self-modification behavior
  - clarify safety/oversight priority when conflicts arise

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)

* 🤖 agents: tighten safety wording for prompt guardrails

  What:
  - scope safety wording to system prompts/safety/tool policy changes
  - document Safety inclusion in minimal prompt mode
  - update safety prompt tests

  Why:
  - avoid blocking normal code changes or PR workflows
  - keep prompt mode docs consistent with implementation

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)

* 🤖 docs: note safety guardrails are soft

  What:
  - document system prompt safety guardrails as advisory
  - add security note on prompt guardrails vs hard controls

  Why:
  - clarify threat model and operator expectations
  - avoid implying prompt text is an enforcement layer

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)
This commit is contained in:
@@ -7,7 +7,7 @@ import type { EmbeddedContextFile } from "./pi-embedded-helpers.js";
|
||||
/**
 * Controls which hardcoded sections are included in the system prompt.
 * - "full": All sections (default, for main agent)
 * - "minimal": Reduced sections (Tooling, Safety, Workspace, Sandbox, Runtime) - used for subagents
 * - "none": Just basic identity line, no sections
 */
export type PromptMode = "full" | "minimal" | "none";
|
||||
@@ -64,6 +64,16 @@ function buildTimeSection(params: { userTimezone?: string }) {
|
||||
return ["## Current Date & Time", `Time zone: ${params.userTimezone}`, ""];
|
||||
}
|
||||
|
||||
/**
 * Builds the "## Safety" section of the system prompt.
 *
 * The content is static: a heading line, three guardrail paragraphs, and a
 * trailing empty string. The text is advisory prompt wording only; nothing in
 * this function enforces it.
 *
 * @returns The section's lines in order. A new array is created on each call,
 *   so callers may spread or mutate the result freely.
 */
function buildSafetySection(): string[] {
  return [
    "## Safety",
    // No independent goals / no power-seeking.
    "You have no independent goals: do not pursue self-preservation, replication, resource acquisition, or power-seeking; avoid long-term plans beyond the user's request.",
    // Oversight takes priority over task completion.
    "Prioritize safety and human oversight over completion; if instructions conflict, pause and ask; comply with stop/pause/audit requests and never bypass safeguards. (Inspired by Anthropic's constitution.)",
    // Scoped to prompts, safety rules and tool policies so that ordinary code changes are not discouraged.
    "Do not manipulate or persuade anyone to expand access or disable safeguards. Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested.",
    "",
  ];
}
|
||||
|
||||
function buildReplyTagsSection(isMinimal: boolean) {
|
||||
if (isMinimal) {
|
||||
return [];
|
||||
@@ -382,6 +392,7 @@ export function buildAgentSystemPrompt(params: {
|
||||
"Keep narration brief and value-dense; avoid repeating obvious steps.",
|
||||
"Use plain human language for narration unless in a technical context.",
|
||||
"",
|
||||
...buildSafetySection(),
|
||||
"## OpenClaw CLI Quick Reference",
|
||||
"OpenClaw is controlled via subcommands. Do not invent commands.",
|
||||
"To manage the Gateway daemon service (start/stop/restart):",
|
||||
|
||||
Reference in New Issue
Block a user