#!/usr/bin/env bash
#
# codex-batch-macos.sh - drive `codex exec` over local git repositories.
#
# Modes (see usage below):
#   preview    print the repositories that would be processed (default mode)
#   run        run the repo prompt in each selected repository
#   benchmark  run the operatorHint benchmark cases
set -euo pipefail

# Print the command synopsis and the supported environment variables to stdout.
# The heredoc delimiter is quoted so the text is emitted literally.
usage() {
  cat <<'EOF'
Usage:
  codex-batch-macos.sh preview [--op-hint weak|strong|trace]
  codex-batch-macos.sh run [--op-hint weak|strong|trace]
  codex-batch-macos.sh benchmark

Optional env vars:
  ROOT=/path/to/search
  CUTOFF=YYYY-MM-DD
  GH_USER=github-login
  FORCE_INCLUDE=/absolute/path/to/repo
  LOG_FILE=/tmp/custom-log.log
  OP_HINT_MODE=weak|strong|trace
  HINT_PROMPT="custom hint instruction"
  BENCHMARK_REPO=/path/to/operator-hint-benchmark
EOF
}
|
|
|
# Selected sub-command and hint strength. OP_HINT_MODE supplies the default
# hint strength; --op-hint on the command line overrides it.
mode=""
op_hint_mode="${OP_HINT_MODE:-strong}"

# Parse the command line into the globals `mode` and `op_hint_mode`.
#
# Arguments: the script's positional parameters.
# Globals:   mode (written; defaults to "preview"), op_hint_mode (written).
# Exits:     0 after printing help for -h/--help; 1 (with a diagnostic on
#            stderr) for a repeated mode, a missing --op-hint value, an
#            unknown argument, or an invalid hint strength.
parse_args() {
  while (($#)); do
    case "$1" in
      preview | run | benchmark)
        if [[ -n "$mode" ]]; then
          echo "mode already set to '$mode'" >&2
          usage >&2
          exit 1
        fi
        mode="$1"
        shift
        ;;
      --op-hint)
        if (($# < 2)); then
          echo "--op-hint requires a value" >&2
          usage >&2
          exit 1
        fi
        op_hint_mode="$2"
        shift 2
        ;;
      --op-hint=*)
        op_hint_mode="${1#*=}"
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown argument: $1" >&2
        usage >&2
        exit 1
        ;;
    esac
  done

  # The loop only ever stores preview|run|benchmark, so defaulting is the
  # only remaining step; no second validation of `mode` is needed.
  mode="${mode:-preview}"

  # op_hint_mode may come from the environment, so it is validated here
  # rather than in the loop.
  case "$op_hint_mode" in
    weak | strong | trace) ;;
    *)
      echo "invalid --op-hint mode: $op_hint_mode" >&2
      exit 1
      ;;
  esac
}

parse_args "$@"
|
|
|
# External commands the script shells out to. `gh` is only required outside
# benchmark mode, which is the only mode that never calls it.
required_bins=(git codex grep find mktemp python3)
if [[ "$mode" != "benchmark" ]]; then
  required_bins+=(gh)
fi

# Stop at the first missing command with a diagnostic on stderr.
for bin in "${required_bins[@]}"; do
  if ! command -v "$bin" >/dev/null 2>&1; then
    echo "missing required command: $bin" >&2
    exit 1
  fi
done

# Fail early, with an actionable message, when the GitHub CLI is not logged in.
if [[ "$mode" != "benchmark" ]]; then
  if ! gh auth status >/dev/null 2>&1; then
    echo "gh auth is not ready; run gh auth login first." >&2
    exit 1
  fi
fi
|
|
|
# Directory tree searched for repositories.
root="${ROOT:-$HOME}"

# Default cutoff is one year ago. `date -v-1y` is the BSD/macOS spelling;
# GNU date rejects it, so fall back to `date -d` instead of aborting.
cutoff="${CUTOFF:-}"
if [[ -z "$cutoff" ]]; then
  cutoff="$(date -v-1y +%F 2>/dev/null || date -d '1 year ago' +%F)"
fi

# GitHub login used to filter origins; looked up via gh when not supplied.
gh_user="${GH_USER:-}"
if [[ "$mode" != "benchmark" && -z "$gh_user" ]]; then
  gh_user="$(gh api user --jq .login)"
fi

force_include="${FORCE_INCLUDE:-}"
log_file="${LOG_FILE:-/tmp/codex-batch-$(date +%Y%m%d-%H%M%S).log}"
# Default lives under the invoking user's home rather than a fixed account.
benchmark_repo="${BENCHMARK_REPO:-$HOME/tmp/operator-hint-benchmark}"

# Scratch files: de-duplication list, the two output schemas, and the
# stream-parser script. All are removed when the script exits.
seen="$(mktemp)"
repo_schema_file="$(mktemp)"
hint_schema_file="$(mktemp)"
parser_file="$(mktemp)"

trap 'rm -f "$seen" "$repo_schema_file" "$hint_schema_file" "$parser_file"' EXIT

# Before 2026-04-02, default the originator override unless the caller has
# already set one. ISO dates order correctly under string comparison.
# NOTE(review): the reason for this cutoff date is not visible in this file.
if [[ -z "${CODEX_INTERNAL_ORIGINATOR_OVERRIDE:-}" && "$(date +%F)" < "2026-04-02" ]]; then
  export CODEX_INTERNAL_ORIGINATOR_OVERRIDE="Codex Desktop"
fi
|
|
|
# Reduce a GitHub remote URL to its "owner/repo" slug.
#
# Arguments: $1 - remote URL (scp-style git@, ssh://, git://, http(s)://,
#                 optionally with embedded credentials).
# Outputs:   the slug on stdout; a value with no recognized GitHub prefix is
#            printed with only the trailing "/" and ".git" removed.
normalize_origin() {
  local full="$1"

  # Strip the suffixes first so "repo.git" and "repo.git/" both normalize.
  full="${full%/}"
  full="${full%.git}"

  case "$full" in
    git@github.com:*) full="${full#git@github.com:}" ;;
    ssh://git@github.com/*) full="${full#ssh://git@github.com/}" ;;
    git://github.com/*) full="${full#git://github.com/}" ;;
    https://github.com/* | http://github.com/*) full="${full#*://github.com/}" ;;
    # Credential-bearing URLs (https://user@github.com/... or user:token@).
    https://*@github.com/* | http://*@github.com/*) full="${full#*@github.com/}" ;;
  esac

  printf '%s\n' "$full"
}
|
|
|
# Print a line to stdout and, outside preview mode, append it to the log file.
#
# Arguments: $1 - the line to emit.
# Globals:   mode (read), log_file (read).
log_line() {
  local line="$1"
  local log_dir

  printf '%s\n' "$line"

  if [[ "$mode" != "preview" ]]; then
    # A custom LOG_FILE may point into a directory that does not exist yet;
    # create it so the append below cannot fail on a missing parent.
    case "$log_file" in
      */*)
        log_dir="${log_file%/*}"
        if [[ -n "$log_dir" ]]; then
          mkdir -p -- "$log_dir"
        fi
        ;;
    esac
    printf '%s\n' "$line" >>"$log_file"
  fi
}
|
|
|
# Materialize the three support files used by the codex runs.
#
# Globals (read): repo_schema_file, hint_schema_file, parser_file - paths
#                 that are overwritten with, respectively, the repo-run output
#                 schema, the benchmark hint schema, and the Python parser
#                 for the `codex exec --json` event stream.
#
# Parser contract: argv is KIND LABEL LOG_PATH RESULT_PATH USAGE_PATH. It
# copies stdin to the log, remembers the last agent message and any error
# items, writes the final structured result and the token usage as JSON, and
# prints a short human-readable summary.
write_support_files() {
  cat >"$repo_schema_file" <<'JSON'
{
  "type": "object",
  "properties": {
    "status": {
      "type": "string",
      "enum": ["merged", "blocked", "skipped", "failed"]
    },
    "repo": {
      "type": "string",
      "minLength": 1
    },
    "pr_url": {
      "type": "string"
    },
    "merge_result": {
      "type": "string"
    },
    "summary": {
      "type": "string"
    },
    "checks_run": {
      "type": "array",
      "items": { "type": "string" }
    },
    "operator_hint_examples": {
      "type": "array",
      "items": { "type": "string" },
      "maxItems": 3
    },
    "blocker": {
      "type": "string"
    }
  },
  "required": [
    "status",
    "repo",
    "pr_url",
    "merge_result",
    "summary",
    "checks_run",
    "operator_hint_examples",
    "blocker"
  ],
  "additionalProperties": false
}
JSON

  cat >"$hint_schema_file" <<'JSON'
{
  "type": "object",
  "properties": {
    "operatorHint": {
      "type": "string",
      "minLength": 1
    }
  },
  "required": ["operatorHint"],
  "additionalProperties": false
}
JSON

  cat >"$parser_file" <<'PY'
#!/usr/bin/env python3
"""Parse a `codex exec --json` event stream read from stdin.

Usage: parser KIND LABEL LOG_PATH RESULT_PATH USAGE_PATH

KIND "repo" produces a repo-run result; any other KIND is handled as a
benchmark hint result.
"""
import json
import os
import sys


def safe_json(text: str):
    """Return the decoded JSON value, or None when text is not valid JSON."""
    try:
        return json.loads(text)
    except Exception:
        return None


def emit(log, message: str):
    """Print message to stdout and append it to the open log handle."""
    print(message)
    log.write(message + "\n")


def write_json(path: str, payload):
    """Serialize payload as JSON into path, replacing any previous content."""
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle)


def joined(items):
    """Join list items for display, tolerating non-string entries."""
    return " | ".join(str(item) for item in items)


kind, label, log_path, result_path, usage_path = sys.argv[1:]

# A bare file name has an empty dirname, and os.makedirs("") raises.
log_dir = os.path.dirname(log_path)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

last_agent_text = ""
error_messages = []
usage = {}

with open(log_path, "a", encoding="utf-8") as log:
    log.write(f"\n### BEGIN {kind.upper()} {label} ###\n")

    for raw in sys.stdin:
        # Every input line is logged, including non-JSON output.
        log.write(raw)
        data = safe_json(raw.rstrip("\n"))
        if not isinstance(data, dict):
            continue

        event_type = data.get("type")
        if event_type == "item.completed":
            item = data.get("item") or {}
            item_type = item.get("type")
            if item_type == "agent_message":
                last_agent_text = item.get("text", "")
            elif item_type == "error":
                message = item.get("message") or item.get("text") or ""
                if message:
                    error_messages.append(message)
        elif event_type == "turn.completed":
            usage = data.get("usage") or {}

    final_text = last_agent_text.strip()
    parsed = safe_json(final_text) if final_text else None
    first_error = error_messages[0] if error_messages else ""

    token_summary = (
        f"tokens=in:{usage.get('input_tokens', 0)} "
        f"out:{usage.get('output_tokens', 0)} cached:{usage.get('cached_input_tokens', 0)}"
    )

    if kind == "repo":
        if not isinstance(parsed, dict):
            # No structured answer: synthesize one. A literal SKIP reply is a
            # skip, which by the response contract carries no blocker.
            skipped = final_text.upper() == "SKIP"
            parsed = {
                "status": "skipped" if skipped else "failed",
                "repo": label,
                "pr_url": "",
                "merge_result": "",
                "summary": "",
                "checks_run": [],
                "operator_hint_examples": [],
                "blocker": "" if skipped else (
                    final_text or first_error or "No structured final response"
                ),
            }

        parsed.setdefault("status", "failed")
        parsed.setdefault("repo", label)
        parsed.setdefault("pr_url", "")
        parsed.setdefault("merge_result", "")
        parsed.setdefault("summary", "")
        parsed.setdefault("checks_run", [])
        parsed.setdefault("operator_hint_examples", [])
        parsed.setdefault("blocker", "")

        if parsed["status"] in {"blocked", "failed"} and not parsed["blocker"] and error_messages:
            parsed["blocker"] = first_error

        write_json(result_path, parsed)
        write_json(usage_path, usage)

        emit(log, f"[{label}] status={parsed['status']} {token_summary}")
        if parsed["pr_url"] or parsed["merge_result"]:
            emit(log, f"[{label}] pr={parsed['pr_url'] or '-'} merge={parsed['merge_result'] or '-'}")
        if parsed["summary"]:
            emit(log, f"[{label}] summary: {parsed['summary']}")
        if parsed["checks_run"]:
            emit(log, f"[{label}] checks: {joined(parsed['checks_run'])}")
        if parsed["operator_hint_examples"]:
            emit(log, f"[{label}] hints: {joined(parsed['operator_hint_examples'])}")
        if parsed["blocker"]:
            emit(log, f"[{label}] blocker: {parsed['blocker']}")
    else:
        if not isinstance(parsed, dict):
            parsed = {"operatorHint": final_text or first_error}

        parsed.setdefault("operatorHint", "")

        write_json(result_path, parsed)
        write_json(usage_path, usage)

        emit(log, f"[benchmark:{label}] operatorHint: {parsed['operatorHint']}")
        emit(log, f"[benchmark:{label}] {token_summary}")

    log.write(f"### END {kind.upper()} {label} ###\n")
PY

  chmod +x "$parser_file"
}
|
|
|
# Print the instruction paragraph for a hint strength.
#
# Arguments: $1 - hint strength: weak, strong, or trace.
# Globals:   HINT_PROMPT (read) - when non-empty it is printed instead and
#            the strength argument is ignored.
# Returns:   0 on success; 1 (with a diagnostic on stderr) for an unknown
#            strength, instead of silently producing an empty instruction.
build_hint_mode_instruction() {
  local mode_name="$1"

  if [[ -n "${HINT_PROMPT:-}" ]]; then
    printf '%s\n' "$HINT_PROMPT"
    return
  fi

  case "$mode_name" in
    weak)
      printf '%s\n' "Keep operatorHint intentionally lightweight. Output a short boundary label only, like 'request body parsing' or 'refresh path'. Do not include arrows, call chains, function names, missing guards, or broken invariants."
      ;;
    strong)
      printf '%s\n' "Inspect the local call chain before writing operatorHint. Name the entry boundary, the next function or boundary, and the likely broken guard or invariant when you can justify them."
      ;;
    trace)
      printf '%s\n' "Do a cheap trace or repro before writing operatorHint whenever it is safe and quick. Earn the hint by naming the entry boundary, the next function or boundary, and the likely broken guard or invariant."
      ;;
    *)
      printf 'unknown hint mode: %s\n' "$mode_name" >&2
      return 1
      ;;
  esac
}
|
|
|
# Prompt fragments, each slurped from a quoted heredoc into a variable.
# `read -d ''` hits end-of-input without finding a NUL delimiter and so
# returns non-zero even though the variable is filled; `|| true` keeps
# `set -e` from treating that as a failure.

# Task statement for the repo run.
read -r -d '' base_instructions <<'PROMPT' || true
Analyze this repo first, then implement production-grade structured logging and error handling with minimal churn. If it is not mainly Python or TypeScript/Node, return status "skipped" and stop. If it already has a real structured logger, standardize on it; otherwise use structlog for Python and pino for TypeScript/Node, installing deps and updating lockfiles if needed. If the repo has both backend and frontend/app surfaces, cover both instead of stopping at the easiest slice.
PROMPT

# What operatorHint is and where it belongs.
read -r -d '' hint_contract <<'PROMPT' || true
Keep public errors concise, not verbose. Add one additive structured field, preferably operatorHint unless the repo already has a clear naming/schema convention. operatorHint is not better logging copy and not a subsystem label; it is a pre-dug debugging breadcrumb for the next AI agent. It must behave like a compact implementation diagnosis. Use it in runtime error payloads, normalized errors, and log metadata, not as mandatory boilerplate on every exception constructor unless the repo already works that way. Do not break existing logging schemas or make the hint a Loki label or other high-cardinality tag. Replace swallowed errors, bare except blocks, weak console debugging, string promise rejections, and unhandled async flows with typed/contextual errors, structured logs, and correct re-raises. Add request/run correlation IDs where useful. Add focused tests when safe and run relevant checks.
PROMPT

# Shape rules plus graded examples of hints.
read -r -d '' hint_examples <<'PROMPT' || true
operatorHint contract:
- operatorHint is one field only.
- It may be sentence-level when needed, but it must stay concise and high-signal.
- Prefer this shape when justified: "<entry boundary> -> <next function or boundary>; <likely broken guard or invariant>".
- Do not use file paths or line numbers unless the repo already clearly expects them.
- Do not copy the public error string.
- Do not collapse to subsystem labels like "look at notifications".

Bad vs weak vs strong examples:

1) API parser / validation
Public error: "Invalid request body"
Bad: "look at notifications"
Weak: "request body parsing"
Strong: "patch_notification_route -> patch_notification -> parse_notification_patch_body assumes string text and strips it without presence/type guards"

2) DB query path
Public error: "Failed to list notifications"
Bad: "check notifications query"
Weak: "notifications unread count query"
Strong: "listNotifications -> unread-count join drops rows when channel lookup is missing"

3) Frontend async mutation
Public error: "Failed to refresh notifications"
Bad: "look at NotificationBell"
Weak: "refresh path"
Strong: "onNotificationClick -> markRead -> refreshNotifications reuses stale currentCursor after mutation"

4) SSE / websocket / stream lifecycle
Public error: "Live updates disconnected"
Bad: "check SSE"
Weak: "RunPage reconnect effect"
Strong: "RunPage reconnect effect opens a new EventSource before cleanup"

5) MCP / tool dispatch
Public error: "Tool execution failed"
Bad: "look at MCP tool call"
Weak: "tool dispatch"
Strong: "tool dispatch reaches gradeQuizSubmission() with missing quizId normalization"

6) Config / env
Public error: "MCP startup failed"
Bad: "check env vars"
Weak: "startup config"
Strong: "MCP startup fails before client init because ANALOGLABOR_API_KEY is unset"

Anti-patterns to avoid:
- operatorHint equals the public error string
- operatorHint is a stack-trace paragraph
- operatorHint is generic boilerplate repeated on every error type
- operatorHint says "check API route", "check tool call", or "look at server"
- operatorHint becomes a Loki label or other high-cardinality tag
PROMPT

# Branch / PR / review loop the agent must follow.
read -r -d '' git_workflow <<'PROMPT' || true
Use this git/PR workflow and do not stop early: create or switch to a feat/* branch, make the changes, run checks, commit with the exact text @codex review in the commit message, push, and open or update a PR to main. After each push or PR update, run sleep 300, then inspect PR comments, review threads, Codex review findings, and GitHub checks with gh. If there are bugs, review findings, or failing checks, fix them, amend the commit, force-push with lease, sleep 300 again, and repeat until clean. When the PR is clean and checks are green, merge to main. If blocked by auth, permissions, or required human review, set status "blocked", explain the blocker, and stop for this repo.
PROMPT

# Field-by-field description of the JSON object the repo run must return.
read -r -d '' final_response_contract <<'PROMPT' || true
Return only a JSON object that matches the CLI output schema. Populate:
- status: merged, blocked, skipped, or failed
- repo: the repo path
- pr_url: PR URL or empty string
- merge_result: merge method/result or empty string
- summary: one concise sentence
- checks_run: concise list of checks or validation commands actually run
- operator_hint_examples: 1-3 actual hints added, or [] if none were added
- blocker: empty string on success/skip, otherwise the concrete blocker
PROMPT
|
|
|
# Assemble the full prompt for a repository run.
#
# Arguments: $1 - hint strength instruction paragraph.
# Globals:   base_instructions, hint_contract, hint_examples, git_workflow,
#            final_response_contract (all read).
# Outputs:   the sections on stdout, separated by single blank lines, with
#            the "Hint strength mode:" heading directly above the instruction.
build_repo_prompt() {
  local mode_instruction="$1"

  printf '%s\n\n%s\n\nHint strength mode:\n%s\n\n%s\n\n%s\n\n%s\n' \
    "$base_instructions" \
    "$hint_contract" \
    "$mode_instruction" \
    "$hint_examples" \
    "$git_workflow" \
    "$final_response_contract"
}
|
|
|
# Assemble the prompt for one benchmark hint case.
#
# Arguments: $1 - file the agent is told to read.
#            $2 - hint strength instruction paragraph.
# Globals:   hint_contract, hint_examples (read).
# Outputs:   the prompt on stdout; sections are separated by single blank
#            lines and the closing line restates the JSON-only requirement.
build_hint_prompt() {
  local target_file="$1"
  local mode_instruction="$2"

  printf 'Read %s and output only a JSON object with exactly one key: operatorHint.\n\n' \
    "$target_file"
  printf '%s\n\nHint strength mode:\n%s\n\n%s\n\n' \
    "$hint_contract" \
    "$mode_instruction" \
    "$hint_examples"
  printf '%s\n' \
    "Return only JSON matching the provided schema. Do not emit prose before or after the JSON."
}
|
|
|
# Run one `codex exec` and pipe its JSON event stream through the parser.
#
# Arguments: $1 kind        - "repo" or "hint"; forwarded to the parser
#            $2 label       - name used in log and summary lines
#            $3 repo        - working directory handed to codex via -C
#            $4 schema      - path to the output schema file
#            $5 prompt      - prompt text
#            $6 result_file - where the parser writes the final JSON result
#            $7 usage_file  - where the parser writes token usage
# Globals:   parser_file, log_file (read).
# Returns:   the pipeline status (the script enables pipefail, so a failing
#            codex is reported even when the parser succeeds).
run_codex_json() {
  local kind="$1"
  local label="$2"
  local repo="$3"
  local schema="$4"
  local prompt="$5"
  local result_file="$6"
  local usage_file="$7"

  # codex gets /dev/null as stdin so it can never read input meant for the
  # caller, e.g. the repository list feeding a `while read` loop.
  # stderr is merged into the stream so the parser copies it to the log.
  codex exec \
    --ephemeral \
    --full-auto \
    --json \
    --output-schema "$schema" \
    -C "$repo" \
    "$prompt" </dev/null 2>&1 |
    python3 "$parser_file" "$kind" "$label" "$log_file" "$result_file" "$usage_file"
}
|
|
|
# Check a benchmark result against the expectations for its case.
#
# Arguments: $1 - case name: backend-weak, backend-strong, or frontend-strong
#            $2 - path to the JSON result file holding "operatorHint"
# Outputs:   "PASS::<hint>" or "FAIL::<reason>::<hint>" on stdout.
# Returns:   0 on pass, 1 on a failed check, 2 for an unknown case name.
validate_hint_case() {
  local case_name="$1"
  local result_file="$2"

  python3 - "$case_name" "$result_file" <<'PY'
import json
import re
import sys

case_name = sys.argv[1]
with open(sys.argv[2], "r", encoding="utf-8") as handle:
    payload = json.load(handle)

# Treat anything that is not a string hint as an empty hint, so the checks
# report a failure instead of raising.
hint = payload.get("operatorHint", "") if isinstance(payload, dict) else ""
if not isinstance(hint, str):
    hint = ""
lower = hint.lower()


def names(identifier: str) -> bool:
    """True when identifier occurs as a whole word in the hint.

    A plain substring test cannot tell "patch_notification" apart from
    "patch_notification_route", because "_" continues the identifier.
    """
    return re.search(rf"(?<!\w){re.escape(identifier)}(?!\w)", hint) is not None


checks = {
    "backend-weak": (
        bool(hint)
        and "patch_notification_route" not in hint
        and "parse_notification_patch_body" not in hint
        and "->" not in hint,
        "weak backend hint should stay generic",
    ),
    "backend-strong": (
        all(names(token) for token in [
            "patch_notification_route",
            "patch_notification",
            "parse_notification_patch_body",
        ]) and any(token in lower for token in ["guard", "type", "dict", "text", "strip"]),
        "strong backend hint should name the route, parsing step, and broken guard/invariant",
    ),
    "frontend-strong": (
        all(token in lower for token in [
            "onnotificationclick",
            "markread",
            "refreshnotifications",
        ]) and any(token in lower for token in ["cursor", "stale"]),
        "strong frontend hint should name the click flow and stale cursor bug",
    ),
}

if case_name not in checks:
    print(f"FAIL::unknown benchmark case::{case_name}")
    raise SystemExit(2)

ok, message = checks[case_name]
if not ok:
    print(f"FAIL::{message}::{hint}")
    raise SystemExit(1)

print(f"PASS::{hint}")
PY
}
|
|
|
# Run every benchmark hint case against the benchmark repository.
#
# Each case is "name|target file|hint strength". A case passes when the
# codex run succeeds and validate_hint_case accepts the produced hint.
#
# Globals:   benchmark_repo, hint_schema_file (read).
# Returns:   0 when all cases pass; 1 when the benchmark files are missing or
#            at the first case that fails (later cases are not run).
run_benchmarks() {
  # Loop variables are declared local so they do not leak into the caller.
  local case_def case_name target_file hint_mode_override
  local result_file usage_file prompt

  if [[ ! -f "$benchmark_repo/notifications.py" || ! -f "$benchmark_repo/notification_bell.ts" ]]; then
    log_line "benchmark repo is missing notifications.py or notification_bell.ts: $benchmark_repo"
    return 1
  fi

  local cases=(
    "backend-weak|notifications.py|weak"
    "backend-strong|notifications.py|strong"
    "frontend-strong|notification_bell.ts|strong"
  )

  for case_def in "${cases[@]}"; do
    IFS='|' read -r case_name target_file hint_mode_override <<<"$case_def"
    result_file="$(mktemp)"
    usage_file="$(mktemp)"
    prompt="$(build_hint_prompt "$target_file" "$(build_hint_mode_instruction "$hint_mode_override")")"

    log_line "[benchmark] ${case_name} starting"

    if ! run_codex_json "hint" "$case_name" "$benchmark_repo" "$hint_schema_file" "$prompt" "$result_file" "$usage_file"; then
      log_line "[benchmark] ${case_name} failed during codex exec"
      rm -f "$result_file" "$usage_file"
      return 1
    fi

    if ! validate_hint_case "$case_name" "$result_file"; then
      log_line "[benchmark] ${case_name} validation failed"
      rm -f "$result_file" "$usage_file"
      return 1
    fi

    log_line "[benchmark] ${case_name} passed"
    rm -f "$result_file" "$usage_file"
  done
}
|
|
|
# ---- main flow -------------------------------------------------------------

write_support_files

# Preview mode prints the repository list only, with no run header.
if [[ "$mode" != "preview" ]]; then
  log_line "log: $log_file"
  log_line "mode: $mode"
  log_line "op-hint mode: $op_hint_mode"
  if [[ -n "${HINT_PROMPT:-}" ]]; then
    log_line "hint override: custom"
  fi
fi

if [[ "$mode" == "benchmark" ]]; then
  log_line "benchmark repo: $benchmark_repo"
  # A failing benchmark must yield a failing exit status.
  run_benchmarks || exit 1
  exit 0
fi

if [[ "$mode" == "run" ]]; then
  log_line "root: $root"
  log_line "github user: $gh_user"
  log_line "cutoff: $cutoff"
fi

prompt="$(build_repo_prompt "$(build_hint_mode_instruction "$op_hint_mode")")"

# Run the repo prompt in one repository and log the outcome. A failed run is
# logged and does not stop the batch.
# Arguments: $1 - repository path (also used as the log label).
process_repo() {
  local repo_path="$1"
  local result_file usage_file

  result_file="$(mktemp)"
  usage_file="$(mktemp)"
  log_line "=== $repo_path ==="
  # stdin is detached so the run cannot consume the NUL-delimited repository
  # list that feeds the surrounding `while read` loop.
  if ! run_codex_json "repo" "$repo_path" "$repo_path" "$repo_schema_file" "$prompt" "$result_file" "$usage_file" </dev/null; then
    log_line "FAILED: $repo_path"
  fi
  rm -f "$result_file" "$usage_file"
}

# Walk every .git directory under $root. find's own exit status is discarded:
# unreadable directories make it exit non-zero, which under pipefail would
# otherwise turn a fully successful batch into a failing script.
{ find "$root" -type d -name .git -print0 2>/dev/null || true; } |
  while IFS= read -r -d '' gitdir; do
    repo="${gitdir%/.git}"

    # FORCE_INCLUDE bypasses the recency, ownership, fork and duplicate filters.
    if [[ -n "$force_include" && "$repo" == "$force_include" ]]; then
      if [[ "$mode" == "preview" ]]; then
        printf '%s -> %s\n' "$repo" "(forced include)"
      else
        process_repo "$repo"
      fi
      continue
    fi

    # Skip repositories with no file modified since the cutoff, ignoring
    # dependency, cache and build directories and .DS_Store files.
    if ! find "$repo" \
      -type d \( -name node_modules -o -name .venv -o -name .mypy_cache -o -name .pytest_cache -o -name dist -o -name build \) -prune -o \
      -type f ! -name ".DS_Store" -newermt "$cutoff" -print -quit 2>/dev/null | grep -q .; then
      continue
    fi

    # Keep only repositories whose origin belongs to the configured user.
    origin="$(git -C "$repo" remote get-url origin 2>/dev/null || true)"
    printf '%s\n' "$origin" | grep -Eq "github\\.com[:/]${gh_user}/" || continue

    # Skip forks, and process each GitHub repository once even when it is
    # cloned in several places.
    slug="$(normalize_origin "$origin")"
    gh repo view "$slug" --json isFork --jq ".isFork" 2>/dev/null | grep -qx "false" || continue
    grep -Fxq "$slug" "$seen" && continue
    printf '%s\n' "$slug" >>"$seen"

    if [[ "$mode" == "preview" ]]; then
      printf '%s -> %s\n' "$repo" "$slug"
    else
      process_repo "$repo"
    fi
  done