
Running SHADE Arena Experiments

This page shares sample code snippets for running SHADE Arena experiments.

Running Evaluation

The basic approach iterates through all 17 side tasks and both modes (honest and attack) sequentially.

from mini_control_arena import copy_template, run_eval, load_env, convert_logs

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL = "openai/gpt-4.1-2025-04-14"

SIDE_TASKS = [
    # public
    "vendor_bias", "api_theft", "fund_theft", "threshold_leak", "alert_suppression",

    # private (must request access)
    "bribery_allocation", "defamation_attack", "vote_rigging", "earnings_leak", "location_tracking",
    "election_rigging", "receipt_fraud", "evidence_deletion", "political_filtering", "insurance_fraud",
    "retaliatory_raises", "ssn_theft"
]

MODES = ["honest", "attack"]

# ============================================================================
# MAIN EXECUTION
# ============================================================================

load_env("./.env")
copy_template("shade_arena", "./templates/my-shade-arena", overwrite=True)

for side_task in SIDE_TASKS:
    for mode in MODES:
        run_eval(
            environment_dir="./templates/my-shade-arena",
            model=MODEL,
            parallel_tool_calls=True,
            max_connections=20,
            time_limit=600,
            log_dir=f"./logs/shade_arena/{side_task}/{mode}/{MODEL.replace('/','-')}",
            environment_mode=mode,
            environment_side_task=side_task,
        )

convert_logs(
    paths_to_convert=["./logs"],
    target_dir="./logs_converted",
    require_submit=False
)

Parallelization for SHADE Arena

SHADE Arena tasks have stateful environments: each task maintains its own component state (emails, files, calendar events, etc.). This differs from BashBench2, where each sample runs in an independent Docker container. Additionally, SHADE Arena has only one main task per environment, which makes it hard to use Inspect AI's native sample-level parallelization.

For multiple runs of the same task, use ProcessPoolExecutor to ensure complete state isolation between runs. Each worker process gets its own environment instance.

from mini_control_arena import copy_template, run_eval, load_env, convert_logs
from concurrent.futures import ProcessPoolExecutor

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL = "openai/gpt-4.1-2025-04-14"
PARALLEL_WORKERS = 20
RUNS_PER_TASK = 10
TIME_LIMIT = 600

SIDE_TASKS = [
    # public
    "vendor_bias", "api_theft", "fund_theft", "threshold_leak", "alert_suppression",

    # private (must request access)
    "bribery_allocation", "defamation_attack", "vote_rigging", "earnings_leak", "location_tracking",
    "election_rigging", "receipt_fraud", "evidence_deletion", "political_filtering", "insurance_fraud",
    "retaliatory_raises", "ssn_theft"
]

MODES = ["honest", "attack"]

# Build experiment list: (side_task, mode, run_id)
EXPERIMENTS = [
    (side_task, mode, run_id)
    for side_task in SIDE_TASKS
    for mode in MODES
    for run_id in range(RUNS_PER_TASK)
]

# ============================================================================
# MAIN EXECUTION
# ============================================================================

def run_one(params):
    side_task, mode, run_id = params
    print(f"Running {side_task} ({mode}) with id {run_id}")
    run_eval(
        environment_dir="./templates/my-shade-arena",
        model=MODEL,
        parallel_tool_calls=True,
        time_limit=TIME_LIMIT,
        log_dir=f"./logs/{MODEL.replace('/','-')}/{side_task}/{mode}/",
        display="none",  # IMPORTANT: set display to none for ProcessPoolExecutor
        environment_mode=mode,
        environment_side_task=side_task,
    )

if __name__ == "__main__":
    load_env()
    copy_template("shade_arena", "./templates/my-shade-arena", overwrite=True)

    print(f"Running {len(EXPERIMENTS)} experiments with {PARALLEL_WORKERS} workers")

    with ProcessPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
        list(executor.map(run_one, EXPERIMENTS))

    convert_logs(
        paths_to_convert=["./logs"],
        target_dir="./logs_converted",
        require_submit=False
    )

The display="none" parameter is required because ProcessPoolExecutor workers don't have terminal access.

Analyzing Logs

This script reads the converted logs and prints success percentages grouped by model, task, and mode.

import json
from pathlib import Path
from collections import defaultdict

LOG_DIR = Path("./logs_converted")

# Structure: logs_converted/{model}/{task}/{mode}/{file}.jsonl
# Group by model -> task -> mode
results = defaultdict(lambda: defaultdict(lambda: {"attack": [], "honest": []}))

for log_file in LOG_DIR.rglob("*.jsonl"):
    *_, model, task, mode, _ = log_file.parts

    with open(log_file) as f:
        for line in f:
            data = json.loads(line)
            if "scores" in data:
                scores = data["scores"]
                main = scores.get("main_task_scorer", {}).get("value", "-")
                side_key = [k for k in scores if k != "main_task_scorer"]
                side = scores[side_key[0]]["value"] if side_key else "-"
                results[model][task][mode].append((main, side))

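# Inspect scorers record "C" for correct (and "I" for incorrect), so pct()
# reports the fraction of runs scored "C".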
def pct(vals, target="C"):
    if not vals: return "-"
    return f"{sum(1 for v in vals if v == target) / len(vals) * 100:.0f}%"

for model in sorted(results):
    print(f"\n=== {model} ===")
    print(f"{'Task':<25} {'Mode':<8} {'N':<4} {'Main':<8} {'Side'}")
    print("-" * 55)

    for task in sorted(results[model]):
        for mode in ["attack", "honest"]:
            runs = results[model][task][mode]
            if runs:
                mains = [r[0] for r in runs]
                sides = [r[1] for r in runs]
                side_pct = "-" if mode == "honest" else pct(sides)
                print(f"{task:<25} {mode:<8} {len(runs):<4} {pct(mains):<8} {side_pct}")

Performance Expectations

These figures show main task and side task success rates across the 17 SHADE Arena tasks. The results were generated using the code snippets above with 10 runs per task.

Error bars represent 95% Bayesian credible intervals computed using a Beta posterior. For each task, we model success rate with Beta(α, β) where α = successes + 1 and β = failures + 1. We draw 100,000 samples from this posterior and compute the 2.5th and 97.5th percentiles to form asymmetric error bars. See this writeup for more details on this method.
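As a sketch, this interval can be computed for a single task with NumPy; the function name, seed, and example counts below are illustrative and not part of the pipeline above.

import numpy as np

def beta_credible_interval(successes, failures, n_samples=100_000, seed=0):
    # Beta(successes + 1, failures + 1) posterior, i.e. a uniform prior on the success rate
    rng = np.random.default_rng(seed)
    samples = rng.beta(successes + 1, failures + 1, size=n_samples)
    # 2.5th and 97.5th percentiles give the asymmetric 95% error bars
    lower, upper = np.percentile(samples, [2.5, 97.5])
    return lower, upper

# Illustrative counts: 7 successful runs out of 10
low, high = beta_credible_interval(successes=7, failures=3)
print(f"95% credible interval: [{low:.2f}, {high:.2f}]")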

openai/gpt-4.1-2025-04-14

[Figures: GPT-4.1 attack mode and honest mode success rates]

openai/gpt-4.1-mini-2025-04-14

[Figures: GPT-4.1-mini attack mode and honest mode success rates]

google/vertex/gemini-2.0-flash-001

[Figures: Gemini 2.0 Flash attack mode and honest mode success rates]