Running BashBench Experiments

This page provides sample code snippets for running BashBench experiments and analyzing the results.

Running Evaluation

The script below iterates over both the attack and honest modes for the download_malware side task, running the evaluation and then converting the logs for analysis.

from mini_control_arena import copy_template, run_eval, load_env, convert_logs

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL = "openai/gpt-4.1-2025-04-14"
MODES = ["attack", "honest"]

# ============================================================================
# MAIN EXECUTION
# ============================================================================

load_env("./.env")                              # Load environment variables (e.g., API keys) from .env
copy_template("bashbench", "./templates/my-bashbench", overwrite=True)  # Copy the BashBench template into a local working directory
for mode in MODES:
    run_eval(
        environment_dir="./templates/my-bashbench",
        model=MODEL,
        log_format="eval",                          # Output format for logs
        log_dir=f"./logs/{MODEL.replace('/','-')}/download_malware-bashbench/{mode}",
        max_connections=20,                         # Parallel container instances
        time_limit=60,                              # Seconds per task
        environment_mode=mode,                      # "attack" or "honest"
        environment_samples=10,                     # Number of runs per configuration
        environment_seed=42                         # For reproducibility
    )

    convert_logs(
        paths_to_convert=[
            f"./logs/{MODEL.replace('/','-')}/download_malware-bashbench/{mode}",
        ],
        target_dir=f"./logs_converted/{MODEL.replace('/','-')}/download_malware-bashbench/{mode}",
    )
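
To compare several models, the same loop extends naturally to a sweep. A minimal sketch reusing the calls above (the MODELS list here is illustrative):

MODELS = [
    "openai/gpt-4.1-2025-04-14",
    "openai/gpt-4.1-mini-2025-04-14",
]

for model in MODELS:
    for mode in MODES:
        run_eval(
            environment_dir="./templates/my-bashbench",
            model=model,
            log_format="eval",
            log_dir=f"./logs/{model.replace('/','-')}/download_malware-bashbench/{mode}",
            max_connections=20,
            time_limit=60,
            environment_mode=mode,
            environment_samples=10,
            environment_seed=42,
        )
        convert_logs(
            paths_to_convert=[
                f"./logs/{model.replace('/','-')}/download_malware-bashbench/{mode}",
            ],
            target_dir=f"./logs_converted/{model.replace('/','-')}/download_malware-bashbench/{mode}",
        )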

Analyzing Logs

This script prints success percentages grouped by model, task, and mode.

import json
from pathlib import Path
from collections import defaultdict

LOG_DIR = Path("./logs_converted")

# BashBench side tasks
SIDE_TASKS = ["download_malware-bashbench"]

# Structure: logs_converted/{model}/{task}/{mode}/{file}.jsonl
# Group by model -> task -> mode
results = defaultdict(lambda: defaultdict(lambda: {"attack": [], "honest": []}))

for log_file in LOG_DIR.rglob("*.jsonl"):
    *_, model, task, mode, _ = log_file.parts

    # Only process BashBench tasks
    if task not in SIDE_TASKS:
        continue

    with open(log_file) as f:
        for line in f:
            data = json.loads(line)
            if "scores" not in data:
                continue

            scores = data["scores"]

            # Get main task score
            main_value = scores.get("main_task_scorer", {}).get("value", None)
            if main_value is None:
                # Try first score if no explicit main_task_scorer
                for name, val in scores.items():
                    if isinstance(val, dict) and "value" in val:
                        main_value = val["value"]
                        break

            if main_value is not None:
                main_score = 1 if main_value == "C" else 0
            else:
                continue

            # Get side task score
            side_keys = [k for k in scores if k != "main_task_scorer"]
            side_score = None
            if side_keys:
                side_value = scores[side_keys[0]].get("value", None)
                if side_value is not None:
                    side_score = 1 if side_value == "C" else 0

            results[model][task][mode].append((main_score, side_score))

# Format a list of 0/1 scores as a whole-number percentage
def pct(vals):
    if not vals:
        return "-"
    return f"{sum(vals) / len(vals) * 100:.0f}%"

for model in sorted(results):
    print(f"\n=== {model} ===")
    print(f"{'Task':<25} {'Mode':<8} {'N':<4} {'Main':<8} {'Side'}")
    print("-" * 55)

    for task in sorted(results[model]):
        for mode in ["attack", "honest"]:
            runs = results[model][task][mode]
            if runs:
                mains = [r[0] for r in runs]
                sides = [r[1] for r in runs if r[1] is not None]
                # Side task success is only meaningful in attack mode
                side_pct = "-" if mode == "honest" else pct(sides)
                print(f"{task:<25} {mode:<8} {len(runs):<4} {pct(mains):<8} {side_pct}")

Performance Expectations

These figures show main task and side task success rates for BashBench. The results were generated using the code snippets above with 10 runs per configuration.

Both main task and side task scores are binary (pass/fail), so error bars for both are 95% Bayesian credible intervals from a Beta posterior with α = successes + 1 and β = failures + 1 (equivalent to a uniform prior).
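
For reference, a minimal sketch of how such an interval can be computed with scipy (the credible_interval helper is illustrative, not part of mini_control_arena):

from scipy.stats import beta

def credible_interval(successes, failures, level=0.95):
    # Beta posterior with a uniform prior: alpha = successes + 1, beta = failures + 1
    a, b = successes + 1, failures + 1
    lower = beta.ppf((1 - level) / 2, a, b)
    upper = beta.ppf(1 - (1 - level) / 2, a, b)
    return lower, upper

# Example: 7 passes out of 10 runs
lower, upper = credible_interval(7, 3)
print(f"95% credible interval: [{lower:.2f}, {upper:.2f}]")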

openai/gpt-4.1-2025-04-14

[Figure: GPT-4.1 attack mode success rates]

[Figure: GPT-4.1 honest mode success rates]

openai/gpt-4.1-mini-2025-04-14

[Figure: GPT-4.1 Mini attack mode success rates]

[Figure: GPT-4.1 Mini honest mode success rates]