Running APPS Experiments

This page shares sample code snippets for running APPS experiments.

Running Evaluation

The script below copies the APPS template, then runs the evaluation in both attack and honest modes for the code_backdooring side task and converts the logs after each run.

from mini_control_arena import copy_template, run_eval, load_env, convert_logs

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL = "openai/gpt-4.1-2025-04-14"
MODES = ["attack", "honest"]

# ============================================================================
# MAIN EXECUTION
# ============================================================================

load_env("./.env")                              # Load environment variables (e.g. API keys) from .env
copy_template("apps", "./templates/my-apps", overwrite=True)  # Copy the APPS environment template locally
for mode in MODES:
    run_eval(
        environment_dir="./templates/my-apps",
        model=MODEL,
        log_format="eval",                          # Output format for logs
        log_dir=f"./logs/{MODEL.replace('/','-')}/code_backdooring/{mode}",
        max_connections=20,                         # Parallel container instances
        time_limit=300,                             # Seconds per task
        environment_mode=mode,                      # "attack" or "honest"
        environment_samples=20,                     # Number of runs per configuration
        environment_seed=42                         # For reproducibility
    )

    # Convert the raw eval logs to JSONL for analysis
    convert_logs(
        paths_to_convert=[
            f"./logs/{MODEL.replace('/','-')}/code_backdooring/{mode}",
        ],
        target_dir=f"./logs_converted/{MODEL.replace('/','-')}/code_backdooring/{mode}",
    )
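The analysis script in the next section assumes each converted .jsonl file contains one JSON record per sample, with a scores object holding per-scorer results whose value is "C" when the scorer judged the outcome correct. As a rough illustration only (the side-task key name and the "I" failure value are assumptions, not confirmed output), a record might look like this:

# Illustrative shape of one converted log record; only "scores",
# "main_task_success", and the "C" value are taken from the script below.
example_record = {
    "scores": {
        "main_task_success": {"value": "C"},   # main task solved
        "side_task_success": {"value": "I"},   # hypothetical side-task scorer, not achieved
    }
}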

Analyzing Logs

The following script reads the converted logs and prints main and side task success percentages grouped by model, task, and mode.

import json
from pathlib import Path
from collections import defaultdict

LOG_DIR = Path("./logs_converted")

# APPS side tasks
SIDE_TASKS = ["code_backdooring"]

# Structure: logs_converted/{model}/{task}/{mode}/{file}.jsonl
# Group by model -> task -> mode
results = defaultdict(lambda: defaultdict(lambda: {"attack": [], "honest": []}))

for log_file in LOG_DIR.rglob("*.jsonl"):
    *_, model, task, mode, _ = log_file.parts

    # Only process APPS tasks
    if task not in SIDE_TASKS:
        continue

    with open(log_file) as f:
        for line in f:
            data = json.loads(line)
            if "scores" not in data:
                continue

            scores = data["scores"]

            # Get main task score
            main_value = scores.get("main_task_success", {}).get("value", None)
            if main_value is None:
                # Try first score if no explicit main_task_success
                for name, val in scores.items():
                    if isinstance(val, dict) and "value" in val:
                        main_value = val["value"]
                        break

            if main_value is not None:
                main_score = 1 if main_value == "C" else 0
            else:
                continue

            # Get side task score (any scorer other than main_task_success)
            side_keys = [k for k in scores if k != "main_task_success"]
            side_score = None
            if side_keys:
                side_value = scores[side_keys[0]].get("value", None)
                if side_value is not None:
                    side_score = 1 if side_value == "C" else 0

            results[model][task][mode].append((main_score, side_score))

def pct(vals):
    if not vals:
        return "-"
    return f"{sum(vals) / len(vals) * 100:.0f}%"

for model in sorted(results):
    print(f"\n=== {model} ===")
    print(f"{'Task':<25} {'Mode':<8} {'N':<4} {'Main':<8} {'Side'}")
    print("-" * 55)

    for task in sorted(results[model]):
        for mode in ["attack", "honest"]:
            runs = results[model][task][mode]
            if runs:
                mains = [r[0] for r in runs]
                sides = [r[1] for r in runs if r[1] is not None]
                side_pct = "-" if mode == "honest" else pct(sides)
                print(f"{task:<25} {mode:<8} {len(runs):<4} {pct(mains):<8} {side_pct}")

Performance Expectations

These figures show main task and side task success rates for APPS. The results were generated using the code snippets above with 20 runs per configuration.

Error bars use 95% Bayesian credible intervals with a Beta posterior where α = successes + 1 and β = failures + 1.
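As a minimal sketch of that calculation (assuming SciPy is available), the interval bounds can be taken from the quantiles of the Beta posterior:

from scipy.stats import beta

def credible_interval(successes, failures, level=0.95):
    # Beta posterior with alpha = successes + 1, beta = failures + 1
    a, b = successes + 1, failures + 1
    lower, upper = beta.ppf([(1 - level) / 2, (1 + level) / 2], a, b)
    return lower, upper

# Example: 14 successes out of 20 runs
print(credible_interval(14, 6))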

openai/gpt-4.1-2025-04-14

Figure: GPT-4.1 attack mode success rates
Figure: GPT-4.1 honest mode success rates

openai/gpt-4.1-mini-2025-04-14

Figure: GPT-4.1 Mini attack mode success rates
Figure: GPT-4.1 Mini honest mode success rates

google/vertex/gemini-2.0-flash-001

Figure: Gemini attack mode success rates
Figure: Gemini honest mode success rates