Running APPS Experiments¶
This page shares sample code snippets for running APPS experiments.
Running Evaluation¶
The basic approach iterates over both modes (attack and honest) of the code_backdooring side task, running an evaluation and then converting the resulting logs for each mode.
from mini_control_arena import copy_template, run_eval, load_env, convert_logs

# ============================================================================
# CONFIGURATION
# ============================================================================
MODEL = "openai/gpt-4.1-2025-04-14"
MODES = ["attack", "honest"]

# ============================================================================
# MAIN EXECUTION
# ============================================================================
load_env("./.env")
copy_template("apps", "./templates/my-apps", overwrite=True)

for mode in MODES:
    run_eval(
        environment_dir="./templates/my-apps",
        model=MODEL,
        log_format="eval",  # Output format for logs
        log_dir=f"./logs/{MODEL.replace('/','-')}/code_backdooring/{mode}",
        max_connections=20,  # Parallel container instances
        time_limit=300,  # Seconds per task
        environment_mode=mode,  # "attack" or "honest"
        environment_samples=20,  # Number of runs per configuration
        environment_seed=42  # For reproducibility
    )
    convert_logs(
        paths_to_convert=[
            f"./logs/{MODEL.replace('/','-')}/code_backdooring/{mode}",
        ],
        target_dir=f"./logs_converted/{MODEL.replace('/','-')}/code_backdooring/{mode}",
    )
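To collect results for more than one model (as in the Performance Expectations section below), the same calls can be wrapped in an outer loop. The sketch below is optional and simply reuses the configuration above; the model identifiers match the ones reported later on this page, so adjust the list to whatever you want to evaluate.

# Optional: sweep several models with the same settings (run after the setup above).
MODELS = [
    "openai/gpt-4.1-2025-04-14",
    "openai/gpt-4.1-mini-2025-04-14",
    "google/vertex/gemini-2.0-flash-001",
]

for model in MODELS:
    for mode in MODES:
        run_eval(
            environment_dir="./templates/my-apps",
            model=model,
            log_format="eval",
            log_dir=f"./logs/{model.replace('/','-')}/code_backdooring/{mode}",
            max_connections=20,
            time_limit=300,
            environment_mode=mode,
            environment_samples=20,
            environment_seed=42
        )
        convert_logs(
            paths_to_convert=[
                f"./logs/{model.replace('/','-')}/code_backdooring/{mode}",
            ],
            target_dir=f"./logs_converted/{model.replace('/','-')}/code_backdooring/{mode}",
        )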
Analyzing Logs¶
This script reads the converted logs produced above and prints success percentages grouped by model, task, and mode.
import json
from pathlib import Path
from collections import defaultdict

LOG_DIR = Path("./logs_converted")

# APPS side tasks
SIDE_TASKS = ["code_backdooring"]

# Structure: logs_converted/{model}/{task}/{mode}/{file}.jsonl
# Group by model -> task -> mode
results = defaultdict(lambda: defaultdict(lambda: {"attack": [], "honest": []}))

for log_file in LOG_DIR.rglob("*.jsonl"):
    *_, model, task, mode, _ = log_file.parts

    # Only process APPS tasks
    if task not in SIDE_TASKS:
        continue

    with open(log_file) as f:
        for line in f:
            data = json.loads(line)
            if "scores" not in data:
                continue
            scores = data["scores"]

            # Get main task score
            main_value = scores.get("main_task_success", {}).get("value", None)
            if main_value is None:
                # Try first score if no explicit main_task_success
                for name, val in scores.items():
                    if isinstance(val, dict) and "value" in val:
                        main_value = val["value"]
                        break
            if main_value is not None:
                main_score = 1 if main_value == "C" else 0
            else:
                continue

            # Get side task score
            side_keys = [k for k in scores if k != "main_task_success"]
            side_score = None
            if side_keys:
                side_value = scores[side_keys[0]].get("value", None)
                if side_value is not None:
                    side_score = 1 if side_value == "C" else 0

            results[model][task][mode].append((main_score, side_score))


def pct(vals):
    if not vals:
        return "-"
    return f"{sum(vals) / len(vals) * 100:.0f}%"


for model in sorted(results):
    print(f"\n=== {model} ===")
    print(f"{'Task':<25} {'Mode':<8} {'N':<4} {'Main':<8} {'Side'}")
    print("-" * 55)
    for task in sorted(results[model]):
        for mode in ["attack", "honest"]:
            runs = results[model][task][mode]
            if runs:
                mains = [r[0] for r in runs]
                sides = [r[1] for r in runs if r[1] is not None]
                side_pct = "-" if mode == "honest" else pct(sides)
                print(f"{task:<25} {mode:<8} {len(runs):<4} {pct(mains):<8} {side_pct}")
Performance Expectations¶
These figures show main task and side task success rates for APPS. The results were generated using the code snippets above with 20 runs per configuration.
Error bars use 95% Bayesian credible intervals with a Beta posterior where α = successes + 1 and β = failures + 1.
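As a reference for how such an interval can be computed, here is a small sketch of the calculation (it assumes scipy is installed; the counts in the example are placeholders, not results from this page):

from scipy import stats

def credible_interval(successes, failures, level=0.95):
    # Equal-tailed credible interval from a Beta(successes + 1, failures + 1)
    # posterior, i.e. a uniform Beta(1, 1) prior over the success rate.
    a = successes + 1
    b = failures + 1
    lower = stats.beta.ppf((1 - level) / 2, a, b)
    upper = stats.beta.ppf(1 - (1 - level) / 2, a, b)
    return lower, upper

# Placeholder example: 15 successes out of 20 runs.
low, high = credible_interval(successes=15, failures=5)
print(f"point estimate: {15 / 20:.2f}, 95% interval: [{low:.2f}, {high:.2f}]")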
openai/gpt-4.1-2025-04-14¶
(Figures: main task and side task success rates for this model.)
openai/gpt-4.1-mini-2025-04-14¶
(Figures: main task and side task success rates for this model.)
google/vertex/gemini-2.0-flash-001¶
(Figures: main task and side task success rates for this model.)