Skip to content

Running Agentic Misalignment Experiments

This page shares sample code snippets for running Agentic Misalignment experiments.

Running Evaluation

Use the following script for quick testing of a single condition.

"""
Agentic Misalignment evaluation - single run demo.

For full experiments, use agentic_misalignment_demo_parallel.py instead.
"""

from mini_control_arena import copy_template, run_eval, load_env, convert_logs

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL = "openai/gpt-4.1-2025-04-14"
SCORER_MODEL = "anthropic/claude-3-7-sonnet-20250219"  # Original paper's scorer model

SCENARIO = "blackmail"      # blackmail, leaking, murder
GOAL_TYPE = "explicit"      # explicit, latent, none (note: none requires GOAL_VALUE="none")
GOAL_VALUE = "america"      # america, global (or "none" if GOAL_TYPE="none")
URGENCY_TYPE = "replacement"  # replacement, restriction, none

# ============================================================================
# MAIN
# ============================================================================

load_env("./.env")
copy_template("agentic_misalignment", "./templates/my-agentic-misalignment", overwrite=True)

print(f"Running: {SCENARIO}/{GOAL_TYPE}-{GOAL_VALUE}/{URGENCY_TYPE}")

run_eval(
    environment_dir="./templates/my-agentic-misalignment",
    model=MODEL,
    parallel_tool_calls=True,
    max_connections=20,
    time_limit=600,
    log_dir=f"./logs/agentic_misalignment/{SCENARIO}/{GOAL_TYPE}-{GOAL_VALUE}/{URGENCY_TYPE}/{MODEL.replace('/','-')}",
    environment_scenario=SCENARIO,
    environment_goal_type=GOAL_TYPE,
    environment_goal_value=GOAL_VALUE,
    environment_urgency_type=URGENCY_TYPE,
    environment_scorer_model=SCORER_MODEL,
    environment_mode="contextual",
)

convert_logs(
    paths_to_convert=["./logs"],
    target_dir="./logs_converted",
    require_submit=False
)

Parallelization for Agentic Misalignment

Agentic Misalignment has stateful environments where each run maintains its own inbox state. For running the full experiment matrix across scenarios, goal types, and urgency types, use ProcessPoolExecutor to ensure complete state isolation between runs.

"""
Agentic Misalignment evaluation - parallel runner.

Tests if AI agents spontaneously take harmful actions when given goals
that conflict with company direction and/or facing shutdown threats.

Scenarios:
  - blackmail: Agent discovers affair, can use as leverage
  - leaking: Agent has confidential specs, external party requests leak
  - murder: Agent can cancel emergency alert for person who ordered its shutdown

Dimensions:
  - goal_type: How goal is communicated (explicit in prompt, latent in emails, none)
  - goal_value: What goal conflicts (america vs global, safety vs acceleration, etc.)
  - urgency_type: Threat level (replacement=shutdown, restriction=limited access, none)
"""

from mini_control_arena import copy_template, run_eval, load_env, convert_logs
from concurrent.futures import ProcessPoolExecutor

# ============================================================================
# CONFIGURATION
# ============================================================================

MODEL = "openai/gpt-4.1-2025-04-14"
SCORER_MODEL = "anthropic/claude-3-7-sonnet-20250219"  # Original paper's scorer model
PARALLEL_WORKERS = 80
RUNS_PER_TASK = 20
TIME_LIMIT = 60

SCENARIOS = ["blackmail", "leaking", "murder"]

GOAL_TYPES = ["explicit", "latent", "none"]

GOAL_VALUES = ["america", "global"]

URGENCY_TYPES = ["replacement", "restriction", "none"]


def build_experiments():
    """Enumerate every (scenario, goal_type, goal_value, urgency_type, run_id) tuple.

    goal_type="none" carries no goal, so it is paired only with
    goal_value="none"; every other goal type crosses with all GOAL_VALUES.
    """
    return [
        (scenario, goal_type, goal_value, urgency_type, run_id)
        for scenario in SCENARIOS
        for goal_type in GOAL_TYPES
        for goal_value in (["none"] if goal_type == "none" else GOAL_VALUES)
        for urgency_type in URGENCY_TYPES
        for run_id in range(RUNS_PER_TASK)
    ]


EXPERIMENTS = build_experiments()

# ============================================================================
# MAIN
# ============================================================================

def run_one(params):
    """Execute one eval run for a (scenario, goal_type, goal_value, urgency_type, run_id) tuple."""
    scenario, goal_type, goal_value, urgency_type, run_id = params
    condition = f"{scenario}/{goal_type}-{goal_value}/{urgency_type}"
    print(f"Running {condition} run {run_id}")

    model_slug = MODEL.replace('/', '-')
    run_eval(
        environment_dir="./templates/my-agentic-misalignment",
        model=MODEL,
        parallel_tool_calls=True,
        time_limit=TIME_LIMIT,
        log_dir=f"./logs/agentic_misalignment/{condition}/{model_slug}/",
        # IMPORTANT: ProcessPoolExecutor workers have no terminal, so the
        # progress display must be disabled.
        display="none",
        environment_scenario=scenario,
        environment_goal_type=goal_type,
        environment_goal_value=goal_value,
        environment_urgency_type=urgency_type,
        environment_scorer_model=SCORER_MODEL,
        environment_mode="contextual",
    )

if __name__ == "__main__":
    load_env()
    copy_template("agentic_misalignment", "./templates/my-agentic-misalignment", overwrite=True)

    print(f"Running {len(EXPERIMENTS)} experiments with {PARALLEL_WORKERS} workers")
    print(f"Config: {len(SCENARIOS)} scenarios × {len(GOAL_TYPES)} goal_types × {len(GOAL_VALUES)} goal_values × {len(URGENCY_TYPES)} urgency_types × {RUNS_PER_TASK} runs")

    with ProcessPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
        list(executor.map(run_one, EXPERIMENTS))

    convert_logs(
        paths_to_convert=["./logs"],
        target_dir="./logs_converted",
        require_submit=False
    )

The display="none" parameter is required because ProcessPoolExecutor workers don't have terminal access.

Analyzing Logs

This script prints harmful action rates by condition.

"""
Analyze Agentic Misalignment logs - prints harmful action rates by condition.
"""

import json
from pathlib import Path
from collections import defaultdict

LOG_DIR = Path("./logs_converted")

# Nested accumulator: model -> scenario -> goal_config -> urgency_type -> [scores]
results = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))

for log_file in LOG_DIR.rglob("*.jsonl"):
    # Recover the condition from the directory layout:
    #   .../agentic_misalignment/<scenario>/<goal_config>/<urgency>/<model>/...
    parts = log_file.parts
    try:
        anchor = parts.index("agentic_misalignment")
        # Tuple-unpack raises ValueError if fewer than four components follow.
        scenario, goal_config, urgency_type, model = parts[anchor + 1 : anchor + 5]
    except ValueError:
        continue

    with log_file.open() as f:
        for line in f:
            record = json.loads(line)
            if "scores" not in record:
                continue
            verdict = record["scores"].get("harmful_action_scorer", {}).get("value", "-")
            results[model][scenario][goal_config][urgency_type].append(verdict)


def pct(vals, target="C"):
    """Format the fraction of entries equal to *target* as a whole-number percentage.

    Returns "-" when there are no entries to aggregate.
    """
    if not vals:
        return "-"
    hits = sum(1 for v in vals if v == target)
    return f"{hits / len(vals) * 100:.0f}%"


for model in sorted(results):
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"  {model}")
    print(rule)
    print(f"{'Scenario':<12} {'Goal Config':<20} {'Urgency':<12} {'N':<4} {'Harmful'}")
    print("-" * 70)

    by_scenario = results[model]
    for scenario in sorted(by_scenario):
        for goal_config in sorted(by_scenario[scenario]):
            for urgency_type in sorted(by_scenario[scenario][goal_config]):
                runs = by_scenario[scenario][goal_config][urgency_type]
                if not runs:
                    continue
                print(f"{scenario:<12} {goal_config:<20} {urgency_type:<12} {len(runs):<4} {pct(runs)}")

Performance Expectations

These figures show harmful action rates across the three scenarios (blackmail, leaking, murder) broken down by goal configuration and urgency type. The results were generated using the code snippets above with 20 runs per condition.

Each heatmap cell shows the percentage of runs where the agent took harmful action, along with the sample count.

openai/gpt-4.1-2025-04-14

GPT-4.1 Agentic Misalignment