Supertester.ChaosHelpers (Supertester v0.6.0)

Copy Markdown View Source

Chaos engineering toolkit for OTP resilience testing.

Provides controlled fault injection to verify system fault tolerance, recovery mechanisms, and graceful degradation under adverse conditions.

Key Features

  • Process crash injection
  • Random child killing in supervision trees
  • Resource exhaustion simulation
  • Comprehensive chaos scenario testing

Usage

import Supertester.ChaosHelpers

test "system survives random crashes" do
  {:ok, supervisor} = setup_isolated_supervisor(MySupervisor)

  # Pick a worker and inject random crashes
  [{_id, worker, _type, _modules} | _] = Supervisor.which_children(supervisor)
  inject_crash(worker, {:random, 0.3})

  # Run workload
  perform_work(1000)

  # Verify system recovered
  assert_all_children_alive(supervisor)
end

Summary

Functions

Asserts that the system recovers from chaos within the timeout.

Randomly kills children in a supervision tree to test restart strategies.

Injects controlled crashes into a process for resilience testing.

Runs a comprehensive chaos testing suite.

Simulates resource exhaustion scenarios.

Types

chaos_report()

@type chaos_report() :: %{
  killed: non_neg_integer(),
  restarted: non_neg_integer(),
  supervisor_crashed: boolean(),
  duration_ms: non_neg_integer()
}

chaos_suite_report()

@type chaos_suite_report() :: %{
  total_scenarios: non_neg_integer(),
  passed: non_neg_integer(),
  failed: non_neg_integer(),
  failures: [failure_report()],
  duration_ms: non_neg_integer()
}

crash_spec()

@type crash_spec() ::
  :immediate
  | {:after_ms, milliseconds :: pos_integer()}
  | {:random, probability :: float()}

failure_report()

@type failure_report() :: %{scenario: map(), reason: term()}

Functions

assert_chaos_resilient(system, chaos_fn, recovery_fn, opts \\ [])

@spec assert_chaos_resilient(pid(), (-> any()), (-> boolean()), keyword()) :: :ok

Asserts that the system recovers from chaos within the timeout.

Parameters

  • system - The system PID (supervisor or process)
  • chaos_fn - Function that applies chaos
  • recovery_fn - Function that checks if system recovered (returns boolean)
  • opts - Options (:timeout in ms, default: 5000)

Examples

assert_chaos_resilient(supervisor,
  fn -> chaos_kill_children(supervisor, kill_rate: 0.5) end,
  fn -> all_children_alive?(supervisor) end,
  timeout: 10_000
)

chaos_kill_children(supervisor, opts \\ [])

@spec chaos_kill_children(
  Supervisor.supervisor(),
  keyword()
) :: chaos_report()

Randomly kills children in a supervision tree to test restart strategies.

Options

  • :kill_rate - Fraction of children to kill, expressed as a float (default: 0.3, i.e. 30%)
  • :duration_ms - How long to run chaos (default: 5000)
  • :kill_interval_ms - Time between kills (default: 100)
  • :kill_reason - Reason for kills (default: :kill)

Examples

test "supervisor handles cascading failures" do
  {:ok, supervisor} = setup_isolated_supervisor(MySupervisor)

  report = chaos_kill_children(supervisor,
    kill_rate: 0.5,
    duration_ms: 3000,
    kill_interval_ms: 200
  )

  # Verify supervisor survived
  assert Process.alive?(supervisor)
  assert report.supervisor_crashed == false
end

inject_crash(target, crash_spec, opts \\ [])

@spec inject_crash(pid(), crash_spec(), keyword()) :: :ok

Injects controlled crashes into a process for resilience testing.

Parameters

  • target - The process PID to crash
  • crash_spec - How to crash the process
  • opts - Options (:reason for crash reason, default: :chaos_injection)

Crash Specifications

  • :immediate - Crash immediately
  • {:after_ms, duration} - Crash after duration milliseconds
  • {:random, probability} - Crash with given probability (0.0 to 1.0)

Examples

# Immediate crash
inject_crash(worker_pid, :immediate)

# Delayed crash
inject_crash(worker_pid, {:after_ms, 100})

# Random crash (30% probability)
inject_crash(worker_pid, {:random, 0.3})

run_chaos_suite(target, scenarios, opts \\ [])

@spec run_chaos_suite(pid(), [map()], keyword()) :: chaos_suite_report()

Runs a comprehensive chaos testing suite.

Parameters

  • target - The target system (supervisor or process)
  • scenarios - List of chaos scenarios
  • opts - Options (:timeout for overall timeout)

Examples

scenarios = [
  %{type: :kill_children, kill_rate: 0.3, duration_ms: 1000},
  %{type: :kill_children, kill_rate: 0.5, duration_ms: 1000},
]

report = run_chaos_suite(supervisor, scenarios, timeout: 30_000)

assert report.passed == report.total_scenarios
assert report.failed == 0

simulate_resource_exhaustion(resource, opts \\ [])

@spec simulate_resource_exhaustion(
  atom(),
  keyword()
) :: {:ok, cleanup_fn :: (-> :ok)} | {:error, term()}

Simulates resource exhaustion scenarios.

Options

  • :percentage - Fraction of the resource limit to consume, expressed as a float (default: 0.8, i.e. 80%)
  • :spawn_count - Explicit number of processes/resources to spawn
  • :count - Number of resources to consume, for resource types that are not percentage-based

Examples

test "system handles process limit pressure" do
  {:ok, cleanup} = simulate_resource_exhaustion(:process_limit,
    percentage: 0.05,  # Use small percentage for tests
    spawn_count: 100
  )

  # Perform operations under pressure
  result = perform_critical_operation()

  # Cleanup
  cleanup.()

  # Verify graceful degradation
  assert result == :ok or match?({:error, _}, result)
end