Snakepit Telemetry Events

View Source

Comprehensive reference for all telemetry events emitted by Snakepit v0.6.0+.

Overview

Snakepit uses :telemetry for observability and monitoring. Events are emitted at key lifecycle points to enable:

  • Performance monitoring
  • Resource tracking
  • Worker health monitoring
  • Automatic recycling visibility
  • Custom metrics and alerts

Event List

Worker Lifecycle Events

[:snakepit, :worker, :recycled]

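Emitted when a worker is recycled, whether because its TTL expired, it reached its maximum request count, it crossed the memory threshold, it was recycled manually, or it died. The cause is reported in the `reason` metadata field.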

Measurements:

  • count: 1 - Always 1 per event

Metadata:

%{
  worker_id: "pool_worker_123",
  pool: :hpc_pool,
  reason: :ttl_expired | :max_requests | :memory_threshold | :manual | :worker_died,
  uptime_seconds: 3600,
  request_count: 1234
}

Example Handler:

# Logger's logging functions are macros, so the calling module or script
# must `require Logger` before using them.
require Logger

# Log every worker recycle together with its uptime and request count.
# The measurements map is ignored: binding `count` without using it would
# raise an "unused variable" compiler warning, and matching on it would make
# the handler fail if the key were ever absent.
:telemetry.attach(
  "worker-recycle-handler",
  [:snakepit, :worker, :recycled],
  fn _event, _measurements, metadata, _config ->
    Logger.info("Worker #{metadata.worker_id} recycled: #{metadata.reason}")
    Logger.info("  Uptime: #{metadata.uptime_seconds}s, Requests: #{metadata.request_count}")
  end,
  nil
)

[:snakepit, :worker, :health_check_failed]

Emitted when a worker fails a health check.

Measurements:

  • count: 1

Metadata:

%{
  worker_id: "pool_worker_123",
  pool: :hpc_pool,
  reason: :worker_dead | :health_check_failed | term()
}

[:snakepit, :worker, :started]

Emitted when a worker successfully starts (future enhancement).

Measurements:

  • count: 1
  • startup_time_ms: integer()

Metadata:

%{
  worker_id: "pool_worker_123",
  pool: :hpc_pool,
  profile: :process | :thread,
  capacity: 1 | 16  # Depends on profile
}

Pool and Request Events (Future)

[:snakepit, :pool, :saturated]

Emitted when pool reaches capacity and queues requests.

Measurements:

  • count: 1
  • queue_size: integer()

Metadata:

%{
  pool: :hpc_pool,
  available_workers: 0,
  busy_workers: 16,
  queue_size: 42
}

[:snakepit, :request, :executed]

Emitted after each request completes. The `success` metadata field indicates whether the request succeeded.

Measurements:

  • count: 1
  • duration_ms: integer()

Metadata:

%{
  pool: :hpc_pool,
  worker_id: "pool_worker_123",
  command: "compute_intensive",
  success: true | false
}

Usage Examples

Basic Monitoring

# Attach a single handler to the worker recycling and health-check events.
# Only the events listed here are delivered to the handler; add further
# event names to the list to observe them as well.
:telemetry.attach_many(
  "snakepit-monitor",
  [
    [:snakepit, :worker, :recycled],
    [:snakepit, :worker, :health_check_failed]
  ],
  &MyApp.TelemetryHandler.handle_event/4,
  nil
)

defmodule MyApp.TelemetryHandler do
  @moduledoc """
  Minimal telemetry handler that prints every event it receives.
  """

  @doc """
  Prints the event name, measurements and metadata as one tuple under the
  "Snakepit Event" label and returns that tuple. The handler config is ignored.
  """
  def handle_event(event, measurements, metadata, _config) do
    payload = {event, measurements, metadata}
    IO.inspect(payload, label: "Snakepit Event")
  end
end

Prometheus Metrics

# TelemetryMetricsPrometheus.Core is a Telemetry.Metrics reporter: it has no
# imperative "execute" call. Declare the metrics once, hand them to the
# reporter in your supervision tree, and it attaches the telemetry handlers
# and aggregates the values itself.
defmodule MyApp.SnakepitMetrics do
  import Telemetry.Metrics

  @doc "Metric definitions derived from the [:snakepit, :worker, :recycled] event."
  def metrics do
    [
      # Count worker recycling events by reason
      # (exported as snakepit_worker_recycled_count)
      counter("snakepit.worker.recycled.count",
        tags: [:reason, :pool],
        description: "Number of recycled workers"
      ),

      # Track worker uptime histogram. `uptime_seconds` is carried in the
      # event metadata rather than the measurements, so it is read with a
      # two-arity measurement function.
      distribution("snakepit.worker.recycled.uptime_seconds",
        measurement: fn _measurements, metadata -> metadata.uptime_seconds end,
        tags: [:pool],
        description: "Worker uptime at recycle time",
        # The Prometheus reporter requires explicit histogram buckets
        reporter_options: [buckets: [60, 300, 900, 3_600, 14_400, 86_400]]
      )
    ]
  end
end

# In your application's supervision tree
children = [
  {TelemetryMetricsPrometheus.Core, metrics: MyApp.SnakepitMetrics.metrics()}
]

LiveDashboard Integration

# In router.ex
# `:metrics` is the option LiveDashboard reads. It names a module that
# exports `metrics/0`; the metric definitions themselves do not belong in
# the router.
live_dashboard "/dashboard", metrics: MyApp.Telemetry

# In lib/my_app/telemetry.ex
defmodule MyApp.Telemetry do
  import Telemetry.Metrics

  def metrics do
    [
      # Snakepit metrics. Names follow "<event name>.<measurement>", so they
      # must match the events Snakepit actually emits.
      counter("snakepit.worker.recycled.count", tags: [:pool, :reason]),

      # uptime_seconds is event metadata, hence the two-arity measurement
      summary("snakepit.worker.recycled.uptime_seconds",
        measurement: fn _measurements, metadata -> metadata.uptime_seconds end,
        tags: [:pool]
      ),

      # Only populated once the (future) pool saturation event is emitted
      last_value("snakepit.pool.saturated.queue_size", tags: [:pool])

      # NOTE(review): the earlier version of this example also listed
      # last_value("snakepit.worker.count"), but no event documented on this
      # page carries a live worker count. Confirm the source event before
      # adding it back.
    ]
  end
end

Custom Alerts

# Alert if too many workers recycled in short time
#
# Telemetry handlers are stateless: the handler's return value is discarded
# and the config term given to attach/4 is passed unchanged to every call,
# so a running count cannot be threaded through the handler itself. The
# count is kept in an ETS table instead.
defmodule MyApp.RecycleAlert do
  require Logger

  @table :snakepit_recycle_counts
  @window_seconds 60
  @threshold 10

  @doc """
  Creates the counter table and attaches the handler.

  Call once at startup from a long-lived process: the table is destroyed when
  its owner exits. The table is `:public` because the handler runs in
  whichever process emits the event, not in the table owner.
  """
  def attach do
    :ets.new(@table, [:named_table, :public, write_concurrency: true])

    :telemetry.attach(
      "worker-recycle-alert",
      [:snakepit, :worker, :recycled],
      &__MODULE__.handle_event/4,
      nil
    )
  end

  @doc """
  Counts recycle events per fixed #{@window_seconds}-second window and alerts
  once more than #{@threshold} have been seen in the current window.

  Windows are fixed, not sliding, so a burst that straddles a window boundary
  is split across two counters.
  """
  def handle_event(_event, _measurements, metadata, _config) do
    window = div(System.system_time(:second), @window_seconds)

    # Atomically increment this window's counter, creating it at 0 if needed
    count = :ets.update_counter(@table, window, 1, {window, 0})

    # First event of a new window: drop counters left over from older windows
    # so the table never grows beyond a single row
    if count == 1 do
      :ets.select_delete(@table, [{{:"$1", :_}, [{:<, :"$1", window}], [true]}])
    end

    # Alert if > 10 recycling events in 60 seconds
    if count > @threshold do
      Logger.warning("High worker churn detected: #{count} workers recycled")
      # Send alert to monitoring system
      MyApp.Monitoring.send_alert(:high_worker_churn, metadata)
    end

    :ok
  end
end

Telemetry Best Practices

1. Attach Handlers Early

# In application.ex, before starting Snakepit
defmodule MyApp.Application do
  # Declares the Application behaviour and supplies the default stop/1
  # callback, which this module would otherwise be missing
  use Application

  @impl true
  def start(_type, _args) do
    # Attach telemetry handlers FIRST so events emitted while the children
    # below are starting are not missed
    MyApp.Telemetry.attach_handlers()

    children = [
      # ... other children
      # NOTE(review): this assumes Snakepit.Application exposes child_spec/1
      # and is meant to be supervised here. If Snakepit is started
      # automatically as an OTP dependency, it is already running before
      # this callback is invoked. Confirm against the Snakepit docs.
      {Snakepit.Application, []}
    ]

    Supervisor.start_link(children, strategy: :one_for_one)
  end
end

2. Use Structured Metadata

# Good: Structured data. The pool and worker are separate metadata keys, so
# a handler can pattern-match, filter, or tag on them directly
:telemetry.execute(
  [:myapp, :custom, :event],
  %{count: 1},
  %{pool: pool_name, worker: worker_id}
)

# Bad: Unstructured strings. The pool and worker are interpolated into a
# single message, so a handler would have to parse the string to recover them
:telemetry.execute(
  [:myapp, :custom, :event],
  %{count: 1},
  %{message: "Pool #{pool_name} worker #{worker_id}"}
)

3. Don't Block in Handlers

# NOTE(review): `event` is assumed to be bound to the event name being
# observed, e.g. [:snakepit, :worker, :recycled]. The two snippets below are
# alternatives that share the handler id "handler"; attach only one of them.

# Bad: Blocking I/O in handler. The HTTP request runs inside the handler
# body, so the handler does not return until the POST has finished
:telemetry.attach("handler", event, fn _event, _meas, meta, _cfg ->
  HTTPoison.post("http://metrics.example.com", Jason.encode!(meta))  # SLOW!
end, nil)

# Good: Async processing. The POST is handed to a separate process with
# Task.start/1, so the handler returns as soon as the task is spawned
:telemetry.attach("handler", event, fn _event, _meas, meta, _cfg ->
  Task.start(fn ->
    HTTPoison.post("http://metrics.example.com", Jason.encode!(meta))
  end)
end, nil)

Debugging Telemetry

See All Events

# Print every Snakepit event listed on this page as it is emitted
debug_events = [
  [:snakepit, :worker, :recycled],
  [:snakepit, :worker, :health_check_failed],
  [:snakepit, :worker, :started],
  [:snakepit, :pool, :saturated],
  [:snakepit, :request, :executed]
]

# Dumps the raw event triple; the handler config is not used
print_event = fn event, measurements, metadata, _config ->
  IO.inspect({event, measurements, metadata}, label: "Telemetry")
end

:telemetry.attach_many("debug-all", debug_events, print_event, nil)

Test Event Emission

# Emit a synthetic recycle event so attached handlers can be exercised
# without waiting for a real worker to be recycled
test_metadata = %{
  worker_id: "test_worker",
  pool: :test_pool,
  reason: :manual,
  uptime_seconds: 100,
  request_count: 50
}

:telemetry.execute([:snakepit, :worker, :recycled], %{count: 1}, test_metadata)

References