SnmpKit.SnmpLib.Monitor (snmpkit v0.6.4)

Performance monitoring and metrics collection for SNMP operations.

This module provides comprehensive monitoring capabilities for SNMP applications, including real-time metrics, performance analytics, and health monitoring. It is based on monitoring patterns proven in large-scale network management systems.

Features

Real-time Metrics: Live performance data collection and analysis
Historical Analytics: Trend analysis and capacity planning data
Health Monitoring: Automatic detection of performance degradation
Alerting: Configurable thresholds and notification system
Device Profiling: Per-device performance characteristics
Operation Tracking: Detailed metrics for all SNMP operation types

Metric Categories

Operation Metrics

Request/response times
Success/failure rates
Throughput measurements
Error classifications

Device Metrics

Per-device response characteristics
Availability percentages
Performance trends
Health scores

System Metrics

Connection pool utilization
Memory usage patterns
Resource consumption
Concurrent operation counts

Usage Examples

# Start monitoring system
{:ok, _pid} = SnmpKit.SnmpLib.Monitor.start_link()

# Record SNMP operation
SnmpKit.SnmpLib.Monitor.record_operation(%{
  device: "192.168.1.1",
  operation: :get,
  duration: 245,
  result: :success
})

# Get real-time stats
stats = SnmpKit.SnmpLib.Monitor.get_device_stats("192.168.1.1")
IO.puts("Average response time: " <> to_string(stats.avg_response_time) <> "ms")

# Set up alerting
SnmpKit.SnmpLib.Monitor.set_alert_threshold("192.168.1.1", :response_time, 5000)

Summary

Types

alert_threshold()

device_id()

device_stats()

metric_type()

operation_metric()

operation_result()

operation_type()

system_stats()

Functions

child_spec(init_arg)

Returns a specification to start this module under a supervisor.

export_data(format, timeframe \\ :last_hour)

Exports monitoring data for external analysis.

get_active_alerts()

Gets currently active alerts.

get_device_stats(device_id, timeframe \\ :all_time)

Gets comprehensive statistics for a specific device.

get_operation_metrics(operation, timeframe \\ :last_hour)

Gets performance metrics for a specific operation type.

get_system_stats()

Gets system-wide statistics and performance metrics.

health_check()

Forces a health check of all monitored devices.

record_operation(metric)

Records an SNMP operation for monitoring and analysis.

remove_alert_threshold(device_id, metric)

Removes an alert threshold.

set_alert_threshold(device_id, metric, threshold, opts \\ [])

Sets an alert threshold for automated monitoring.

start_link(opts \\ [])

Starts the monitoring system.

Types

alert_threshold()

@type alert_threshold() :: %{
  device_id: device_id(),
  metric: metric_type(),
  threshold: number(),
  condition: :above | :below,
  duration: pos_integer(),
  callback: function() | nil
}

device_id()

@type device_id() :: binary()

device_stats()

@type device_stats() :: %{
  device_id: device_id(),
  total_operations: non_neg_integer(),
  successful_operations: non_neg_integer(),
  failed_operations: non_neg_integer(),
  avg_response_time: float(),
  p95_response_time: float(),
  p99_response_time: float(),
  error_rate: float(),
  availability: float(),
  health_score: float(),
  last_seen: integer(),
  trend: :improving | :stable | :degrading
}

metric_type()

@type metric_type() :: :response_time | :error_rate | :throughput | :availability

operation_metric()

@type operation_metric() :: %{
  device: device_id(),
  operation: operation_type(),
  timestamp: integer(),
  duration: non_neg_integer(),
  result: operation_result(),
  error_type: atom() | nil,
  bytes_sent: non_neg_integer() | nil,
  bytes_received: non_neg_integer() | nil
}

operation_result()

@type operation_result() :: :success | :error | :timeout | :partial

operation_type()

@type operation_type() :: :get | :get_next | :get_bulk | :set | :walk

system_stats()

@type system_stats() :: %{
  total_devices: non_neg_integer(),
  active_devices: non_neg_integer(),
  total_operations: non_neg_integer(),
  operations_per_second: float(),
  average_response_time: float(),
  global_error_rate: float(),
  memory_usage: non_neg_integer(),
  uptime: non_neg_integer()
}

Functions

child_spec(init_arg)

Returns a specification to start this module under a supervisor.

See Supervisor.

export_data(format, timeframe \\ :last_hour)

@spec export_data(atom(), atom()) :: binary()

Exports monitoring data for external analysis.

Parameters

format: Export format (:json, :csv, :prometheus)
timeframe: Time range for export

JSON Export

JSON export uses Elixir's built-in JSON module (requires Elixir 1.18+).

Examples

data = SnmpKit.SnmpLib.Monitor.export_data(:json, :last_hour)
case data do
  "JSON export unavailable" <> _ -> IO.puts("JSON not available")
  json -> File.write!("snmp_metrics.json", json)
end

get_active_alerts()

@spec get_active_alerts() :: [map()]

Gets currently active alerts.

Examples

alerts = SnmpKit.SnmpLib.Monitor.get_active_alerts()
Enum.each(alerts, fn alert ->
  IO.puts("Alert: " <> alert.device_id <> " " <> to_string(alert.metric) <> " " <> to_string(alert.current_value))
end)

get_device_stats(device_id, timeframe \\ :all_time)

@spec get_device_stats(device_id(), atom()) :: device_stats() | {:error, :not_found}

Gets comprehensive statistics for a specific device.

Parameters

device_id: Device identifier
timeframe: Optional timeframe (:last_hour, :last_day, :all_time)

Returns

Device statistics map or {:error, :not_found} if device has no recorded operations.

Examples

# Get current device stats
stats = SnmpKit.SnmpLib.Monitor.get_device_stats("192.168.1.1")
IO.puts("Error rate: " <> to_string(stats.error_rate) <> "%")

# Get stats for specific timeframe
stats = SnmpKit.SnmpLib.Monitor.get_device_stats("192.168.1.1", :last_hour)

get_operation_metrics(operation, timeframe \\ :last_hour)

@spec get_operation_metrics(operation_type(), atom()) :: map()

Gets performance metrics for a specific operation type.

Parameters

operation: SNMP operation type
timeframe: Optional timeframe for analysis

Examples

metrics = SnmpKit.SnmpLib.Monitor.get_operation_metrics(:get_bulk)
IO.puts("Average GETBULK time: " <> to_string(metrics.avg_duration) <> "ms")

get_system_stats()

@spec get_system_stats() :: system_stats()

Gets system-wide statistics and performance metrics.

Returns

Comprehensive system statistics including global performance metrics, device counts, and resource utilization.

Examples

stats = SnmpKit.SnmpLib.Monitor.get_system_stats()
IO.puts("Total devices monitored: " <> to_string(stats.total_devices))
IO.puts("Operations per second: " <> to_string(stats.operations_per_second))

health_check()

@spec health_check() :: :ok

Forces a health check of all monitored devices.

Useful for immediate assessment of system health.

Examples

:ok = SnmpKit.SnmpLib.Monitor.health_check()

record_operation(metric)

@spec record_operation(map()) :: :ok

Records an SNMP operation for monitoring and analysis.

This is the primary interface for feeding operation data into the monitoring system. Should be called after every SNMP operation for comprehensive monitoring.

Parameters

metric: Operation metric map with required fields

Required Fields

device: Target device identifier
operation: Type of SNMP operation
duration: Operation duration in milliseconds
result: Operation result status

Optional Fields

error_type: Specific error classification (if result is :error)
bytes_sent: Number of bytes sent
bytes_received: Number of bytes received
timestamp: Override timestamp (defaults to current time)

Examples

# Basic operation recording
SnmpKit.SnmpLib.Monitor.record_operation(%{
  device: "192.168.1.1",
  operation: :get,
  duration: 245,
  result: :success
})

# Detailed operation recording
SnmpKit.SnmpLib.Monitor.record_operation(%{
  device: "192.168.1.1",
  operation: :get_bulk,
  duration: 1250,
  result: :error,
  error_type: :timeout,
  bytes_sent: 64,
  bytes_received: 0
})

remove_alert_threshold(device_id, metric)

@spec remove_alert_threshold(device_id(), metric_type()) :: :ok

Removes an alert threshold.

Examples

:ok = SnmpKit.SnmpLib.Monitor.remove_alert_threshold("192.168.1.1", :response_time)

set_alert_threshold(device_id, metric, threshold, opts \\ [])

@spec set_alert_threshold(device_id(), metric_type(), number(), keyword()) :: :ok

Sets an alert threshold for automated monitoring.

Alerts fire when the specified metric stays above or below the threshold (depending on the condition option) for the given duration.

Parameters

device_id: Device to monitor (use ":global" for system-wide alerts)
metric: Metric type to monitor
threshold: Threshold value
opts: Alert configuration options

Options

condition: :above or :below (default: :above)
duration: How long, in milliseconds, the condition must hold before the alert fires (default: 60000)
callback: Function to call when alert fires

Examples

# Alert on high response times
SnmpKit.SnmpLib.Monitor.set_alert_threshold("192.168.1.1", :response_time, 5000)

# Alert on low availability with custom callback
SnmpKit.SnmpLib.Monitor.set_alert_threshold("core-router", :availability, 95.0,
  condition: :below,
  duration: 300_000,
  callback: &MyApp.Alerts.device_down/1
)

start_link(opts \\ [])

@spec start_link(keyword()) :: {:ok, pid()} | {:error, any()}

Starts the monitoring system.

Options

retention_period: How long to keep historical data, in milliseconds (default: 1 hour)
bucket_size: Time bucket size for aggregation, in milliseconds (default: 1 minute)
cleanup_interval: How often to clean old data (default: 5 minutes)
health_check_interval: How often to check device health (default: 1 minute)

Examples

{:ok, pid} = SnmpKit.SnmpLib.Monitor.start_link()

{:ok, pid} = SnmpKit.SnmpLib.Monitor.start_link(
  retention_period: 7_200_000, # 2 hours
  bucket_size: 30_000          # 30 second buckets
)