Instrumentation Guide

View Source

This guide covers best practices for instrumenting Erlang applications with the instrument library, providing observability through metrics, traces, and logs.

Table of Contents

  1. Overview
  2. Metrics Instrumentation
  3. Tracing Instrumentation
  4. Context Propagation
  5. Logger Integration
  6. OTel-Compatible API
  7. Best Practices
  8. Examples

Overview

The instrument library provides three pillars of observability:

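| Signal  | Purpose                          | Module                       |
|---------|----------------------------------|------------------------------|
| Metrics | Aggregate numerical measurements | instrument, instrument_meter |
| Traces  | Request flow across services     | instrument_tracer            |
| Logs    | Discrete events with context     | instrument_logger            |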

All three signals can be correlated using trace context (trace_id, span_id).

Metrics Instrumentation

Choosing the Right Metric Type

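| Metric    | Use When                | Examples                             |
|-----------|-------------------------|--------------------------------------|
| Counter   | Value only increases    | Requests, errors, bytes sent         |
| Gauge     | Value goes up and down  | Connections, queue size, temperature |
| Histogram | Measuring distributions | Latency, request size, response time |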

Counter Patterns

Total Counts

%% Application startup: register the labelled HTTP request counter.
%% Label order is method, path, status; callers must supply values in
%% the same order when incrementing.
init_metrics() ->
    Labels = [method, path, status],
    instrument_metric:new_counter_vec(http_requests_total, "Total HTTP requests", Labels).

%% In request handler: process the request and count it under the status
%% derived from the response. If anything in the protected body raises,
%% the request is counted under "500" and {error, internal_error} is returned.
handle_request(Method, Path, _Req) ->
    try
        Response = process_request(Method, Path),
        count_request(Method, Path, response_status(Response)),
        Response
    catch
        _:_ ->
            count_request(Method, Path, "500"),
            {error, internal_error}
    end.

%% Increment http_requests_total for one [Method, Path, Status] label set.
count_request(Method, Path, Status) ->
    instrument_metric:inc_counter_vec(http_requests_total, [Method, Path, Status]).

Error Tracking

%% Register the error counter, labelled by error type and originating module.
init_metrics() ->
    Labels = [type, module],
    instrument_metric:new_counter_vec(errors_total, "Total errors", Labels).

%% In error handler: count one error under its type and module.
%% Both label values are atoms on input and are converted to strings
%% before being handed to the counter.
log_error(Type, Module, _Reason) ->
    Labels = [atom_to_list(Label) || Label <- [Type, Module]],
    instrument_metric:inc_counter_vec(errors_total, Labels).

Gauge Patterns

Resource Monitoring

%% Register the resource-monitoring gauges: two plain gauges plus one
%% gauge vector labelled by pool name and connection state.
init_metrics() ->
    instrument_metric:new_gauge(memory_usage_bytes, "Current memory usage"),
    instrument_metric:new_gauge(process_count, "Number of processes"),
    PoolLabels = [pool, state],
    instrument_metric:new_gauge_vec(pool_connections, "Pool connections", PoolLabels).

%% Periodic update (e.g., driven by a timer in a gen_server): publish the
%% VM's total allocated memory and its current process count.
update_system_metrics() ->
    TotalBytes = erlang:memory(total),
    instrument_metric:set_gauge(memory_usage_bytes, TotalBytes),
    ProcessCount = erlang:system_info(process_count),
    instrument_metric:set_gauge(process_count, ProcessCount).

%% Publish the active and idle connection counts for one pool.
%% The second label value is the connection state ("active" | "idle").
update_pool_metrics(PoolName, Active, Idle) ->
    SetState = fun(State, Count) ->
        instrument_metric:set_gauge_vec(pool_connections, [PoolName, State], Count)
    end,
    SetState("active", Active),
    SetState("idle", Idle).

In-Flight Tracking

%% Register the gauge that tracks how many requests are currently being handled.
init_metrics() ->
    Help = "Current in-flight requests",
    instrument_metric:new_gauge(requests_in_flight, Help).

%% Track the request as in-flight for exactly the duration of processing.
%% The decrement lives in `after` so it runs whether process_request/1
%% returns or raises.
handle_request(Req) ->
    Gauge = requests_in_flight,
    instrument_metric:inc_gauge(Gauge),
    try
        process_request(Req)
    after
        instrument_metric:dec_gauge(Gauge)
    end.

Histogram Patterns

Latency Measurement

%% Register the request-latency histogram, labelled by method and path.
init_metrics() ->
    %% Bucket upper bounds in seconds; pick values that bracket your SLOs.
    Buckets = [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
    instrument_metric:new_histogram_vec(
        http_request_duration_seconds,
        "HTTP request duration",
        [method, path],
        Buckets
    ).

%% Process the request and record how long it took, in seconds.
%% The observation is made in `after`, so failed requests are timed too.
%% Monotonic time is used so wall-clock adjustments cannot skew durations.
handle_request(Method, Path, Req) ->
    StartedAt = erlang:monotonic_time(),
    try
        process_request(Req)
    after
        ElapsedNative = erlang:monotonic_time() - StartedAt,
        ElapsedMicros = erlang:convert_time_unit(ElapsedNative, native, microsecond),
        instrument_metric:observe_histogram_vec(
            http_request_duration_seconds,
            [Method, Path],
            %% Microseconds -> seconds
            ElapsedMicros / 1_000_000
        )
    end.

Size Distribution

%% Register the response-size histogram, labelled by method and path.
init_metrics() ->
    %% Bucket upper bounds in bytes, one per decade from 100 B to 1 MB.
    Buckets = [100, 1000, 10000, 100000, 1000000],
    instrument_metric:new_histogram_vec(
        http_response_size_bytes,
        "HTTP response size",
        [method, path],
        Buckets
    ).

%% Record the size in bytes of the outgoing body, then return the body
%% unchanged so the call can be dropped into an existing response pipeline.
%% Body must be a binary (byte_size/1 is applied to it).
send_response(Method, Path, Body) ->
    instrument_metric:observe_histogram_vec(
        http_response_size_bytes,
        [Method, Path],
        byte_size(Body)
    ),
    Body.

Tracing Instrumentation

Span Naming Conventions

Use descriptive, low-cardinality names:

%% Good: Operation-focused names
instrument_tracer:with_span(<<"HTTP GET /api/users">>, fun() -> ... end).
instrument_tracer:with_span(<<"db.query">>, fun() -> ... end).
instrument_tracer:with_span(<<"cache.get">>, fun() -> ... end).

%% Bad: High-cardinality names (avoid!)
instrument_tracer:with_span(<<"GET /api/users/12345">>, fun() -> ... end).  %% User ID in name

Span Kinds

Set the appropriate span kind for proper visualization:

%% Server span: handling an incoming request
instrument_tracer:with_span(<<"handle_request">>, #{kind => server}, fun() ->
    process_request(Req)
end).

%% Client span: making an outgoing request
instrument_tracer:with_span(<<"http.request">>, #{kind => client}, fun() ->
    httpc:request(Url)
end).

%% Internal span: internal processing (default)
instrument_tracer:with_span(<<"validate_input">>, #{kind => internal}, fun() ->
    validate(Input)
end).

%% Producer span: producing a message
instrument_tracer:with_span(<<"send_message">>, #{kind => producer}, fun() ->
    send_to_queue(Message)
end).

%% Consumer span: consuming a message
instrument_tracer:with_span(<<"process_message">>, #{kind => consumer}, fun() ->
    handle_message(Message)
end).

Adding Indexable Attributes

Span attributes are indexed metadata that observability backends use for filtering, grouping, and querying. Use semantic conventions for consistent attributes:

instrument_tracer:with_span(<<"http.request">>, #{kind => server}, fun() ->
    %% HTTP attributes - indexed for filtering/querying
    instrument_tracer:set_attributes(#{
        <<"http.method">> => Method,          %% Filter by GET/POST/etc.
        <<"http.url">> => Url,
        <<"http.target">> => Path,            %% Group latencies by endpoint
        <<"http.host">> => Host,
        <<"http.scheme">> => <<"https">>,
        <<"http.user_agent">> => UserAgent
    }),

    Response = handle(Req),

    %% Add response attributes - enables status code filtering
    instrument_tracer:set_attributes(#{
        <<"http.status_code">> => StatusCode,
        <<"http.response_content_length">> => ContentLength
    }),

    Response
end).

Attribute Types and Indexing

Backends index attributes by type, enabling different query operations:

instrument_tracer:set_attributes(#{
    %% String attributes - exact match, contains, regex
    <<"user.id">> => <<"user-12345">>,
    <<"customer.tier">> => <<"enterprise">>,

    %% Numeric attributes - range queries, aggregations
    <<"order.total">> => 299.99,
    <<"retry.count">> => 3,

    %% Boolean attributes - binary filtering
    <<"cache.hit">> => true,
    <<"auth.mfa_used">> => false
}).

Example queries in observability backends:

  • customer.tier = "enterprise" AND order.total > 100
  • http.status_code >= 500
  • cache.hit = false AND duration > 1s

Recording Errors

instrument_tracer:with_span(<<"process_order">>, fun() ->
    try
        Result = do_process(Order),
        instrument_tracer:set_status(ok),
        Result
    catch
        error:Reason:Stacktrace ->
            %% Record exception details
            instrument_tracer:record_exception(Reason, #{
                stacktrace => Stacktrace
            }),
            instrument_tracer:set_status(error, <<"Order processing failed">>),
            {error, Reason}
    end
end).

Adding Events

Events are timestamped annotations within a span:

instrument_tracer:with_span(<<"process_order">>, fun() ->
    instrument_tracer:add_event(<<"order.received">>, #{
        <<"order.id">> => OrderId
    }),

    ValidatedOrder = validate(Order),
    instrument_tracer:add_event(<<"order.validated">>),

    ProcessedOrder = process(ValidatedOrder),
    instrument_tracer:add_event(<<"order.processed">>),

    save(ProcessedOrder),
    instrument_tracer:add_event(<<"order.saved">>),

    ProcessedOrder
end).

Manual Span Management

When with_span doesn't fit your control flow:

%% Start a span in the calling process and finish it in a spawned worker.
%% Returns the pid of the worker.
handle_async_operation(Data) ->
    %% Start span manually, tagging it with the payload size
    Attributes = #{<<"data.size">> => byte_size(Data)},
    Span = instrument_tracer:start_span(<<"async_operation">>, #{attributes => Attributes}),

    %% Capture the span context so it can be handed to the worker
    SpanCtx = instrument_tracer:span_ctx(Span),

    spawn(fun() -> run_async_operation(Span, SpanCtx, Data) end).

%% Worker body: restore the span context in this process, do the work,
%% record the outcome on the span, and always end the span.
run_async_operation(Span, SpanCtx, Data) ->
    EmptyCtx = instrument_context:new(),
    instrument_context:attach(instrument_context:set_value(EmptyCtx, span_ctx, SpanCtx)),

    try
        do_async_work(Data),
        instrument_tracer:set_status(ok)
    catch
        _:Reason ->
            instrument_tracer:record_exception(Reason),
            instrument_tracer:set_status(error)
    after
        %% Runs on both success and failure so the span is never left open
        instrument_tracer:end_span(Span)
    end.

Context Propagation

Within the Same Process

Context is automatically propagated through the process dictionary:

instrument_tracer:with_span(<<"parent">>, fun() ->
    %% Child span automatically has parent context
    instrument_tracer:with_span(<<"child">>, fun() ->
        %% Both share the same trace_id
        do_work()
    end)
end).

Across Process Boundaries

Use propagation helpers to maintain trace context:

%% Spawning with context
instrument_tracer:with_span(<<"coordinator">>, fun() ->
    %% Context automatically propagated
    Pid = instrument_propagation:spawn(fun() ->
        %% This process has the trace context
        instrument_tracer:with_span(<<"worker">>, fun() ->
            do_work()
        end)
    end),

    %% Or spawn_link
    Pid2 = instrument_propagation:spawn_link(fun() ->
        process_task()
    end)
end).

Across Service Boundaries (HTTP)

Injecting Context (Client Side)

%% Issue an outgoing HTTP request inside a client span, with the current
%% trace context injected into the request headers. Returns whatever
%% httpc:request/4 returns.
make_request(Method, Url, Body) ->
    SendRequest = fun() ->
        instrument_tracer:set_attributes(#{
            <<"http.method">> => Method,
            <<"http.url">> => Url
        }),

        %% Headers carrying the trace context of the span opened below
        Headers = instrument_propagation:inject_headers(instrument_context:current()),

        httpc:request(Method, {Url, Headers, "application/json", Body}, [], [])
    end,
    instrument_tracer:with_span(<<"http.request">>, #{kind => client}, SendRequest).

Extracting Context (Server Side)

%% Continue an upstream trace: extract the context carried by the incoming
%% headers, attach it to this process, then handle the body in a server span.
handle_request(Headers, Body) ->
    instrument_context:attach(instrument_propagation:extract_headers(Headers)),

    SpanOpts = #{kind => server},
    instrument_tracer:with_span(<<"handle_request">>, SpanOpts, fun() -> process(Body) end).

Baggage Propagation

Propagate key-value pairs across service boundaries:

%% Set baggage (propagates to downstream services)
instrument_baggage:set(<<"user.id">>, UserId),
instrument_baggage:set(<<"request.id">>, RequestId),

%% Read baggage (from upstream services). A fresh variable name is used so
%% this is a plain binding; reusing UserId here would turn the line into a
%% match assertion against the value set above.
UpstreamUserId = instrument_baggage:get(<<"user.id">>),

%% Baggage is automatically included in propagation headers
Headers = instrument_propagation:inject_headers(instrument_context:current()).

Logger Integration

Installation

%% In your application start callback (application behaviour, start/2).
%% Returns the result of starting the top-level supervisor.
start(_StartType, _StartArgs) ->
    %% Install logger filter for automatic trace context.
    %% Done before the supervisor starts so it is in place before any
    %% supervised process begins logging.
    instrument_logger:install(),

    %% Your supervisor start
    my_app_sup:start_link().

Automatic Trace Correlation

Once installed, all logger calls within spans automatically include trace context:

instrument_tracer:with_span(<<"process_order">>, fun() ->
    %% These logs automatically include trace_id and span_id
    logger:info("Starting order processing"),
    logger:info("Order validated", #{order_id => OrderId}),
    logger:warning("Inventory low", #{product_id => ProductId}),

    process_order(Order)
end).

Log output includes trace context:

2024-01-15T10:30:45.123Z [INFO] [trace_id=abc123... span_id=def456...] Starting order processing

Manual Context Addition

If you need to add context manually:

Meta = instrument_logger:add_trace_context(#{custom_field => value}),
logger:info("Message", Meta).

OTel-Compatible API

For OpenTelemetry-style instrumentation:

Meter API

%% Get a meter
Meter = instrument_meter:get_meter(<<"my_service">>, #{
    version => <<"1.0.0">>
}),

%% Create instruments
Counter = instrument_meter:create_counter(Meter, <<"requests_total">>, #{
    description => <<"Total requests processed">>,
    unit => <<"1">>
}),

Histogram = instrument_meter:create_histogram(Meter, <<"request_duration">>, #{
    description => <<"Request duration">>,
    unit => <<"s">>,
    boundaries => [0.01, 0.05, 0.1, 0.5, 1.0, 5.0]
}),

Gauge = instrument_meter:create_gauge(Meter, <<"queue_size">>, #{
    description => <<"Current queue size">>,
    unit => <<"1">>
}),

%% Record measurements with attributes
instrument_meter:add(Counter, 1, #{method => <<"GET">>, status => 200}),
instrument_meter:record(Histogram, 0.125, #{endpoint => <<"/api/users">>}),
instrument_meter:set(Gauge, 42, #{queue => <<"default">>}).

Tracer API

%% Get a tracer
Tracer = instrument_tracer:get_tracer(<<"my_service">>, #{
    version => <<"1.0.0">>
}),

%% Create spans with full control
Span = instrument_tracer:start_span(<<"operation">>, #{
    kind => server,
    attributes => #{<<"key">> => <<"value">>},
    links => [PreviousSpanCtx]
}),

%% Modify span
instrument_tracer:set_attribute(<<"result">>, <<"success">>),
instrument_tracer:add_event(<<"checkpoint_reached">>),

%% End span
instrument_tracer:end_span(Span).

Best Practices

Metric Naming

%% Use snake_case with unit suffix
http_request_duration_seconds    %% Good
http_requests_total              %% Good
httpRequestDuration              %% Bad (camelCase)
http_request_duration            %% Bad (missing unit)

%% Use _total suffix for counters
requests_total                   %% Good
request_count                    %% Avoid

Label Cardinality

%% Good: Low cardinality labels
[method, status_code, endpoint_pattern]

%% Bad: High cardinality labels (causes memory issues!)
[user_id, request_id, timestamp]  %% Millions of unique combinations!

Span Granularity

%% Good: Meaningful operations
instrument_tracer:with_span(<<"db.query">>, fun() -> ... end).
instrument_tracer:with_span(<<"cache.get">>, fun() -> ... end).
instrument_tracer:with_span(<<"http.request">>, fun() -> ... end).

%% Bad: Too granular
instrument_tracer:with_span(<<"parse_integer">>, fun() -> ... end).
instrument_tracer:with_span(<<"string_concat">>, fun() -> ... end).

Error Handling

%% Always set status and record exceptions
instrument_tracer:with_span(<<"operation">>, fun() ->
    try
        Result = risky_operation(),
        instrument_tracer:set_status(ok),
        Result
    catch
        Class:Reason:Stacktrace ->
            instrument_tracer:record_exception(Reason, #{stacktrace => Stacktrace}),
            instrument_tracer:set_status(error, format_error(Class, Reason)),
            erlang:raise(Class, Reason, Stacktrace)
    end
end).

Examples

Complete HTTP Server Instrumentation

-module(my_http_handler).
-export([init/0, handle/2]).

%% One-time setup: register the handler's metrics and hook trace context
%% into the logger. Returns the result of instrument_logger:install/0.
init() ->
    %% Request counter, latency histogram (bucket bounds in seconds), in-flight gauge
    LatencyBuckets = [0.001, 0.01, 0.1, 1.0, 10.0],
    instrument_metric:new_counter_vec(http_requests_total, "HTTP requests", [method, path, status]),
    instrument_metric:new_histogram_vec(
        http_request_duration_seconds,
        "Request duration",
        [method, path],
        LatencyBuckets
    ),
    instrument_metric:new_gauge(http_requests_in_flight, "In-flight requests"),

    %% Logger integration for trace-correlated log lines
    instrument_logger:install().

%% Handle one HTTP request with full instrumentation.
%%
%% Req must be a map with at least `method`, `path` and `headers` keys.
%% Returns {Status, Body, State}; any exception raised while processing is
%% converted into a 500 response after being recorded on the span.
handle(#{method := Method, path := Path, headers := Headers} = Req, State) ->
    %% Continue the caller's trace, if the headers carry one
    Ctx = instrument_propagation:extract_headers(Headers),
    instrument_context:attach(Ctx),

    %% Track in-flight; the matching decrement is in the `after` below
    instrument_metric:inc_gauge(http_requests_in_flight),
    Start = erlang:monotonic_time(),

    instrument_tracer:with_span(<<"http.request">>, #{kind => server}, fun() ->
        instrument_tracer:set_attributes(#{
            <<"http.method">> => Method,
            <<"http.target">> => Path
        }),

        try
            {Status, Body} = process_request(Req),

            %% Record success metrics
            record_request(Method, Path, integer_to_list(Status), Start),

            instrument_tracer:set_attributes(#{<<"http.status_code">> => Status}),
            instrument_tracer:set_status(ok),

            {Status, Body, State}
        catch
            _:Reason:Stack ->
                %% Metric recording goes through record_request/4 in both
                %% branches: a variable bound in the try body cannot be
                %% rebound in a catch clause (the compiler reports it as
                %% unsafe), so no shared local is used.
                record_request(Method, Path, "500", Start),

                instrument_tracer:record_exception(Reason, #{stacktrace => Stack}),
                instrument_tracer:set_status(error, <<"Internal server error">>),

                {500, <<"Internal Server Error">>, State}
        after
            instrument_metric:dec_gauge(http_requests_in_flight)
        end
    end).

%% Count the request under StatusLabel and observe its latency, measured
%% from Start (an erlang:monotonic_time/0 reading) to now.
record_request(Method, Path, StatusLabel, Start) ->
    Duration = duration_seconds(Start),
    instrument_metric:inc_counter_vec(http_requests_total, [Method, Path, StatusLabel]),
    instrument_metric:observe_histogram_vec(http_request_duration_seconds, [Method, Path], Duration).

%% Seconds elapsed since Start, where Start is an erlang:monotonic_time/0
%% reading in native units. The result is a float with microsecond resolution.
duration_seconds(Start) ->
    ElapsedNative = erlang:monotonic_time() - Start,
    ElapsedMicros = erlang:convert_time_unit(ElapsedNative, native, microsecond),
    ElapsedMicros / 1_000_000.

%% Route a request by path. Returns {Status, Body}.
%% Only the path is inspected, so the request map is matched but not bound
%% (binding it to an unused variable produces a compiler warning).
process_request(#{path := <<"/api/users">>}) ->
    instrument_tracer:with_span(<<"fetch_users">>, fun() ->
        Users = db_fetch_users(),
        {200, encode_json(Users)}
    end);
%% Any other path is a defined "not found" case.
process_request(_) ->
    {404, <<"Not Found">>}.

Database Client Instrumentation

-module(my_db).
-export([query/2, transaction/1]).

%% Run a SQL query inside a client span.
%%
%% On success, observes the query latency and returns the pgsql:query/2
%% result. On failure, counts the error, records the exception on the
%% span, and re-raises it unchanged.
query(SQL, Params) ->
    instrument_tracer:with_span(<<"db.query">>, #{kind => client}, fun() ->
        instrument_tracer:set_attributes(#{
            <<"db.system">> => <<"postgresql">>,
            <<"db.statement">> => sanitize_sql(SQL),
            <<"db.operation">> => extract_operation(SQL)
        }),

        Start = erlang:monotonic_time(),
        try
            Result = pgsql:query(SQL, Params),
            instrument_metric:observe_histogram(db_query_duration_seconds, duration_seconds(Start)),
            instrument_tracer:set_status(ok),
            Result
        catch
            Class:Reason:Stack ->
                instrument_metric:inc_counter(db_errors_total),
                instrument_tracer:record_exception(Reason, #{stacktrace => Stack}),
                instrument_tracer:set_status(error, <<"Query failed">>),
                %% Re-raise with the original class and stacktrace;
                %% error(Reason) would turn throws/exits into errors and
                %% replace the stacktrace with one pointing here.
                erlang:raise(Class, Reason, Stack)
        end
    end).

%% Seconds elapsed since Start (an erlang:monotonic_time/0 reading), as a float.
%% Defined locally because this module cannot reach another module's
%% unexported helper.
duration_seconds(Start) ->
    erlang:convert_time_unit(erlang:monotonic_time() - Start, native, microsecond) / 1_000_000.

%% Run Fun (a zero-arity fun) inside a database transaction, wrapped in a
%% client span. Returns whatever pgsql:transaction/1 returns.
transaction(Fun) ->
    InTransaction = fun() -> Fun() end,
    RunTransaction = fun() -> pgsql:transaction(InTransaction) end,
    instrument_tracer:with_span(<<"db.transaction">>, #{kind => client}, RunTransaction).

Message Queue Consumer

-module(my_consumer).
-export([handle_message/1]).

%% Consume one queue message inside a consumer span.
%%
%% The message map must carry `headers`, `body` and `queue`. Returns the
%% result of process_message_body/1, or {error, Reason} if it raises.
handle_message(#{headers := Headers, body := Body, queue := Queue}) ->
    %% Continue the producer's trace from the message headers
    instrument_context:attach(instrument_propagation:extract_headers(Headers)),

    instrument_tracer:with_span(<<"process_message">>, #{kind => consumer}, fun() ->
        instrument_tracer:set_attributes(#{
            <<"messaging.system">> => <<"rabbitmq">>,
            <<"messaging.destination">> => Queue,
            <<"messaging.operation">> => <<"process">>
        }),

        try
            Result = process_message_body(Body),
            count_message(Queue, "success"),
            instrument_tracer:set_status(ok),
            Result
        catch
            _:Reason:Stack ->
                count_message(Queue, "error"),
                instrument_tracer:record_exception(Reason, #{stacktrace => Stack}),
                instrument_tracer:set_status(error),
                {error, Reason}
        end
    end).

%% Increment messages_processed_total for the given queue and outcome label.
count_message(Queue, Outcome) ->
    instrument_metric:inc_counter_vec(messages_processed_total, [Queue, Outcome]).

Summary

  1. Metrics: Use counters for totals, gauges for current state, histograms for distributions
  2. Traces: Wrap operations in spans, set meaningful attributes, record errors
  3. Context: Propagate across processes and services using propagation helpers
  4. Logs: Install logger integration for automatic trace correlation
  5. Best Practices: Low cardinality labels, meaningful span names, proper error handling