Client Tracing Guide

View Source

Generic utilities for tracing client operations: databases, HTTP clients, message queues, RPC calls, and more.

Overview

The instrument_client module provides reusable utilities for creating client-kind spans following OpenTelemetry semantic conventions. It works with any client operation type and automatically sets appropriate attributes based on the system type.

Key features:

  • Generic client spans - Works for databases, HTTP, messaging, RPC
  • Text sanitization - Remove sensitive data from queries and URLs
  • Trace context injection - SQLCommenter-style trace propagation
  • Resource pool monitoring - Track pool checkout/checkin
  • Attribute-based sampling - Fine-grained sampling control

Client Span Helpers

Basic Usage

%% Simple client span
instrument_client:with_client_span(postgresql, <<"SELECT">>, fun() ->
    epgsql:equery(Conn, SQL, Params)
end).

%% With options
instrument_client:with_client_span(postgresql, <<"SELECT">>, #{
    target => <<"users">>,
    statement => <<"SELECT * FROM users WHERE id = $1">>,
    sanitize => true,
    attributes => #{<<"db.name">> => <<"mydb">>}
}, fun() ->
    epgsql:equery(Conn, SQL, Params)
end).

Manual Span Control

When you need more control over span lifecycle:

Span = instrument_client:start_client_span(redis, <<"GET">>, #{
    target => <<"session:abc">>
}),
try
    Result = eredis:q(Conn, ["GET", "session:abc"]),
    instrument_tracer:set_status(ok),
    Result
catch
    Class:Reason:Stacktrace ->
        instrument_tracer:set_status(error, format_error(Reason)),
        erlang:raise(Class, Reason, Stacktrace)
after
    instrument_tracer:end_span(Span)
end.

Span Naming

Span names follow semantic conventions: system operation [target]

%% Without target: "postgresql SELECT"
instrument_client:client_span_name(postgresql, <<"SELECT">>).

%% With target: "postgresql SELECT users"
instrument_client:client_span_name(postgresql, <<"SELECT">>, #{target => <<"users">>}).

Text Sanitization

Remove sensitive data from queries before including in spans.

Basic Sanitization

%% SQL: Replace string literals and numbers
instrument_client:sanitize_text(<<"SELECT * FROM users WHERE email = 'john@secret.com' AND id = 123">>).
%% => <<"SELECT * FROM users WHERE email = ? AND id = ?">>

Custom Patterns

%% URL path sanitization
instrument_client:sanitize_text(<<"/users/12345/orders">>, #{
    patterns => [<<"/\\d+">>],
    placeholder => <<"/:id">>
}).
%% => <<"/users/:id/orders">>

%% Preserve PostgreSQL placeholders
instrument_client:sanitize_text(
    <<"SELECT * FROM users WHERE id = $1 AND name = 'John'">>,
    #{preserve => [<<"\\$\\d+">>]}
).
%% => <<"SELECT * FROM users WHERE id = $1 AND name = ?">>

Custom Placeholder

instrument_client:sanitize_text(<<"WHERE x = 'secret'">>, #{
    placeholder => <<"<REDACTED>">>
}).
%% => <<"WHERE x = <REDACTED>">>

Trace Context Propagation

Inject trace context into queries for database log correlation (SQLCommenter-style).

SQL Comment Format

instrument_tracer:with_span(<<"db_query">>, fun() ->
    SQL = <<"SELECT * FROM users">>,
    TracedSQL = instrument_client:inject_trace_comment(SQL),
    %% => <<"SELECT * FROM users /*traceparent='00-abc...-def...-01'*/">>
    epgsql:squery(Conn, TracedSQL)
end).

URL Query Parameter Format

URL = <<"/api/data">>,
TracedURL = instrument_client:inject_trace_comment(URL, #{format => url}).
%% => <<"/api/data?traceparent=00-abc...-def...-01">>

Selective Injection

Only inject for verbose/debug mode:

SQL = case instrument_config:is_verbose_tracing() of
    true -> instrument_client:inject_trace_comment(OriginalSQL);
    false -> OriginalSQL
end.

Resource Pool Monitoring

Track connection pool checkout/checkin times separately from operation time.

With Pool Span

%% Wraps pool acquire/release around your operation
instrument_client:with_pool_span(<<"db_pool">>, postgresql, fun() ->
    Conn = poolboy:checkout(db_pool),
    try
        instrument_client:with_client_span(postgresql, <<"SELECT">>, #{
            target => <<"users">>
        }, fun() ->
            epgsql:equery(Conn, SQL, Params)
        end)
    after
        poolboy:checkin(db_pool, Conn)
    end
end).

Manual Pool Spans

%% Start acquisition span
AcquireSpan = instrument_client:pool_acquire_span(<<"db_pool">>, postgresql),
Conn = poolboy:checkout(db_pool),
instrument_tracer:end_span(AcquireSpan),

%% Use connection
Result = query(Conn, SQL),

%% Record release
poolboy:checkin(db_pool, Conn),
instrument_client:pool_release_span(<<"db_pool">>).

Attribute Builders

Build Attributes from Options

Attrs = instrument_client:build_attributes(#{
    system => postgresql,
    operation => <<"SELECT">>,
    target => <<"users">>,
    statement => <<"SELECT * FROM users">>,
    sanitize => true,
    attributes => #{<<"db.name">> => <<"mydb">>}
}).
%% #{<<"db.system">> => <<"postgresql">>,
%%   <<"db.operation">> => <<"SELECT">>,
%%   <<"db.sql.table">> => <<"users">>,
%%   <<"db.statement">> => <<"SELECT * FROM users">>,
%%   <<"db.name">> => <<"mydb">>}

Set Response Attributes

instrument_client:with_client_span(postgresql, <<"SELECT">>, #{}, fun() ->
    case epgsql:equery(Conn, SQL, Params) of
        {ok, Cols, Rows} ->
            instrument_client:set_response_attributes(#{
                rows_returned => length(Rows)
            }),
            instrument_tracer:set_status(ok),
            {ok, Cols, Rows};
        {error, Reason} ->
            instrument_tracer:set_status(error, format_error(Reason)),
            {error, Reason}
    end
end).

Attribute-Based Sampling

The instrument_sampler_attribute module provides fine-grained sampling control based on span attributes.

Important: Sampling Timing

Sampling decisions are made at span start, before your code executes. This means:

  • Attributes must be passed in Opts.attributes to influence sampling
  • Attributes set later with set_attribute/2 do NOT affect sampling
  • set_status(error) does NOT retroactively force sampling

For effective sampling, pass known attributes at span creation time:

%% WRONG: error attribute set too late
instrument_client:with_client_span(postgresql, Op, #{}, fun() ->
    case query() of
        {error, _} ->
            instrument_tracer:set_attribute(<<"error">>, true),  %% Too late!
            ...
    end
end).

%% RIGHT: Use known attributes at span start
instrument_client:with_client_span(postgresql, Op, #{
    attributes => #{
        <<"db.operation">> => Op,
        <<"db.sql.table">> => Table
    }
}, fun() -> ... end).

Configuration

instrument_sampler:set_sampler(instrument_sampler_attribute, #{
    default_ratio => 0.001,  %% 0.1% baseline
    attribute_rules => [
        %% These attributes MUST be set at span creation time
        {<<"db.operation">>, <<"SELECT">>, 0.001},
        {<<"db.operation">>, <<"INSERT">>, 0.01},
        {<<"db.operation">>, <<"UPDATE">>, 0.01},
        {<<"db.operation">>, <<"DELETE">>, 0.05},

        %% Critical tables always sampled
        {<<"db.sql.table">>, <<"payments">>, 1.0},
        {<<"db.sql.table">>, <<"audit_log">>, 1.0}
    ]
}).

Note: Rules like {<<"error">>, true, 1.0} only work if the error attribute is passed at span creation. For error sampling based on execution results, use tail-based sampling (post-export filtering) or custom span processors.

Rule Matching

  • Rules are evaluated in order
  • First matching rule determines sampling rate
  • If no rules match, default_ratio is used
  • Same trace ID always produces same decision (deterministic)

HTTP Example

instrument_sampler:set_sampler(instrument_sampler_attribute, #{
    default_ratio => 0.1,
    attribute_rules => [
        {<<"http.method">>, <<"GET">>, 0.01},
        {<<"http.method">>, <<"POST">>, 0.1},
        {<<"http.status_code">>, 500, 1.0}  %% Always sample 500s
    ]
}).

Messaging Example

instrument_sampler:set_sampler(instrument_sampler_attribute, #{
    default_ratio => 0.05,
    attribute_rules => [
        {<<"messaging.destination">>, <<"critical-events">>, 1.0},
        {<<"messaging.operation">>, <<"process">>, 0.1}
    ]
}).

Sampling Errors After Execution

To sample based on execution results (like errors), use tail-based sampling (see below).

Tail-Based Sampling

The instrument_span_processor_tail_sampler makes sampling decisions after spans complete, enabling filtering based on final state: errors, duration, attributes, and events.

When to Use Tail Sampling

  • Error sampling - Always keep spans that ended with errors
  • Latency sampling - Keep slow operations (e.g., > 100ms)
  • Post-execution attributes - Sample based on attributes set during execution
  • Exception events - Keep spans that recorded exceptions

Configuration

instrument_span_processor:register(instrument_span_processor_tail_sampler, #{
    %% Always keep spans matching these conditions (OR logic)
    always_keep => [
        {status, error},                    %% Keep all error spans
        {duration_ms, '>', 100},            %% Keep spans > 100ms
        {attribute, <<"priority">>, high},  %% Keep high priority spans
        has_exception                       %% Keep spans with exceptions
    ],

    %% Always drop spans matching these conditions (evaluated after always_keep)
    always_drop => [
        {attribute, <<"health_check">>, true}  %% Drop health checks
    ],

    %% Probabilistic sampling for remaining spans
    default_ratio => 0.01,  %% 1% of remaining spans

    %% Forward kept spans to this exporter
    exporter => instrument_exporter_otlp,
    exporter_config => #{endpoint => <<"http://localhost:4318">>}
}).

Rule Types

| Rule             | Syntax                             | Description                |
| ---------------- | ---------------------------------- | -------------------------- |
| Status           | {status, error} or {status, ok}    | Match span status          |
| Duration         | {duration_ms, Op, Value}           | Op: '>', '<', '>=', '<='   |
| Attribute        | {attribute, Key, Value}            | Exact attribute match      |
| Attribute exists | {attribute_exists, Key}            | Attribute is present       |
| Has event        | {has_event, EventName}             | Span has named event       |
| Has exception    | has_exception                      | Span has exception event   |

Processing Order

  1. Check always_keep rules (OR logic) - if any match, span is KEPT
  2. Check always_drop rules (OR logic) - if any match, span is DROPPED
  3. Apply default_ratio probability - if passed, span is KEPT
  4. Otherwise span is DROPPED

Example: Error and Latency Sampling

%% Keep all errors and slow requests, drop health checks, sample 0.1% of the rest
instrument_span_processor:register(instrument_span_processor_tail_sampler, #{
    always_keep => [
        {status, error},
        {duration_ms, '>', 500}   %% > 500ms is slow
    ],
    always_drop => [
        {attribute, <<"http.route">>, <<"/health">>},
        {attribute, <<"http.route">>, <<"/ready">>}
    ],
    default_ratio => 0.001,
    exporter => instrument_exporter_otlp,
    exporter_config => #{endpoint => <<"http://localhost:4318">>}
}).

Example: Debug Spans

%% Keep spans with debug flag or that recorded retries
instrument_span_processor:register(instrument_span_processor_tail_sampler, #{
    always_keep => [
        {attribute_exists, <<"debug">>},
        {has_event, <<"retry">>}
    ],
    default_ratio => 0.0,  %% Drop everything else
    exporter => instrument_exporter_console,
    exporter_config => #{}
}).

Tail vs Head Sampling

| Aspect           | Head Sampling (Attribute-Based) | Tail Sampling           |
| ---------------- | ------------------------------- | ----------------------- |
| Decision time    | At span start                   | At span end             |
| Error sampling   | Must predict errors upfront     | Works on actual errors  |
| Latency sampling | Cannot sample by duration       | Can filter by duration  |
| Memory usage     | Lower (drops spans early)       | Higher (keeps until end)|
| Use case         | High-volume production          | Debug, error analysis   |

For production, combine both: use head sampling for volume control and tail sampling for error/latency capture.

Examples

PostgreSQL with epgsql

-module(myapp_db).
-export([query/3]).

query(Conn, SQL, Params) ->
    Op = detect_operation(SQL),
    Table = detect_table(SQL),

    instrument_client:with_client_span(postgresql, Op, #{
        target => Table,
        statement => SQL,
        sanitize => true,
        attributes => #{
            <<"db.name">> => <<"myapp">>,
            <<"db.user">> => <<"appuser">>
        }
    }, fun() ->
        %% Optional: inject trace context for DB log correlation
        TracedSQL = case instrument_config:is_verbose_tracing() of
            true -> instrument_client:inject_trace_comment(SQL);
            false -> SQL
        end,

        case epgsql:equery(Conn, TracedSQL, Params) of
            {ok, Cols, Rows} ->
                instrument_client:set_response_attributes(#{
                    rows_returned => length(Rows)
                }),
                instrument_tracer:set_status(ok),
                {ok, Cols, Rows};
            {error, Reason} ->
                instrument_tracer:set_status(error, format_error(Reason)),
                instrument_tracer:set_attribute(<<"db.error.code">>,
                    error_code(Reason)),
                {error, Reason}
        end
    end).

detect_operation(SQL) ->
    case SQL of
        <<"SELECT", _/binary>> -> <<"SELECT">>;
        <<"INSERT", _/binary>> -> <<"INSERT">>;
        <<"UPDATE", _/binary>> -> <<"UPDATE">>;
        <<"DELETE", _/binary>> -> <<"DELETE">>;
        _ -> <<"UNKNOWN">>
    end.

detect_table(SQL) ->
    %% Simple table detection - use a proper SQL parser for production
    case re:run(SQL, "(?:FROM|INTO|UPDATE)\\s+(\\w+)", [{capture, [1], binary}, caseless]) of
        {match, [Table]} -> Table;
        nomatch -> <<"unknown">>
    end.

Redis with eredis

-module(myapp_redis).
-export([cmd/2]).

cmd(Conn, Command) ->
    [Op | _] = Command,
    instrument_client:with_client_span(redis, Op, #{
        statement => iolist_to_binary(lists:join(<<" ">>, Command)),
        sanitize => true
    }, fun() ->
        case eredis:q(Conn, Command) of
            {ok, Result} ->
                instrument_tracer:set_status(ok),
                {ok, Result};
            {error, Reason} ->
                instrument_tracer:set_status(error, format_error(Reason)),
                {error, Reason}
        end
    end).

Mnesia

-module(myapp_mnesia).
-export([read/2, write/2]).

read(Tab, Key) ->
    instrument_client:with_client_span(mnesia, <<"read">>, #{
        target => atom_to_binary(Tab)
    }, fun() ->
        case mnesia:read(Tab, Key) of
            [] ->
                instrument_tracer:set_status(ok),
                [];
            Records ->
                instrument_client:set_response_attributes(#{
                    rows_returned => length(Records)
                }),
                instrument_tracer:set_status(ok),
                Records
        end
    end).

write(Tab, Record) ->
    instrument_client:with_client_span(mnesia, <<"write">>, #{
        target => atom_to_binary(Tab)
    }, fun() ->
        ok = mnesia:write(Tab, Record, write),
        instrument_tracer:set_status(ok),
        ok
    end).

HTTP Client

-module(myapp_http).
-export([request/3]).

request(Method, URL, Body) ->
    instrument_client:with_client_span(http, Method, #{
        target => extract_path(URL),
        attributes => #{
            <<"http.url">> => URL,
            <<"http.method">> => Method
        }
    }, fun() ->
        case hackney:request(Method, URL, [], Body, []) of
            {ok, StatusCode, _Headers, ClientRef} ->
                {ok, ResponseBody} = hackney:body(ClientRef),
                instrument_client:set_response_attributes(#{
                    status_code => StatusCode,
                    response_size => byte_size(ResponseBody)
                }),
                case StatusCode >= 400 of
                    true ->
                        instrument_tracer:set_status(error, <<"HTTP error">>);
                    false ->
                        instrument_tracer:set_status(ok)
                end,
                {ok, StatusCode, ResponseBody};
            {error, Reason} ->
                instrument_tracer:set_status(error, format_error(Reason)),
                {error, Reason}
        end
    end).

Kafka Producer

-module(myapp_kafka).
-export([publish/3]).

publish(Topic, Key, Value) ->
    instrument_client:with_client_span(kafka, <<"publish">>, #{
        target => Topic,
        attributes => #{
            <<"messaging.system">> => <<"kafka">>,
            <<"messaging.destination">> => Topic,
            <<"messaging.destination.kind">> => <<"topic">>
        }
    }, fun() ->
        case brod:produce_sync(client, Topic, Key, Value) of
            ok ->
                instrument_tracer:set_status(ok),
                ok;
            {error, Reason} ->
                instrument_tracer:set_status(error, format_error(Reason)),
                {error, Reason}
        end
    end).

Runtime Controls

Verbose Tracing

%% WARNING: Only enable for debugging
instrument_config:set_verbose_tracing(true).
%% ... debug ...
instrument_config:set_verbose_tracing(false).

Exporter Control

%% Disable exporter during incident
instrument_config:disable_exporter(instrument_exporter_otlp).

%% Re-enable
instrument_config:enable_exporter(instrument_exporter_otlp).

Custom Span Processors

Implementing a Custom Processor

Custom span processors implement the instrument_span_processor behaviour:

-module(my_processor).
-behaviour(instrument_span_processor).

-export([init/1, on_start/2, on_end/1, shutdown/0, shutdown/1, force_flush/0, force_flush/1]).

init(Config) -> {ok, #{config => Config}}.

on_start(Span, _ParentCtx) ->
    %% Called when a span starts
    %% Return the (possibly modified) span
    Span.

on_end(Span) ->
    %% Called when a span ends
    %% Perform export, logging, etc.
    ok.

shutdown() -> ok.
shutdown(_State) -> ok.
force_flush() -> ok.
force_flush(_State) -> ok.

Processor Callback Restrictions

WARNING: Processor callbacks must NOT call back into the span processor system.

The on_start/2 and on_end/1 callbacks execute within the span processor gen_server. Calling instrument_span_processor functions from within these callbacks will cause a deadlock:

%% WRONG - Will deadlock!
on_end(Span) ->
    instrument_span_processor:force_flush(),  %% Deadlock!
    ok.

%% WRONG - Will deadlock!
on_start(Span, _ParentCtx) ->
    instrument_span_processor:list(),  %% Deadlock!
    Span.

Safe patterns:

%% OK - Async export in separate process
on_end(Span) ->
    spawn(fun() -> export_span(Span) end),
    ok.

%% OK - Direct API calls that don't use span processor
on_start(Span, _ParentCtx) ->
    instrument_tracer:trace_id(Span),  %% OK, doesn't use processor
    Span.

%% OK - Store in ETS for batch processing
on_end(Span) ->
    ets:insert(my_span_buffer, {make_ref(), Span}),
    ok.

What to avoid in processor callbacks: synchronous calls back into instrument_span_processor (deadlock), long-running or blocking work such as network I/O or inline exports (stalls the processor gen_server), and raising exceptions. Offload heavy work to a spawned process or buffer spans in ETS for batch processing, as shown above.

Production Recommendations

Performance Overhead

| Operation               | Overhead | Impact                          |
| ----------------------- | -------- | ------------------------------- |
| Single client span      | ~7 us    | Negligible for most operations  |
| Text sanitization       | ~1-2 us  | Scales with query length        |
| Trace comment injection | ~0.5 us  | Simple string concatenation     |

For a 10ms database query, tracing adds approximately 0.07% overhead.

Sampling Recommendations

| Throughput         | Recommended Sampling |
| ------------------ | -------------------- |
| < 100 ops/sec      | 100% (AlwaysOn)      |
| 100 - 1K ops/sec   | 10-50%               |
| 1K - 10K ops/sec   | 1-10%                |
| 10K - 100K ops/sec | 0.1-1%               |
| > 100K ops/sec     | 0.01-0.1% + errors   |

Note: Error sampling via attributes only works if you pass error hints at span start. For runtime error sampling, use tail-based sampling or span processors.

Production Configuration

-module(myapp_tracing).
-export([configure/0]).

configure() ->
    instrument_sampler:set_sampler(instrument_sampler_attribute, #{
        default_ratio => 0.001,
        attribute_rules => [
            %% Sample slow operations (must be set at span start)
            {<<"slow_operation">>, true, 1.0},

            %% Higher rate for writes
            {<<"db.operation">>, <<"INSERT">>, 0.01},
            {<<"db.operation">>, <<"UPDATE">>, 0.01},
            {<<"db.operation">>, <<"DELETE">>, 0.05},

            %% DDL always sampled
            {<<"db.operation">>, <<"CREATE">>, 1.0},
            {<<"db.operation">>, <<"DROP">>, 1.0},
            {<<"db.operation">>, <<"ALTER">>, 1.0},

            %% Critical tables
            {<<"db.sql.table">>, <<"payments">>, 1.0}
        ]
    }),

    %% Ensure verbose tracing is OFF
    instrument_config:set_verbose_tracing(false),

    ok.

Security Checklist

  • [ ] Queries sanitized (no PII in spans)
  • [ ] Verbose tracing disabled in production
  • [ ] Error sampling at 100%
  • [ ] Exporter timeouts configured
  • [ ] Sensitive tables excluded or specially handled

Marking Slow Operations

{QueryTime, Result} = timer:tc(fun() -> do_query(Conn, SQL) end),
case QueryTime > 100000 of  %% > 100ms
    true ->
        instrument_tracer:set_attribute(<<"slow_operation">>, true);
    false ->
        ok
end.

Note: Because this attribute is set after the query executes, it cannot influence head-based (attribute) sampling, which decides at span start. Use it together with tail-based sampling (e.g. an always_keep rule on slow_operation) or a custom span processor.