Client Tracing Guide
Generic utilities for tracing client operations: databases, HTTP clients, message queues, RPC calls, and more.
Overview
The instrument_client module provides reusable utilities for creating client-kind spans following OpenTelemetry semantic conventions. It works with any client operation type and automatically sets appropriate attributes based on the system type.
Key features:
- Generic client spans - Works for databases, HTTP, messaging, RPC
- Text sanitization - Remove sensitive data from queries and URLs
- Trace context injection - SQLCommenter-style trace propagation
- Resource pool monitoring - Track pool checkout/checkin
- Attribute-based sampling - Fine-grained sampling control
Client Span Helpers
Basic Usage
%% Simple client span
instrument_client:with_client_span(postgresql, <<"SELECT">>, fun() ->
epgsql:equery(Conn, SQL, Params)
end).
%% With options
instrument_client:with_client_span(postgresql, <<"SELECT">>, #{
target => <<"users">>,
statement => <<"SELECT * FROM users WHERE id = $1">>,
sanitize => true,
attributes => #{<<"db.name">> => <<"mydb">>}
}, fun() ->
epgsql:equery(Conn, SQL, Params)
end).
Manual Span Control
When you need more control over span lifecycle:
Span = instrument_client:start_client_span(redis, <<"GET">>, #{
target => <<"session:abc">>
}),
try
Result = eredis:q(Conn, ["GET", "session:abc"]),
instrument_tracer:set_status(ok),
Result
catch
Class:Reason:Stacktrace ->
instrument_tracer:set_status(error, format_error(Reason)),
erlang:raise(Class, Reason, Stacktrace)
after
instrument_tracer:end_span(Span)
end.
Span Naming
Span names follow semantic conventions: system operation [target]
%% Without target: "postgresql SELECT"
instrument_client:client_span_name(postgresql, <<"SELECT">>).
%% With target: "postgresql SELECT users"
instrument_client:client_span_name(postgresql, <<"SELECT">>, #{target => <<"users">>}).
Text Sanitization
Remove sensitive data from queries before including in spans.
Basic Sanitization
%% SQL: Replace string literals and numbers
instrument_client:sanitize_text(<<"SELECT * FROM users WHERE email = 'john@secret.com' AND id = 123">>).
%% => <<"SELECT * FROM users WHERE email = ? AND id = ?">>
Custom Patterns
%% URL path sanitization
instrument_client:sanitize_text(<<"/users/12345/orders">>, #{
patterns => [<<"/\\d+">>],
placeholder => <<"/:id">>
}).
%% => <<"/users/:id/orders">>
%% Preserve PostgreSQL placeholders
instrument_client:sanitize_text(
<<"SELECT * FROM users WHERE id = $1 AND name = 'John'">>,
#{preserve => [<<"\\$\\d+">>]}
).
%% => <<"SELECT * FROM users WHERE id = $1 AND name = ?">>
Custom Placeholder
instrument_client:sanitize_text(<<"WHERE x = 'secret'">>, #{
placeholder => <<"<REDACTED>">>
}).
%% => <<"WHERE x = <REDACTED>">>
Trace Context Propagation
Inject trace context into queries for database log correlation (SQLCommenter-style).
SQL Comment Format
instrument_tracer:with_span(<<"db_query">>, fun() ->
SQL = <<"SELECT * FROM users">>,
TracedSQL = instrument_client:inject_trace_comment(SQL),
%% => <<"SELECT * FROM users /*traceparent='00-abc...-def...-01'*/">>
epgsql:squery(Conn, TracedSQL)
end).
URL Query Parameter Format
URL = <<"/api/data">>,
TracedURL = instrument_client:inject_trace_comment(URL, #{format => url}).
%% => <<"/api/data?traceparent=00-abc...-def...-01">>
Selective Injection
Only inject for verbose/debug mode:
SQL = case instrument_config:is_verbose_tracing() of
true -> instrument_client:inject_trace_comment(OriginalSQL);
false -> OriginalSQL
end.
Resource Pool Monitoring
Track connection pool checkout/checkin times separately from operation time.
With Pool Span
%% Wraps pool acquire/release around your operation
instrument_client:with_pool_span(<<"db_pool">>, postgresql, fun() ->
Conn = poolboy:checkout(db_pool),
try
instrument_client:with_client_span(postgresql, <<"SELECT">>, #{
target => <<"users">>
}, fun() ->
epgsql:equery(Conn, SQL, Params)
end)
after
poolboy:checkin(db_pool, Conn)
end
end).
Manual Pool Spans
%% Start acquisition span
AcquireSpan = instrument_client:pool_acquire_span(<<"db_pool">>, postgresql),
Conn = poolboy:checkout(db_pool),
instrument_tracer:end_span(AcquireSpan),
%% Use connection
Result = query(Conn, SQL),
%% Record release
poolboy:checkin(db_pool, Conn),
instrument_client:pool_release_span(<<"db_pool">>).
Attribute Builders
Build Attributes from Options
Attrs = instrument_client:build_attributes(#{
system => postgresql,
operation => <<"SELECT">>,
target => <<"users">>,
statement => <<"SELECT * FROM users">>,
sanitize => true,
attributes => #{<<"db.name">> => <<"mydb">>}
}).
%% #{<<"db.system">> => <<"postgresql">>,
%% <<"db.operation">> => <<"SELECT">>,
%% <<"db.sql.table">> => <<"users">>,
%% <<"db.statement">> => <<"SELECT * FROM users">>,
%% <<"db.name">> => <<"mydb">>}
Set Response Attributes
instrument_client:with_client_span(postgresql, <<"SELECT">>, #{}, fun() ->
case epgsql:equery(Conn, SQL, Params) of
{ok, Cols, Rows} ->
instrument_client:set_response_attributes(#{
rows_returned => length(Rows)
}),
instrument_tracer:set_status(ok),
{ok, Cols, Rows};
{error, Reason} ->
instrument_tracer:set_status(error, format_error(Reason)),
{error, Reason}
end
end).
Attribute-Based Sampling
The instrument_sampler_attribute module provides fine-grained sampling control based on span attributes.
Important: Sampling Timing
Sampling decisions are made at span start, before your code executes. This means:
- Attributes must be passed in Opts.attributes to influence sampling
- Attributes set later with set_attribute/2 do NOT affect sampling
- set_status(error) does NOT retroactively force sampling
For effective sampling, pass known attributes at span creation time:
%% WRONG: error attribute set too late
instrument_client:with_client_span(postgresql, Op, #{}, fun() ->
case query() of
{error, _} ->
instrument_tracer:set_attribute(<<"error">>, true), %% Too late!
...
end
end).
%% RIGHT: Use known attributes at span start
instrument_client:with_client_span(postgresql, Op, #{
attributes => #{
<<"db.operation">> => Op,
<<"db.sql.table">> => Table
}
}, fun() -> ... end).
Configuration
instrument_sampler:set_sampler(instrument_sampler_attribute, #{
default_ratio => 0.001, %% 0.1% baseline
attribute_rules => [
%% These attributes MUST be set at span creation time
{<<"db.operation">>, <<"SELECT">>, 0.001},
{<<"db.operation">>, <<"INSERT">>, 0.01},
{<<"db.operation">>, <<"UPDATE">>, 0.01},
{<<"db.operation">>, <<"DELETE">>, 0.05},
%% Critical tables always sampled
{<<"db.sql.table">>, <<"payments">>, 1.0},
{<<"db.sql.table">>, <<"audit_log">>, 1.0}
]
}).
Note: Rules like {<<"error">>, true, 1.0} only work if the error attribute is passed at span creation. For error sampling based on execution results, use tail-based sampling (post-export filtering) or custom span processors.
Rule Matching
- Rules are evaluated in order
- First matching rule determines sampling rate
- If no rules match, default_ratio is used
- Same trace ID always produces same decision (deterministic)
HTTP Example
instrument_sampler:set_sampler(instrument_sampler_attribute, #{
default_ratio => 0.1,
attribute_rules => [
{<<"http.method">>, <<"GET">>, 0.01},
{<<"http.method">>, <<"POST">>, 0.1},
{<<"http.status_code">>, 500, 1.0} %% Always sample 500s
]
}).
Messaging Example
instrument_sampler:set_sampler(instrument_sampler_attribute, #{
default_ratio => 0.05,
attribute_rules => [
{<<"messaging.destination">>, <<"critical-events">>, 1.0},
{<<"messaging.operation">>, <<"process">>, 0.1}
]
}).
Sampling Errors After Execution
To sample based on execution results (like errors), use tail-based sampling (see below).
Tail-Based Sampling
The instrument_span_processor_tail_sampler makes sampling decisions after spans complete, enabling filtering based on final state: errors, duration, attributes, and events.
When to Use Tail Sampling
- Error sampling - Always keep spans that ended with errors
- Latency sampling - Keep slow operations (e.g., > 100ms)
- Post-execution attributes - Sample based on attributes set during execution
- Exception events - Keep spans that recorded exceptions
Configuration
instrument_span_processor:register(instrument_span_processor_tail_sampler, #{
%% Always keep spans matching these conditions (OR logic)
always_keep => [
{status, error}, %% Keep all error spans
{duration_ms, '>', 100}, %% Keep spans > 100ms
{attribute, <<"priority">>, high}, %% Keep high priority spans
has_exception %% Keep spans with exceptions
],
%% Always drop spans matching these conditions (evaluated after always_keep)
always_drop => [
{attribute, <<"health_check">>, true} %% Drop health checks
],
%% Probabilistic sampling for remaining spans
default_ratio => 0.01, %% 1% of remaining spans
%% Forward kept spans to this exporter
exporter => instrument_exporter_otlp,
exporter_config => #{endpoint => <<"http://localhost:4318">>}
}).
Rule Types
| Rule | Syntax | Description |
|---|---|---|
| Status | {status, error} or {status, ok} | Match span status |
| Duration | {duration_ms, Op, Value} | Op: '>', '<', '>=', '<=' |
| Attribute | {attribute, Key, Value} | Exact attribute match |
| Attribute exists | {attribute_exists, Key} | Attribute is present |
| Has event | {has_event, EventName} | Span has named event |
| Has exception | has_exception | Span has exception event |
Processing Order
1. Check always_keep rules (OR logic) - if any match, the span is KEPT
2. Check always_drop rules (OR logic) - if any match, the span is DROPPED
3. Apply default_ratio probability - if passed, the span is KEPT
4. Otherwise the span is DROPPED
Example: Error and Latency Sampling
%% Keep all errors and slow requests, drop health checks, sample 0.1% of the rest
instrument_span_processor:register(instrument_span_processor_tail_sampler, #{
always_keep => [
{status, error},
{duration_ms, '>', 500} %% > 500ms is slow
],
always_drop => [
{attribute, <<"http.route">>, <<"/health">>},
{attribute, <<"http.route">>, <<"/ready">>}
],
default_ratio => 0.001,
exporter => instrument_exporter_otlp,
exporter_config => #{endpoint => <<"http://localhost:4318">>}
}).
Example: Debug Spans
%% Keep spans with debug flag or that recorded retries
instrument_span_processor:register(instrument_span_processor_tail_sampler, #{
always_keep => [
{attribute_exists, <<"debug">>},
{has_event, <<"retry">>}
],
default_ratio => 0.0, %% Drop everything else
exporter => instrument_exporter_console,
exporter_config => #{}
}).
Tail vs Head Sampling
| Aspect | Head Sampling (Attribute-Based) | Tail Sampling |
|---|---|---|
| Decision time | At span start | At span end |
| Error sampling | Must predict errors upfront | Works on actual errors |
| Latency sampling | Cannot sample by duration | Can filter by duration |
| Memory usage | Lower (drops spans early) | Higher (keeps until end) |
| Use case | High-volume production | Debug, error analysis |
For production, combine both: use head sampling for volume control and tail sampling for error/latency capture.
Examples
PostgreSQL with epgsql
-module(myapp_db).
-export([query/3]).
query(Conn, SQL, Params) ->
Op = detect_operation(SQL),
Table = detect_table(SQL),
instrument_client:with_client_span(postgresql, Op, #{
target => Table,
statement => SQL,
sanitize => true,
attributes => #{
<<"db.name">> => <<"myapp">>,
<<"db.user">> => <<"appuser">>
}
}, fun() ->
%% Optional: inject trace context for DB log correlation
TracedSQL = case instrument_config:is_verbose_tracing() of
true -> instrument_client:inject_trace_comment(SQL);
false -> SQL
end,
case epgsql:equery(Conn, TracedSQL, Params) of
{ok, Cols, Rows} ->
instrument_client:set_response_attributes(#{
rows_returned => length(Rows)
}),
instrument_tracer:set_status(ok),
{ok, Cols, Rows};
{error, Reason} ->
instrument_tracer:set_status(error, format_error(Reason)),
instrument_tracer:set_attribute(<<"db.error.code">>,
error_code(Reason)),
{error, Reason}
end
end).
detect_operation(SQL) ->
case SQL of
<<"SELECT", _/binary>> -> <<"SELECT">>;
<<"INSERT", _/binary>> -> <<"INSERT">>;
<<"UPDATE", _/binary>> -> <<"UPDATE">>;
<<"DELETE", _/binary>> -> <<"DELETE">>;
_ -> <<"UNKNOWN">>
end.
detect_table(SQL) ->
%% Simple table detection - use a proper SQL parser for production
case re:run(SQL, "(?:FROM|INTO|UPDATE)\\s+(\\w+)", [{capture, [1], binary}, caseless]) of
{match, [Table]} -> Table;
nomatch -> <<"unknown">>
end.
Redis with eredis
-module(myapp_redis).
-export([cmd/2]).
cmd(Conn, Command) ->
[Op | _] = Command,
instrument_client:with_client_span(redis, Op, #{
statement => iolist_to_binary(lists:join(<<" ">>, Command)),
sanitize => true
}, fun() ->
case eredis:q(Conn, Command) of
{ok, Result} ->
instrument_tracer:set_status(ok),
{ok, Result};
{error, Reason} ->
instrument_tracer:set_status(error, format_error(Reason)),
{error, Reason}
end
end).
Mnesia
-module(myapp_mnesia).
-export([read/2, write/2]).
read(Tab, Key) ->
instrument_client:with_client_span(mnesia, <<"read">>, #{
target => atom_to_binary(Tab)
}, fun() ->
case mnesia:read(Tab, Key) of
[] ->
instrument_tracer:set_status(ok),
[];
Records ->
instrument_client:set_response_attributes(#{
rows_returned => length(Records)
}),
instrument_tracer:set_status(ok),
Records
end
end).
write(Tab, Record) ->
instrument_client:with_client_span(mnesia, <<"write">>, #{
target => atom_to_binary(Tab)
}, fun() ->
ok = mnesia:write(Tab, Record, write),
instrument_tracer:set_status(ok),
ok
end).
HTTP Client
-module(myapp_http).
-export([request/3]).
request(Method, URL, Body) ->
instrument_client:with_client_span(http, Method, #{
target => extract_path(URL),
attributes => #{
<<"http.url">> => URL,
<<"http.method">> => Method
}
}, fun() ->
case hackney:request(Method, URL, [], Body, []) of
{ok, StatusCode, _Headers, ClientRef} ->
{ok, ResponseBody} = hackney:body(ClientRef),
instrument_client:set_response_attributes(#{
status_code => StatusCode,
response_size => byte_size(ResponseBody)
}),
case StatusCode >= 400 of
true ->
instrument_tracer:set_status(error, <<"HTTP error">>);
false ->
instrument_tracer:set_status(ok)
end,
{ok, StatusCode, ResponseBody};
{error, Reason} ->
instrument_tracer:set_status(error, format_error(Reason)),
{error, Reason}
end
end).
Kafka Producer
-module(myapp_kafka).
-export([publish/3]).
publish(Topic, Key, Value) ->
instrument_client:with_client_span(kafka, <<"publish">>, #{
target => Topic,
attributes => #{
<<"messaging.system">> => <<"kafka">>,
<<"messaging.destination">> => Topic,
<<"messaging.destination.kind">> => <<"topic">>
}
}, fun() ->
case brod:produce_sync(client, Topic, Key, Value) of
ok ->
instrument_tracer:set_status(ok),
ok;
{error, Reason} ->
instrument_tracer:set_status(error, format_error(Reason)),
{error, Reason}
end
end).
Runtime Controls
Verbose Tracing
%% WARNING: Only enable for debugging
instrument_config:set_verbose_tracing(true).
%% ... debug ...
instrument_config:set_verbose_tracing(false).
Exporter Control
%% Disable exporter during incident
instrument_config:disable_exporter(instrument_exporter_otlp).
%% Re-enable
instrument_config:enable_exporter(instrument_exporter_otlp).
Custom Span Processors
Implementing a Custom Processor
Custom span processors implement the instrument_span_processor behaviour:
-module(my_processor).
-behaviour(instrument_span_processor).
-export([init/1, on_start/2, on_end/1, shutdown/0, shutdown/1, force_flush/0, force_flush/1]).
init(Config) -> {ok, #{config => Config}}.
on_start(Span, _ParentCtx) ->
%% Called when a span starts
%% Return the (possibly modified) span
Span.
on_end(Span) ->
%% Called when a span ends
%% Perform export, logging, etc.
ok.
shutdown() -> ok.
shutdown(_State) -> ok.
force_flush() -> ok.
force_flush(_State) -> ok.
Processor Callback Restrictions
WARNING: Processor callbacks must NOT call back into the span processor system.
The on_start/2 and on_end/1 callbacks execute within the span processor gen_server. Calling instrument_span_processor functions from within these callbacks will cause a deadlock:
%% WRONG - Will deadlock!
on_end(Span) ->
instrument_span_processor:force_flush(), %% Deadlock!
ok.
%% WRONG - Will deadlock!
on_start(Span, _ParentCtx) ->
instrument_span_processor:list(), %% Deadlock!
Span.
Safe patterns:
%% OK - Async export in separate process
on_end(Span) ->
spawn(fun() -> export_span(Span) end),
ok.
%% OK - Direct API calls that don't use span processor
on_start(Span, _ParentCtx) ->
instrument_tracer:trace_id(Span), %% OK, doesn't use processor
Span.
%% OK - Store in ETS for batch processing
on_end(Span) ->
ets:insert(my_span_buffer, {make_ref(), Span}),
ok.
What to avoid in processor callbacks:
- instrument_span_processor:register/2
- instrument_span_processor:unregister/1
- instrument_span_processor:list/0
- instrument_span_processor:shutdown/0
- instrument_span_processor:force_flush/0
- Any synchronous call that might eventually call back into the span processor
Production Recommendations
Performance Overhead
| Operation | Overhead | Impact |
|---|---|---|
| Single client span | ~7 us | Negligible for most operations |
| Text sanitization | ~1-2 us | Scales with query length |
| Trace comment injection | ~0.5 us | Simple string concatenation |
For a 10ms database query, tracing adds approximately 0.07% overhead.
Sampling Recommendations
| Throughput | Recommended Sampling |
|---|---|
| < 100 ops/sec | 100% (AlwaysOn) |
| 100 - 1K ops/sec | 10-50% |
| 1K - 10K ops/sec | 1-10% |
| 10K - 100K ops/sec | 0.1-1% |
| > 100K ops/sec | 0.01-0.1% + errors |
Note: Error sampling via attributes only works if you pass error hints at span start. For runtime error sampling, use tail-based sampling or span processors.
Production Configuration
-module(myapp_tracing).
-export([configure/0]).
configure() ->
instrument_sampler:set_sampler(instrument_sampler_attribute, #{
default_ratio => 0.001,
attribute_rules => [
%% Sample slow operations (must be set at span start)
{<<"slow_operation">>, true, 1.0},
%% Higher rate for writes
{<<"db.operation">>, <<"INSERT">>, 0.01},
{<<"db.operation">>, <<"UPDATE">>, 0.01},
{<<"db.operation">>, <<"DELETE">>, 0.05},
%% DDL always sampled
{<<"db.operation">>, <<"CREATE">>, 1.0},
{<<"db.operation">>, <<"DROP">>, 1.0},
{<<"db.operation">>, <<"ALTER">>, 1.0},
%% Critical tables
{<<"db.sql.table">>, <<"payments">>, 1.0}
]
}),
%% Ensure verbose tracing is OFF
instrument_config:set_verbose_tracing(false),
ok.
Security Checklist
- [ ] Queries sanitized (no PII in spans)
- [ ] Verbose tracing disabled in production
- [ ] Error sampling at 100%
- [ ] Exporter timeouts configured
- [ ] Sensitive tables excluded or specially handled
Marking Slow Operations
{QueryTime, Result} = timer:tc(fun() -> do_query(Conn, SQL) end),
case QueryTime > 100000 of %% timer:tc returns microseconds, so > 100ms
true ->
instrument_tracer:set_attribute(<<"slow_operation">>, true);
false ->
ok
end.
Note: an attribute set after execution cannot influence head (attribute-based) sampling, which decides at span start. Pair this marker with a tail-based sampling rule such as {attribute, <<"slow_operation">>, true} to keep these spans.