Examples
Drop-in patterns for common erllama workflows. Each block is a
self-contained snippet that runs from rebar3 shell after
{ok, _} = application:ensure_all_started(erllama) (the boot is
omitted from the snippets; assume it's there).
10-second smoke test (no model required)
The cache subsystem is independently usable. From rebar3 shell:
1> {ok, _} = application:ensure_all_started(erllama).
2> ok = filelib:ensure_path("/tmp/edemo").
3> {ok, _} = erllama_cache_disk_srv:start_link(d, "/tmp/edemo").
4> Meta = #{save_reason => cold, quant_bits => 16,
fingerprint => binary:copy(<<170>>, 32),
fingerprint_mode => safe, quant_type => f16,
ctx_params_hash => binary:copy(<<187>>, 32),
tokens => [1,2,3], context_size => 4096,
prompt_text => <<>>, hostname => <<"d">>,
erllama_version => <<"0.1.0">>}.
5> {ok, K, _Header, Size} =
erllama_cache_disk_srv:save(d, Meta, <<"hi">>).
6> {ok, _Info, <<"hi">>} = erllama_cache_disk_srv:load(d, K).
7> erllama_cache:get_counters().
This verifies the path end-to-end: a .kvc file is published, the meta-server
registers it, the load path round-trips the payload, and the
counters reflect one cold save plus one cache miss followed by one
exact hit. No model loaded — llama_backend_init doesn't run.
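To watch the counters move, repeat the load and diff the output of get_counters/0 (a sketch continuing the session above; the exact keys in the counters term are whatever your build reports):

C0 = erllama_cache:get_counters(),
{ok, _Info2, <<"hi">>} = erllama_cache_disk_srv:load(d, K),
C1 = erllama_cache:get_counters(),
io:format("before: ~p~nafter: ~p~n", [C0, C1]).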
1. Load a model and run a one-shot completion
{ok, _} = erllama_cache_disk_srv:start_link(my_disk, "/var/lib/erllama/kvc"),
{ok, Bin} = file:read_file("/srv/models/tinyllama-1.1b-chat.Q4_K_M.gguf"),
Fp = crypto:hash(sha256, Bin),
{ok, M} = erllama:load_model(#{
backend => erllama_model_llama,
model_path => "/srv/models/tinyllama-1.1b-chat.Q4_K_M.gguf",
fingerprint => Fp,
fingerprint_mode => safe,
quant_type => q4_k_m,
quant_bits => 4,
ctx_params_hash => crypto:hash(sha256, term_to_binary({2048, 512})),
context_size => 2048,
tier_srv => my_disk,
tier => disk
}),
{ok, Reply, _Tokens} =
erllama:complete(M, <<"Once upon a time, in a quiet village">>),
io:format("~s~n", [Reply]),
ok = erllama:unload(M).
The first call cold-prefills the prompt and async-saves cold + finish rows. Repeating the same call hits the cache via the exact-key path.
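In place of the single complete/2 call above you can time a cold run against a warm repeat to see the exact-key hit directly (a minimal sketch using stdlib timer:tc/3; absolute numbers depend on model and hardware):

Prompt = <<"Once upon a time, in a quiet village">>,
{TCold, {ok, _, _}} = timer:tc(erllama, complete, [M, Prompt]),
{TWarm, {ok, _, _}} = timer:tc(erllama, complete, [M, Prompt]),
io:format("cold: ~p us, warm: ~p us~n", [TCold, TWarm]).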
2. Stateless HTTP server (resends full conversation per turn)
%% Inside your request handler. The cache walks the new prompt's
%% tokens backward by `boundary_align_tokens` and resumes from the
%% longest published prefix automatically — no parent_key needed.
handle_chat(ModelId, Prompt) ->
{ok, Reply, _Tokens} =
erllama:complete(ModelId, Prompt, #{response_tokens => 256}),
{200, [{"Content-Type", "text/plain"}], Reply}.Hits show up as hits_longest_prefix in erllama_cache:get_counters/0.
3. Multi-turn Erlang-native session (tracks parent_key)
The session layer is responsible for threading parent_key between
turns. The cache does not expose a "last finish key" lookup; you
compute the key from the tokens complete/3 returns.
%% Helper: build the finish-save key from the tokens of the previous turn.
finish_key(Fp, QT, CtxHash, Tokens) ->
erllama_cache_key:make(#{
fingerprint => Fp,
quant_type => QT,
ctx_params_hash => CtxHash,
tokens => Tokens
}).
chat(Model, KeyMeta, Prompt, undefined) ->
{ok, Reply, Tokens} = erllama:complete(Model, Prompt, #{}),
K = finish_key(maps:get(fingerprint, KeyMeta),
maps:get(quant_type, KeyMeta),
maps:get(ctx_params_hash, KeyMeta), Tokens),
{Reply, K};
chat(Model, KeyMeta, Prompt, ParentKey) ->
{ok, Reply, Tokens} =
erllama:complete(Model, Prompt, #{parent_key => ParentKey}),
K = finish_key(maps:get(fingerprint, KeyMeta),
maps:get(quant_type, KeyMeta),
maps:get(ctx_params_hash, KeyMeta), Tokens),
{Reply, K}.
%% Driver: `KeyMeta` is whatever you passed to `load_model/2`.
{R1, K1} = chat(M, KeyMeta, <<"User: hello\nAssistant:">>, undefined),
{R2, K2} = chat(M, KeyMeta,
<<"User: hello\nAssistant: ", R1/binary,
"\nUser: tell me a joke\nAssistant:">>,
K1),
ok.
Passing parent_key skips the longest-prefix walk and resumes
directly from the previous turn's finish save.
4. Multiple loaded models
Model ids are binary() (the registered name).
{ok, _} = erllama:load_model(<<"tiny">>, TinyConfig),
{ok, _} = erllama:load_model(<<"big">>, BigConfig),
{ok, R1, _} = erllama:complete(<<"tiny">>, <<"summarise: ...">>),
{ok, R2, _} = erllama:complete(<<"big">>, <<"deep analysis of: ...">>),
ok = erllama:unload(<<"tiny">>),
ok = erllama:unload(<<"big">>).Both share one erllama_cache instance. Cache rows are scoped by
fingerprint, so the two models never collide.
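TinyConfig and BigConfig above are ordinary load_model maps. A minimal sketch of TinyConfig, reusing the options from example 1 (the path, hashes, and my_disk tier server are placeholders for your own values):

{ok, TinyBin} = file:read_file("/srv/models/tinyllama-1.1b-chat.Q4_K_M.gguf"),
TinyConfig = #{
    backend => erllama_model_llama,
    model_path => "/srv/models/tinyllama-1.1b-chat.Q4_K_M.gguf",
    fingerprint => crypto:hash(sha256, TinyBin),
    fingerprint_mode => safe,
    quant_type => q4_k_m,
    quant_bits => 4,
    ctx_params_hash => crypto:hash(sha256, term_to_binary({2048, 512})),
    context_size => 2048,
    tier_srv => my_disk,
    tier => disk
}.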
5. Concurrent agents on a shared system prompt
ModelId = <<"assistant">>,
SharedPrefix = <<"You are a helpful assistant.\n">>,
Parent = self(),
%% Spawn N workers; each appends a different user query but they
%% all start with the same prefix. After the first agent's cold
%% prefill saves, every subsequent agent gets a longest-prefix hit
%% on the shared part and only prefills its tail.
Workers = [
spawn(fun() ->
Q = list_to_binary(io_lib:format("Worker ~p question.", [N])),
Prompt = <<SharedPrefix/binary, Q/binary>>,
{ok, Reply, _} = erllama:complete(ModelId, Prompt),
Parent ! {N, Reply}
end) || N <- lists:seq(1, 8)
],
%% Collect.
Replies = [receive {N, R} -> {N, R} end || N <- lists:seq(1, 8)],
Replies.
6. Streaming tokens (infer/4) with cancellation
{ok, Tokens} = erllama:tokenize(ModelId, <<"Once upon a time">>),
{ok, Ref} = erllama:infer(ModelId, Tokens,
#{response_tokens => 200}, self()),
%% Receive loop (define this in your module): prints fragments as they arrive.
loop(Ref) ->
receive
{erllama_token, Ref, Fragment} ->
io:put_chars(Fragment),
loop(Ref);
{erllama_done, Ref, _Stats} ->
io:nl(),
ok;
{erllama_error, Ref, Reason} ->
{error, Reason}
after 30000 ->
erllama:cancel(Ref),
%% Still drain the final done message after cancel.
loop(Ref)
end.
cancel/1 is observed at the next inter-token boundary; the model
always emits a final {erllama_done, Ref, Stats} with
#{cancelled => true}.
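To tell a cancelled run apart from a natural finish, inspect that flag when the done message arrives (a sketch; it assumes the flag is simply absent or false on a normal finish):

handle_done(Stats) ->
    case maps:get(cancelled, Stats, false) of
        true  -> io:format("generation cancelled~n");
        false -> io:nl()
    end.

Call handle_done(Stats) from the {erllama_done, Ref, Stats} clause of loop/1 above.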
7. Chat template + embeddings
%% Render a chat request through the model's built-in GGUF template
%% and tokenise it in one shot. Backed by llama_chat_apply_template.
{ok, ChatTokens} = erllama:apply_chat_template(ModelId, #{
messages => [
#{role => system, content => <<"You are concise.">>},
#{role => user, content => <<"What's 2+2?">>}
]
}),
{ok, Ref} = erllama:infer(ModelId, ChatTokens, #{response_tokens => 8}, self()).
%% Pooled sentence embedding via llama_get_embeddings_seq.
%% The model must have been loaded with embedding-friendly settings
%% (see guides/loading.md). Returns a list of floats.
{ok, Toks} = erllama:tokenize(ModelId, <<"The quick brown fox.">>),
{ok, Embedding} = erllama:embed(ModelId, Toks).
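Because embed/2 returns a plain list of floats, downstream similarity math is ordinary Erlang. A minimal cosine-similarity sketch over two such embeddings (nothing here is erllama-specific):

cosine(A, B) ->
    Dot = lists:sum(lists:zipwith(fun(X, Y) -> X * Y end, A, B)),
    NormA = math:sqrt(lists:sum([X * X || X <- A])),
    NormB = math:sqrt(lists:sum([Y * Y || Y <- B])),
    Dot / (NormA * NormB).

%% {ok, E1} = erllama:embed(ModelId, Toks1),
%% {ok, E2} = erllama:embed(ModelId, Toks2),
%% Similarity = cosine(E1, E2).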
8. Grammar-constrained sampling (GBNF)
%% Force the model to emit a JSON-shaped string with a digit value.
Grammar = <<
"root ::= \"{\" ws \"\\\"n\\\":\" ws digit \"}\"\n"
"digit ::= [0-9]\n"
"ws ::= [ \\t\\n]*"
>>,
{ok, Toks} = erllama:tokenize(ModelId, <<"Reply with JSON:">>),
{ok, Ref} = erllama:infer(ModelId, Toks,
#{response_tokens => 32, grammar => Grammar},
self()).
The grammar is per-request: the sampler chain is reset to grammar → greedy for the duration of the request and cleared on completion or cancellation.
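To capture the constrained output as a single binary instead of printing it, accumulate the streamed fragments (a sketch reusing the infer/4 message protocol from example 6; fragments are treated as iodata):

collect(Ref, Acc) ->
    receive
        {erllama_token, Ref, Fragment} ->
            collect(Ref, [Fragment | Acc]);
        {erllama_done, Ref, _Stats} ->
            {ok, iolist_to_binary(lists:reverse(Acc))};
        {erllama_error, Ref, Reason} ->
            {error, Reason}
    end.

%% {ok, Json} = collect(Ref, []).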
9. Inspecting cache state
%% Hit/miss/save counters and per-path latency totals.
Counters = erllama_cache:get_counters(),
io:format("~p~n", [Counters]).
%% Every row in the index. dump/0 returns raw ETS tuples; the layout
%% is documented in include/erllama_cache.hrl:
%% {Key, Tier, Size, LastUsedNs, Refcount, Status, Header,
%% Location, TokensRef, Hits}
Dump = erllama_cache_meta_srv:dump(),
[io:format("tier=~p size=~p refs=~p~n", [Tier, Size, Refs])
|| {_Key, Tier, Size, _Lru, Refs, _Status, _Hdr, _Loc, _Tok, _Hits} <- Dump].
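For a quick per-tier byte total, fold over the same dump using the tuple layout above (a sketch; only the Tier and Size fields are used):

PerTier = lists:foldl(
    fun({_K, Tier, Size, _Lru, _Refs, _St, _Hdr, _Loc, _Tok, _Hits}, Acc) ->
        maps:update_with(Tier, fun(S) -> S + Size end, Size, Acc)
    end, #{}, Dump),
io:format("bytes per tier: ~p~n", [PerTier]).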
%% Free at least 256 MiB, oldest LRU first, RAM tiers only.
erllama_cache:evict_bytes(256 * 1024 * 1024, [ram, ram_file]).
%% Synchronous full eviction pass.
erllama_cache:gc().
10. Memory-pressure-driven eviction (in sys.config)
{erllama, [
{scheduler, #{
enabled => true,
pressure_source => system, %% memsup-backed, portable
interval_ms => 5000,
high_watermark => 0.85,
low_watermark => 0.75,
evict_tiers => [ram, ram_file] %% disk fills to its own quota
}}
]}.
Sources shipped: noop, system, nvidia_smi, {module, M}. Roll
your own with -behaviour(erllama_pressure) and pass
{module, M} as the source.
11. Cache-only tests (no model required)
The cache subsystem is independently usable; EUnit tests that exercise save/load round-trips never touch llama.cpp:
%% test/my_cache_test.erl
-module(my_cache_test).
-include_lib("eunit/include/eunit.hrl").
-include_lib("erllama/include/erllama_cache.hrl").
with_disk(Body) ->
{ok, _} = erllama_cache_meta_srv:start_link(),
{ok, _} = erllama_cache_ram:start_link(),
ok = filelib:ensure_path("/tmp/my_cache_test"), %% create if missing
{ok, _} = erllama_cache_disk_srv:start_link(t, "/tmp/my_cache_test"),
try Body() after
catch gen_server:stop(t),
catch gen_server:stop(erllama_cache_ram),
catch gen_server:stop(erllama_cache_meta_srv)
end.
round_trip_test() ->
with_disk(fun() ->
Meta = #{
save_reason => cold,
quant_bits => 16,
fingerprint => binary:copy(<<16#AA>>, 32),
fingerprint_mode => safe,
quant_type => f16,
ctx_params_hash => binary:copy(<<16#BB>>, 32),
tokens => [1, 2, 3],
context_size => 4096
},
{ok, Key, _, _} = erllama_cache_disk_srv:save(t, Meta, <<"data">>),
?assertMatch({ok, _Info, <<"data">>},
erllama_cache_disk_srv:load(t, Key))
end).
The lazy llama_backend_init means cache-only tests never trigger
ggml_backend_load_all — no Metal/CUDA discovery cost.
12. End-to-end against a real GGUF
LLAMA_TEST_MODEL=/path/to/tinyllama-1.1b-chat.Q4_K_M.gguf \
rebar3 ct --suite=test/erllama_real_model_SUITE
The 6-case suite covers cold prefill, warm restore, multi-turn
parent-key resume, longest-prefix walk, eviction, and a multi-model
concurrent run. Without the env var the suite skips itself, so the default
rebar3 ct run stays green.
13. Microbench: cold vs. warm
bench/run.sh tiny # TinyLlama 1.1B Q4_K_M
bench/run.sh large # LLaMA-3 8B Q4_K_M (needs the file)
bench/run.sh drives a cold_vs_warm matrix plus a 4-agent
shared-prefix scenario; see bench/README.md.
See also
- Loading a model — every option to erllama:load_model/1,2, with examples and pitfalls.
- Caching — tiers, save reasons, lookup paths, watermarks.
- Configuration — full sys.config reference.
- Building — platform-specific build notes.