TiktokenEx.Encoding (TiktokenEx v0.1.0)

View Source

A tiktoken-style encoding: regex-based splitting, byte-pair encoding, and special-token handling.

When special tokens overlap (one is a prefix of another), pass special_token_matching: :longest to make matching deterministic. The default, :parity, leaves the matching order among overlapping special tokens unspecified, which is closer to the behavior of upstream tiktoken.

Summary

Types

t()

@type t() :: %TiktokenEx.Encoding{
  decoder: %{required(token_id()) => binary()},
  mergeable_ranks: %{required(binary()) => token_id()},
  pat_regex: Regex.t(),
  pat_str: String.t(),
  special_regex: Regex.t() | nil,
  special_token_matching: :parity | :longest,
  special_tokens: %{required(String.t()) => token_id()},
  special_tokens_by_id: %{required(token_id()) => String.t()}
}

token_id()

@type token_id() :: non_neg_integer()

Functions

decode(encoding, ids)

@spec decode(t(), [token_id()]) :: {:ok, String.t()} | {:error, term()}

encode(encoding, text, opts \\ [])

@spec encode(t(), String.t(), keyword()) :: {:ok, [token_id()]} | {:error, term()}

new(opts)

@spec new(keyword()) :: {:ok, t()} | {:error, term()}