HuggingfaceClient.Inference.TEI (huggingface_client v0.1.0)


Client for HuggingFace Text Embeddings Inference (TEI) servers.

TEI is a high-performance toolkit for deploying embedding and reranking models. It powers the feature-extraction and sentence-similarity tasks at scale.

TEI supports:

  • Dense embeddings — sentence-transformers, E5, BGE, Nomic
  • Sparse embeddings — SPLADE models for hybrid search
  • Reranking — CrossEncoder reranking for RAG pipelines
  • Classification — text classification with embedding models

See: https://huggingface.co/docs/text-embeddings-inference

Quick start

# Local TEI server
tei = HuggingfaceClient.tei("http://localhost:8080")

# Embed text
{:ok, embedding} = HuggingfaceClient.tei_embed(tei, "Hello world")
# [0.021, -0.134, ...]  # 768-dim vector

# Batch embed
{:ok, embeddings} = HuggingfaceClient.tei_embed_batch(tei,
  ["Hello", "World", "Foo bar"])

# Rerank documents for RAG
{:ok, ranked} = HuggingfaceClient.tei_rerank(tei,
  query: "What is deep learning?",
  texts: ["Deep learning is...", "Machine learning is...", "Python is..."]
)

# Compute similarity
{:ok, score} = HuggingfaceClient.tei_similarity(tei,
  source: "I love cats",
  targets: ["I adore cats", "Dogs are great", "The weather is nice"]
)

Summary

Functions

classify(tei, opts)
Classifies text using an embedding model (via the /classify endpoint).

embed(tei, text, opts \\ [])
Generates a dense embedding vector for a single text.

embed_batch(tei, texts, opts \\ [])
Generates dense embeddings for a batch of texts.

embed_sparse(tei, text, opts \\ [])
Generates sparse embeddings (SPLADE-style) for hybrid search.

embed_sparse_batch(tei, texts, opts \\ [])
Generates sparse embeddings for a batch of texts.

health(tei)
Health check — returns :ok if the server is healthy.

info(tei)
Gets server info (model ID, max batch size, max sequence length, etc.).

new(base_url, opts \\ [])
Creates a new TEI client.

rerank(tei, opts)
Reranks a list of documents for a query (CrossEncoder-style).

similarity(tei, opts)
Computes cosine similarity between a source text and one or more target texts.

tokenize(tei, text, opts \\ [])
Tokenizes text and returns token count and IDs.

Types

t()

@type t() :: %HuggingfaceClient.Inference.TEI{
  base_url: String.t(),
  timeout: pos_integer(),
  token: String.t() | nil
}

Functions

classify(tei, opts)

@spec classify(
  t(),
  keyword()
) :: {:ok, [map()]} | {:error, Exception.t()}

Classifies text using an embedding model (via the /classify endpoint).

Options

  • :text / :inputs — text to classify (required)
  • :raw_scores — return raw logit scores (default: false)

Example

{:ok, result} = HuggingfaceClient.Inference.TEI.classify(tei,
  text: "This movie was absolutely terrible"
)
IO.inspect(result)
# [%{"label" => "NEGATIVE", "score" => 0.98}, %{"label" => "POSITIVE", "score" => 0.02}]

embed(tei, text, opts \\ [])

@spec embed(t(), String.t(), keyword()) :: {:ok, [float()]} | {:error, Exception.t()}

Generates a dense embedding vector for a single text.

Returns a list of floats representing the embedding.

Options

  • :normalize — normalize the output vector (default: true)
  • :prompt_name — named prompt to prepend (e.g. "query", "passage")
  • :truncate — truncate to max length instead of erroring (default: true)

Example

{:ok, embedding} = HuggingfaceClient.Inference.TEI.embed(tei,
  "The quick brown fox jumps over the lazy dog"
)
IO.puts("Embedding dims: #{length(embedding)}")
# 768

# Use named prompts (E5 models)
{:ok, query_emb} = HuggingfaceClient.Inference.TEI.embed(tei,
  "What is deep learning?",
  prompt_name: "query"
)
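
The :normalize and :truncate options listed above can be set explicitly; a sketch requesting a raw (unnormalized) vector while keeping truncation on:

{:ok, raw_emb} = HuggingfaceClient.Inference.TEI.embed(tei,
  "A very long document ...",
  normalize: false,
  truncate: true
)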

embed_batch(tei, texts, opts \\ [])

@spec embed_batch(t(), [String.t()], keyword()) ::
  {:ok, [[float()]]} | {:error, Exception.t()}

Generates dense embeddings for a batch of texts.

Returns a list of embedding vectors (one per input text).

Options

  • :normalize — normalize output vectors (default: true)
  • :prompt_name — named prompt for E5/BGE-style models
  • :truncate — truncate long inputs (default: true)

Example

texts = ["Hello world", "How are you?", "Deep learning is great"]
{:ok, embeddings} = HuggingfaceClient.Inference.TEI.embed_batch(tei, texts)
IO.puts("Got #{length(embeddings)} embeddings of #{length(hd(embeddings))} dims")

embed_sparse(tei, text, opts \\ [])

@spec embed_sparse(t(), String.t(), keyword()) ::
  {:ok, [%{required(String.t()) => term()}]} | {:error, Exception.t()}

Generates sparse embeddings (SPLADE-style) for hybrid search.

Returns a list of maps with "index" and "value" keys, one per nonzero dimension of the sparse vector.

Example

{:ok, sparse} = HuggingfaceClient.Inference.TEI.embed_sparse(tei, "deep learning")
Enum.each(sparse, fn %{"index" => idx, "value" => val} ->
  IO.puts("dim #{idx}: #{val}")
end)
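
For hybrid-search indexing, those pairs can be collapsed into a map keyed by dimension; a minimal sketch based on the shape above:

sparse_map = Map.new(sparse, fn %{"index" => idx, "value" => val} -> {idx, val} end)
# %{index => value, ...}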

embed_sparse_batch(tei, texts, opts \\ [])

@spec embed_sparse_batch(t(), [String.t()], keyword()) ::
  {:ok, [[%{required(String.t()) => term()}]]} | {:error, Exception.t()}

Generates sparse embeddings for a batch of texts.
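
A minimal sketch mirroring embed_sparse/3; each input text yields its own list of index/value maps:

{:ok, sparse_vecs} = HuggingfaceClient.Inference.TEI.embed_sparse_batch(tei,
  ["deep learning", "hybrid search"])
# [[%{"index" => _, "value" => _}, ...], [...]]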

health(tei)

@spec health(t()) :: :ok | {:error, Exception.t()}

Health check — returns :ok if the server is healthy.
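
A minimal usage sketch, e.g. as a startup liveness probe:

case HuggingfaceClient.Inference.TEI.health(tei) do
  :ok -> IO.puts("TEI server is up")
  {:error, error} -> IO.puts("TEI unreachable: #{Exception.message(error)}")
end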

info(tei)

@spec info(t()) :: {:ok, map()} | {:error, Exception.t()}

Gets server info (model ID, max batch size, max sequence length, etc.).

Example

{:ok, info} = HuggingfaceClient.Inference.TEI.info(tei)
IO.puts("Model: #{info["model_id"]}")
IO.puts("Max seq length: #{info["max_input_length"]}")
IO.puts("Max batch: #{info["max_batch_tokens"]}")

new(base_url, opts \\ [])

@spec new(
  String.t(),
  keyword()
) :: t()

Creates a new TEI client.

Parameters

  • base_url — URL of the TEI server (e.g. "http://localhost:8080")

Options

  • :token — Bearer token for authenticated endpoints
  • :timeout — request timeout in ms (default: 30_000)

Example

# Direct construction
tei = HuggingfaceClient.Inference.TEI.new("http://localhost:8080")

# Via the HuggingfaceClient.tei convenience, with a token for
# authenticated endpoints
tei = HuggingfaceClient.tei(
  "https://my-embedding-endpoint.hf.space",
  token: "hf_..."
)

rerank(tei, opts)

@spec rerank(
  t(),
  keyword()
) :: {:ok, [%{required(String.t()) => term()}]} | {:error, Exception.t()}

Reranks a list of documents for a query (CrossEncoder-style).

Used in RAG pipelines to re-rank retrieval results by relevance. Returns documents sorted by relevance score (highest first).

Options

  • :query — the search query (required)
  • :texts — list of documents to rerank (required)
  • :return_text — include original text in results (default: false)
  • :truncate — truncate long inputs (default: true)
  • :raw_scores — return raw logit scores (default: false)

Example

{:ok, results} = HuggingfaceClient.Inference.TEI.rerank(tei,
  query: "What is the capital of France?",
  texts: [
    "Paris is the capital of France.",
    "London is the capital of England.",
    "The Eiffel Tower is in Paris."
  ],
  return_text: true
)

# Results sorted by score, highest first
Enum.each(results, fn r ->
  IO.puts("Score: #{r["score"]} | #{r["text"]}")
end)
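
In a RAG pipeline, the top results typically feed the prompt context; a sketch assuming return_text: true as above:

context =
  results
  |> Enum.take(2)
  |> Enum.map_join("\n", & &1["text"])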

similarity(tei, opts)

@spec similarity(
  t(),
  keyword()
) :: {:ok, [float()]} | {:error, Exception.t()}

Computes cosine similarity between a source text and one or more target texts.

Returns similarity scores in the range [-1, 1] (higher = more similar).

Options

  • :source — source text (required)
  • :targets — list of target texts to compare against (required)

Example

{:ok, scores} = HuggingfaceClient.Inference.TEI.similarity(tei,
  source: "I love machine learning",
  targets: [
    "I enjoy deep learning",
    "The weather is nice today",
    "Neural networks are fascinating"
  ]
)
# [0.89, 0.12, 0.76]
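
Because embed/3 normalizes by default, the same scores can be reproduced as dot products of dense embeddings; a sketch built on embed/3 and embed_batch/3 from this module:

{:ok, src} = HuggingfaceClient.Inference.TEI.embed(tei, "I love machine learning")
{:ok, targets} = HuggingfaceClient.Inference.TEI.embed_batch(tei,
  ["I enjoy deep learning", "The weather is nice today"])

# Cosine similarity of unit vectors reduces to a dot product
dot = fn a, b ->
  Enum.zip(a, b) |> Enum.reduce(0.0, fn {x, y}, acc -> acc + x * y end)
end

scores = Enum.map(targets, &dot.(src, &1))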

tokenize(tei, text, opts \\ [])

@spec tokenize(t(), String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}

Tokenizes text and returns token count and IDs.

Example

{:ok, tokens} = HuggingfaceClient.Inference.TEI.tokenize(tei, "Hello world!")
IO.inspect(tokens)
# Map containing the token count and IDs (exact keys depend on the model/server)