HuggingfaceClient.Hub.Evaluate (huggingface_client v0.1.0)


HuggingFace Evaluate — metrics computation API.

Evaluate provides standardized metrics for NLP, vision, and other ML tasks. Metrics are hosted on the Hub and can be loaded by name.

See: https://huggingface.co/docs/evaluate

Available metric categories

  • Text: BLEU, ROUGE, BERTScore, SacreBLEU, WER, CER, METEOR, TER
  • Classification: Accuracy, F1, Precision, Recall, AUC, MCC
  • Regression: MSE, MAE, RMSE, R²
  • Code: CodeBLEU, pass@k
  • Translation: chrF, chrF++, TER

Example

# Compute BLEU score
{:ok, result} = HuggingfaceClient.compute_metric("bleu",
  predictions: ["the cat is on the mat"],
  references: [["the cat sat on the mat"]]
)
IO.puts("BLEU: #{result["bleu"]}")

# Compute accuracy
{:ok, result} = HuggingfaceClient.compute_metric("accuracy",
  predictions: [0, 1, 0, 1],
  references: [0, 1, 1, 1]
)
IO.puts("Accuracy: #{result["accuracy"]}")

# Multiple metrics at once
{:ok, results} = HuggingfaceClient.evaluate_model(
  model: "bert-base-uncased",
  dataset: "glue",
  subset: "mrpc",
  split: "validation",
  metrics: ["accuracy", "f1"],
  access_token: "hf_..."
)

Summary

Functions

compute(metric, opts)
Computes a metric given predictions and references.

compute_multiple(opts)
Computes multiple metrics at once.

evaluate_model(opts)
Evaluates a model on a dataset using specified metrics.

list_metrics(opts \\ [])
Lists all available metrics on the Hub.

metric_info(metric_name, opts \\ [])
Gets detailed information about a specific metric.

model_benchmarks(model_id, opts \\ [])
Gets a list of standard benchmark results for a model.

Functions

compute(metric, opts)

@spec compute(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Computes a metric given predictions and references.

This calls the HuggingFace Evaluate API to compute the metric server-side.

The metric name (e.g. "bleu", "rouge", "accuracy") is passed as the first argument.

Options

  • :predictions — list of model predictions (required)
  • :references — list of ground truth references (required)
  • :kwargs — additional metric-specific parameters
  • :access_token

Examples

# ROUGE score
{:ok, result} = HuggingfaceClient.compute_metric("rouge",
  predictions: ["The cat sat on the mat"],
  references: ["The cat is on the mat"]
)
IO.inspect(result["rouge1"])

# BLEU
{:ok, result} = HuggingfaceClient.compute_metric("bleu",
  predictions: ["hello world"],
  references: [["hello world", "hi world"]]
)

# WER (word error rate) for ASR
{:ok, result} = HuggingfaceClient.compute_metric("wer",
  predictions: ["it is raining"],
  references: ["it is raining"]
)

# F1 for classification
{:ok, result} = HuggingfaceClient.compute_metric("f1",
  predictions: [0, 1, 0, 1],
  references: [0, 1, 1, 1],
  kwargs: %{"average" => "macro"}
)
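
Because the return value is {:ok, map()} or {:error, Exception.t()}, results are usually handled with a case. A minimal sketch; the exact contents of the error struct are not specified here, so only Exception.message/1 is assumed:

case HuggingfaceClient.compute_metric("accuracy",
       predictions: [0, 1, 0, 1],
       references: [0, 1, 1, 1]
     ) do
  {:ok, result} ->
    # result is a map keyed by the metric name
    IO.puts("Accuracy: #{result["accuracy"]}")

  {:error, error} ->
    # Exception.message/1 works for any exception struct
    IO.puts("Metric computation failed: #{Exception.message(error)}")
end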

compute_multiple(opts)

@spec compute_multiple(keyword()) :: {:ok, map()} | {:error, Exception.t()}

Computes multiple metrics at once.

Options

  • :metrics — list of metric names (required)
  • :predictions — list of model predictions (required)
  • :references — list of ground truth references (required)
  • :access_token

Example

{:ok, results} = HuggingfaceClient.compute_metrics(
  metrics: ["rouge", "bleu"],
  predictions: ["The cat sat"],
  references: ["The cat is on the mat"]
)
IO.inspect(results)
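
The exact shape of the combined map is not documented here; assuming it is keyed by metric name, the entries can be printed generically:

# Each key is assumed to be a metric name; values may themselves be maps
Enum.each(results, fn {metric, value} ->
  IO.puts("#{metric}: #{inspect(value)}")
end)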

evaluate_model(opts)

@spec evaluate_model(keyword()) :: {:ok, map()} | {:error, Exception.t()}

Evaluates a model on a dataset using specified metrics.

Runs inference and metric computation server-side via the HuggingFace API.

Options

  • :model — HF model ID (required)
  • :dataset — HF dataset ID (required)
  • :subset — dataset configuration/subset
  • :split — dataset split (default: "test")
  • :metrics — list of metric names (required)
  • :task — task type (e.g. "text-classification")
  • :access_token

Example

{:ok, results} = HuggingfaceClient.evaluate_model(
  model: "distilbert-base-uncased-finetuned-sst-2-english",
  dataset: "glue",
  subset: "sst2",
  split: "validation",
  metrics: ["accuracy", "f1"],
  task: "text-classification",
  access_token: "hf_..."
)
IO.puts("Accuracy: #{results["accuracy"]}")

list_metrics(opts \\ [])

@spec list_metrics(keyword()) :: {:ok, [map()]} | {:error, Exception.t()}

Lists all available metrics on the Hub.

Options

  • :search — filter by name
  • :type — filter by type: "metric", "comparison", "measurement"
  • :access_token

Example

{:ok, metrics} = HuggingfaceClient.list_metrics()
Enum.each(metrics, fn m -> IO.puts("#{m["id"]}: #{m["description"]}") end)
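
The :search and :type options can narrow the listing, for example to ROUGE-related metrics only:

# Filter the listing by name and restrict to plain metrics
{:ok, rouge_metrics} = HuggingfaceClient.list_metrics(search: "rouge", type: "metric")
Enum.each(rouge_metrics, fn m -> IO.puts(m["id"]) end)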

metric_info(metric_name, opts \\ [])

@spec metric_info(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Gets detailed information about a specific metric.

Example

{:ok, info} = HuggingfaceClient.metric_info("bleu")
IO.puts("Description: #{info["description"]}")
IO.puts("Reference: #{info["reference_urls"]}")

model_benchmarks(model_id, opts \\ [])

@spec model_benchmarks(
  String.t(),
  keyword()
) :: {:ok, [map()]} | {:error, Exception.t()}

Gets a list of standard benchmark results for a model.

Example

{:ok, benchmarks} = HuggingfaceClient.model_benchmarks("meta-llama/Llama-3.1-8B-Instruct")
Enum.each(benchmarks, fn b ->
  IO.puts("#{b["benchmark"]}: #{b["score"]}")
end)
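
To compare models side by side, the benchmark lists can be collected into a map keyed by model ID. A sketch; the second model ID is only an illustrative assumption, and the "benchmark"/"score" fields follow the example above:

models = ["meta-llama/Llama-3.1-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.3"]

comparison =
  Map.new(models, fn model ->
    {:ok, benchmarks} = HuggingfaceClient.model_benchmarks(model)
    # Turn the list of benchmark entries into a benchmark => score map
    scores = Map.new(benchmarks, fn b -> {b["benchmark"], b["score"]} end)
    {model, scores}
  end)

IO.inspect(comparison)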