HuggingfaceClient.Hub.DatasetViewer (huggingface_client v0.1.0)

Copy Markdown View Source

HuggingFace Dataset Viewer API.

Provides programmatic access to dataset content, statistics, and structure without downloading entire datasets locally.

See: https://huggingface.co/docs/dataset-viewer

Example

# Fetch dataset info (size, number of rows, features schema, etc.)
{:ok, info} = HuggingfaceClient.dataset_viewer_info("rajpurkar/squad")

# Get the first 10 rows of a dataset split
{:ok, rows} = HuggingfaceClient.dataset_viewer_rows("rajpurkar/squad",
  split: "train",
  offset: 0,
  length: 10
)

# Get dataset statistics
{:ok, stats} = HuggingfaceClient.dataset_viewer_statistics("rajpurkar/squad",
  config: "plain_text",
  split: "train"
)

Summary

Functions

Returns column index/statistics for a dataset split.

Returns the feature schema (column types) for a dataset split.

Returns the first 100 rows of a dataset split (as a preview).

Returns dataset info (size, number of rows, features schema, etc.).

Returns the list of configurations (subsets) for a dataset.

Returns the list of splits for a given dataset config.

Returns the Parquet file URLs for a dataset (if available in Parquet format).

Returns rows from a dataset split with pagination.

Performs a search across all rows in a split.

Returns dataset size (rows, bytes) per split.

Returns descriptive statistics for each column in a dataset split.

Checks whether a dataset is available in the Dataset Viewer.

Functions

column_index(dataset_id, opts \\ [])

@spec column_index(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns column index/statistics for a dataset split.

Example

{:ok, index} = HuggingfaceClient.dataset_viewer_column_index("rajpurkar/squad",
  config: "plain_text",
  split: "train"
)

features(dataset_id, opts \\ [])

@spec features(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns the feature schema (column types) for a dataset split.

Example

{:ok, features} = HuggingfaceClient.dataset_viewer_features("rajpurkar/squad",
  config: "plain_text",
  split: "train"
)

first_rows(dataset_id, opts \\ [])

@spec first_rows(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns the first 100 rows of a dataset split (as a preview).

Options

  • :config — configuration/subset name
  • :split — split name (e.g. "train", "test")
  • :token — HF API token for private datasets

Example

{:ok, preview} = HuggingfaceClient.dataset_viewer_first_rows("rajpurkar/squad",
  config: "plain_text",
  split: "train"
)
Enum.each(preview["rows"], fn r -> IO.inspect(r["row"]) end)

info(dataset_id, opts \\ [])

@spec info(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns dataset info (size, number of rows, features schema, etc.).

Example

{:ok, info} = HuggingfaceClient.dataset_viewer_info("rajpurkar/squad")
IO.inspect(info["dataset_info"])

list_configs(dataset_id, opts \\ [])

@spec list_configs(
  String.t(),
  keyword()
) :: {:ok, [map()]} | {:error, Exception.t()}

Returns the list of configurations (subsets) for a dataset.

Example

{:ok, configs} = HuggingfaceClient.dataset_viewer_configs("glue")
Enum.each(configs, fn c -> IO.puts(c["config_name"]) end)

list_splits(dataset_id, opts \\ [])

@spec list_splits(
  String.t(),
  keyword()
) :: {:ok, [map()]} | {:error, Exception.t()}

Returns the list of splits for a given dataset config.

Example

{:ok, splits} = HuggingfaceClient.dataset_viewer_splits("rajpurkar/squad",
  config: "plain_text"
)
Enum.each(splits, fn s -> IO.puts(s["split"]) end)

parquet(dataset_id, opts \\ [])

@spec parquet(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns the Parquet file URLs for a dataset (if available in Parquet format).

Example

{:ok, result} = HuggingfaceClient.dataset_viewer_parquet("rajpurkar/squad")
result["parquet_files"] |> Enum.each(fn f ->
  IO.puts("#{f["split"]}: #{f["url"]}")
end)

rows(dataset_id, opts \\ [])

@spec rows(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns rows from a dataset split with pagination.

Options

  • :config — configuration name
  • :split — split name (required)
  • :offset — row offset (default: 0)
  • :length — number of rows to return, max 100 (default: 100)
  • :access_token — HF API token for private datasets

Example

{:ok, result} = HuggingfaceClient.dataset_viewer_rows("rajpurkar/squad",
  config: "plain_text",
  split: "train",
  offset: 0,
  length: 10
)

IO.puts("Total rows: #{result["num_rows_total"]}")
Enum.each(result["rows"], fn r -> IO.inspect(r["row"]) end)

search(dataset_id, opts \\ [])

@spec search(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Performs a search across all rows in a split.

Options

  • :query — search query string (required)
  • :config — configuration name
  • :split — split name (required)
  • :offset — result offset
  • :length — number of results, max 100

Example

{:ok, results} = HuggingfaceClient.dataset_viewer_search("rajpurkar/squad",
  query: "Albert Einstein",
  config: "plain_text",
  split: "train"
)

size(dataset_id, opts \\ [])

@spec size(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns dataset size (rows, bytes) per split.

Example

{:ok, size} = HuggingfaceClient.dataset_viewer_size("rajpurkar/squad")
IO.inspect(size["size"])

statistics(dataset_id, opts \\ [])

@spec statistics(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Returns descriptive statistics for each column in a dataset split.

Example

{:ok, stats} = HuggingfaceClient.dataset_viewer_statistics("rajpurkar/squad",
  config: "plain_text",
  split: "train"
)

stats["statistics"]
|> Enum.each(fn col ->
  IO.puts("#{col["column_name"]}: type=#{col["column_type"]}")
end)

valid?(dataset_id, opts \\ [])

@spec valid?(
  String.t(),
  keyword()
) :: {:ok, map()} | {:error, Exception.t()}

Checks whether a dataset is available in the Dataset Viewer.

Returns {:ok, %{"preview" => bool, "viewer" => bool}} or an error.

Example

{:ok, info} = HuggingfaceClient.dataset_viewer_valid?("rajpurkar/squad")