viva_tensor

High-performance tensor operations for Gleam on the BEAM.

This module is the stable entry point for the package. It re-exports the tensor type, common constructors, shape operations, linear algebra, element-wise math, reductions, neural-network helpers, and TFLOPS measurement utilities.

Lower-level implementation, backend, neural-network, quantization, sparse, telemetry, and benchmark modules are intentionally excluded from the public documentation until their contracts are stable. Prefer this module as the public API surface.

import gleam/result
import viva_tensor as t

let a = t.zeros([2, 3])
let b = t.ones([2, 3])
use c <- result.try(t.add(a, b))
c

Types

Result storage selected by the RTX-first planner.

pub type AcceleratedTensor =
  @internal AcceleratedTensor

Backend selected by the RTX-first planner.

pub type AccelerationBackend =
  @internal AccelerationBackend

Configuration for two-dimensional convolution operations.

pub type Conv2dConfig =
  @internal Conv2dConfig

Workspace for persistent GPU buffers.

pub type GpuWorkspace =
  @internal GpuWorkspace

Persisted linear layer parameters.

pub type LinearLayer =
  @internal LinearLayer

Opaque reference to a tensor stored in native NIF memory.

pub type NativeTensorRef =
  @internal NativeTensorRef

A tensor value backed by dense, strided, or native storage.

pub type Tensor =
  @internal Tensor

Error returned by fallible tensor constructors and operations.

pub type TensorError =
  @internal TensorError

Backend used when measuring matrix-multiplication throughput.

pub type TflopsBackend =
  @internal Backend

Result returned by TFLOPS measurement helpers.

pub type TflopsResult =
  @internal TflopsResult

Values

pub fn accelerated_backend(
  t: AcceleratedTensor,
) -> AccelerationBackend

Inspect which backend was selected by matmul_auto.

pub fn accelerated_shape(t: AcceleratedTensor) -> List(Int)

Shape of an accelerated tensor without forcing a download.

pub fn accelerated_sync() -> Result(Nil, TensorError)

Wait for queued CUDA work to complete.

pub fn accelerated_to_tensor(
  t: AcceleratedTensor,
) -> Result(Tensor, TensorError)

Download an accelerated tensor back to a regular CPU tensor.

pub fn add(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise addition

pub fn add_broadcast(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Add with broadcasting

pub fn add_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a + b into a preallocated native tensor.

pub fn argmax(t: Tensor) -> Int

Index of maximum value

pub fn argmin(t: Tensor) -> Int

Index of minimum value

pub fn avg_pool2d(
  input: Tensor,
  pool_h: Int,
  pool_w: Int,
  stride_h: Int,
  stride_w: Int,
) -> Result(Tensor, TensorError)

Average pooling 2D

pub fn broadcast_to(
  t: Tensor,
  target_shape: List(Int),
) -> Result(Tensor, TensorError)

Broadcast tensor to a target shape.

pub fn can_broadcast(a: List(Int), b: List(Int)) -> Bool

Can these shapes broadcast together?

pub fn clamp(t: Tensor, min_val: Float, max_val: Float) -> Tensor

Clamp values

pub fn conv2d(
  input: Tensor,
  kernel: Tensor,
  config: Conv2dConfig,
) -> Result(Tensor, TensorError)

2D Convolution

pub fn conv2d_config() -> Conv2dConfig

Default conv2d config (3x3 kernel, stride 1, no padding)

pub fn conv2d_same(kernel_h: Int, kernel_w: Int) -> Conv2dConfig

Conv2d config with “same” padding

pub fn detect_backends() -> List(TflopsBackend)

Detect available compute backends

pub fn div(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise division

pub fn dot(a: Tensor, b: Tensor) -> Result(Float, TensorError)

Dot product (vectors only)

pub fn fill(shape: List(Int), value: Float) -> Tensor

Create tensor filled with value

pub fn flatten(t: Tensor) -> Tensor

Flatten to 1D

pub fn from_list(data: List(Float)) -> Tensor

Create tensor from list (1D)

pub fn from_list2d(
  rows: List(List(Float)),
) -> Result(Tensor, TensorError)

Create 2D tensor from list of lists

pub fn from_native_ref(
  ref: NativeTensorRef,
  shape: List(Int),
) -> Tensor

Wrap an existing native NIF tensor resource.

pub fn global_avg_pool2d(
  input: Tensor,
) -> Result(Tensor, TensorError)

Global average pooling

pub fn gpu_workspace() -> Result(GpuWorkspace, TensorError)

Create an RTX 4090 FP16 workspace.

pub fn he_init(fan_in: Int, fan_out: Int) -> Tensor

He initialization (for ReLU networks)

pub fn is_contiguous(t: Tensor) -> Bool

Check if contiguous

pub fn is_native(t: Tensor) -> Bool

Check whether a tensor is backed by native NIF memory.

pub fn linear_gelu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
  bias: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = gelu(a @ b + bias) using the FP16 Tensor Core fused epilogue.

pub fn linear_gelu_forward_into(
  out: AcceleratedTensor,
  input: AcceleratedTensor,
  layer: LinearLayer,
) -> Result(Nil, TensorError)

Run out = gelu(input @ layer.weight + layer.bias).

pub fn linear_layer(
  workspace: GpuWorkspace,
  weight: Tensor,
  bias: Tensor,
) -> Result(LinearLayer, TensorError)

Create a persisted linear layer in workspace memory.

pub fn linear_layer_backend(
  layer: LinearLayer,
) -> AccelerationBackend

Linear layer backend.

pub fn linear_layer_fp16(
  weight: Tensor,
  bias: Tensor,
) -> Result(LinearLayer, TensorError)

Create a persisted FP16 linear layer on the RTX.

pub fn linear_layer_input_features(layer: LinearLayer) -> Int

Linear layer input feature count.

pub fn linear_layer_output_features(layer: LinearLayer) -> Int

Linear layer output feature count.

pub fn linear_output(
  workspace: GpuWorkspace,
  layer: LinearLayer,
  batch_size: Int,
) -> Result(AcceleratedTensor, TensorError)

Allocate a reusable output buffer for a persisted linear layer.

pub fn linear_relu(
  a: Tensor,
  b: Tensor,
  bias: Tensor,
) -> Result(Tensor, TensorError)

Fused linear layer with ReLU: max(0, a @ b + bias).

pub fn linear_relu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
  bias: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = relu(a @ b + bias) using the FP16 Tensor Core fused epilogue.

pub fn linear_relu_forward_into(
  out: AcceleratedTensor,
  input: AcceleratedTensor,
  layer: LinearLayer,
) -> Result(Nil, TensorError)

Run out = relu(input @ layer.weight + layer.bias).

pub fn linear_relu_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
  bias: Tensor,
) -> Result(Nil, TensorError)

Write out = max(0, a @ b + bias) into a preallocated native tensor.

pub fn map(t: Tensor, f: fn(Float) -> Float) -> Tensor

Apply function to each element

pub fn map2(
  a: Tensor,
  b: Tensor,
  f: fn(Float, Float) -> Float,
) -> Result(Tensor, TensorError)

Apply a binary function element-wise over tensors with the same shape.

pub fn matmul(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Matrix-matrix multiplication

pub fn matmul_accelerated(
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(AcceleratedTensor, TensorError)

Matrix multiplication between persistent accelerated tensors.

pub fn matmul_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = a @ b into a persistent accelerated output buffer.

pub fn matmul_auto(
  a: Tensor,
  b: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Matrix multiplication with priority: RTX 4090 first, then MKL/native CPU.

pub fn matmul_gelu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = gelu(a @ b) using the FP16 Tensor Core fused epilogue.

pub fn matmul_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a @ b into a preallocated native tensor.

pub fn matmul_relu_accelerated_into(
  out: AcceleratedTensor,
  a: AcceleratedTensor,
  b: AcceleratedTensor,
) -> Result(Nil, TensorError)

Write out = relu(a @ b) using the FP16 Tensor Core fused epilogue.

pub fn matmul_vec(
  mat: Tensor,
  vec: Tensor,
) -> Result(Tensor, TensorError)

Matrix-vector multiplication

pub fn matrix(
  rows: Int,
  cols: Int,
  data: List(Float),
) -> Result(Tensor, TensorError)

Create matrix (2D tensor)

pub fn max(t: Tensor) -> Float

Maximum value

pub fn max_pool2d(
  input: Tensor,
  pool_h: Int,
  pool_w: Int,
  stride_h: Int,
  stride_w: Int,
) -> Result(Tensor, TensorError)

Max pooling 2D

pub fn mean(t: Tensor) -> Float

Mean of all elements

pub fn measure_tflops(
  backend: TflopsBackend,
  m: Int,
  n: Int,
  k: Int,
) -> TflopsResult

Measure TFLOPS for a single matmul operation

pub fn measure_tflops_averaged(
  backend: TflopsBackend,
  m: Int,
  n: Int,
  k: Int,
  iterations: Int,
) -> TflopsResult

Measure averaged TFLOPS (warmup + iterations)

pub fn min(t: Tensor) -> Float

Minimum value

pub fn mul(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise multiplication

pub fn mul_broadcast(
  a: Tensor,
  b: Tensor,
) -> Result(Tensor, TensorError)

Multiply with broadcasting

pub fn mul_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a * b into a preallocated native tensor.

pub fn native_fill(
  shape: List(Int),
  value: Float,
) -> Result(Tensor, TensorError)

Create a native-backed tensor filled with a value.

pub fn native_from_list(
  data: List(Float),
  shape: List(Int),
) -> Result(Tensor, TensorError)

Create a native-backed tensor from row-major list data.

pub fn native_ones(
  shape: List(Int),
) -> Result(Tensor, TensorError)

Create a native-backed tensor of ones.

pub fn native_ref(t: Tensor) -> Result(NativeTensorRef, Nil)

Extract the native NIF tensor resource when present.

pub fn native_zeros(
  shape: List(Int),
) -> Result(Tensor, TensorError)

Create a native-backed tensor of zeros.

pub fn norm(t: Tensor) -> Float

L2 norm (Euclidean length)

pub fn normalize(t: Tensor) -> Tensor

Normalize to unit length

pub fn ones(shape: List(Int)) -> Tensor

Create tensor of ones

pub fn outer(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Outer product

pub fn pad2d(
  t: Tensor,
  pad_h: Int,
  pad_w: Int,
) -> Result(Tensor, TensorError)

Pad 2D tensor with zeros

pub fn pad4d(
  t: Tensor,
  pad_h: Int,
  pad_w: Int,
) -> Result(Tensor, TensorError)

Pad 4D tensor with zeros

pub fn random_normal(
  shape: List(Int),
  mean: Float,
  std: Float,
) -> Tensor

Tensor with normal random values

pub fn random_uniform(shape: List(Int)) -> Tensor

Tensor with uniform random values in [0, 1)

pub fn rank(t: Tensor) -> Int

Get rank (number of dimensions)

pub fn reshape(
  t: Tensor,
  new_shape: List(Int),
) -> Result(Tensor, TensorError)

Reshape (total size must match)

pub fn scale(t: Tensor, s: Float) -> Tensor

Scale by constant

pub fn scale_into(
  out: Tensor,
  a: Tensor,
  scalar: Float,
) -> Result(Nil, TensorError)

Write out = a * scalar into a preallocated native tensor.

pub fn shape(t: Tensor) -> List(Int)

Shape as list of dimensions

pub fn size(t: Tensor) -> Int

Total number of elements

pub fn squeeze(t: Tensor) -> Tensor

Remove dimensions of size 1

pub fn std(t: Tensor) -> Float

Standard deviation

pub fn sub(a: Tensor, b: Tensor) -> Result(Tensor, TensorError)

Element-wise subtraction

pub fn sub_into(
  out: Tensor,
  a: Tensor,
  b: Tensor,
) -> Result(Nil, TensorError)

Write out = a - b into a preallocated native tensor.

pub fn sum(t: Tensor) -> Float

Sum of all elements

pub fn to_accelerated(
  t: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Move a tensor to the best persistent backend: RTX 4090 first, then MKL/CPU.

pub fn to_contiguous(t: Tensor) -> Tensor

Convert to contiguous tensor

pub fn to_list(t: Tensor) -> List(Float)

Convert to list

pub fn to_rtx4090_fp16(
  t: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Upload a tensor to persistent RTX 4090 FP16 memory.

pub fn to_rtx4090_fp32(
  t: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Upload a tensor to persistent RTX 4090 FP32 memory.

pub fn to_strided(t: Tensor) -> Tensor

Convert to strided representation for O(1) element access

pub fn transpose(t: Tensor) -> Result(Tensor, TensorError)

Matrix transpose

pub fn transpose_strided(
  t: Tensor,
) -> Result(Tensor, TensorError)

Zero-copy transpose

pub fn unsqueeze(t: Tensor, axis: Int) -> Tensor

Add dimension of size 1

pub fn variance(t: Tensor) -> Float

Variance

pub fn vector(data: List(Float)) -> Tensor

Create vector (1D tensor)

pub fn workspace_backend(
  workspace: GpuWorkspace,
) -> AccelerationBackend

Workspace backend.

pub fn workspace_from_tensor(
  workspace: GpuWorkspace,
  tensor: Tensor,
) -> Result(AcceleratedTensor, TensorError)

Move a tensor into workspace memory.

pub fn workspace_zeros(
  workspace: GpuWorkspace,
  shape: List(Int),
) -> Result(AcceleratedTensor, TensorError)

Allocate a reusable zero-filled output buffer in workspace memory.

pub fn xavier_init(fan_in: Int, fan_out: Int) -> Tensor

Xavier initialization for neural network weights

pub fn zeros(shape: List(Int)) -> Tensor

All zeros. The tensor equivalent of a blank canvas.

Search Document