viva_tensor
High-performance tensor operations for Gleam on the BEAM.
This module is the stable entry point for the package. It re-exports the tensor type, common constructors, shape operations, linear algebra, element-wise math, reductions, neural-network helpers, and TFLOPS measurement utilities.
Lower-level implementation, backend, neural-network, quantization, sparse, telemetry, and benchmark modules are intentionally excluded from the public documentation until their contracts are stable. Prefer this module as the public API surface.
import gleam/result
import viva_tensor as t
pub fn example() {
  let a = t.zeros([2, 3])
  let b = t.ones([2, 3])
  use c <- result.try(t.add(a, b))
  Ok(c)
}
Types
Result storage selected by the RTX-first planner.
pub type AcceleratedTensor =
@internal AcceleratedTensor
Backend selected by the RTX-first planner.
pub type AccelerationBackend =
@internal AccelerationBackend
Configuration for two-dimensional convolution operations.
pub type Conv2dConfig =
@internal Conv2dConfig
Workspace for persistent GPU buffers.
pub type GpuWorkspace =
@internal GpuWorkspace
Persisted linear layer parameters.
pub type LinearLayer =
@internal LinearLayer
Opaque reference to a tensor stored in native NIF memory.
pub type NativeTensorRef =
@internal NativeTensorRef
A tensor value backed by dense, strided, or native storage.
pub type Tensor =
@internal Tensor
Error returned by fallible tensor constructors and operations.
pub type TensorError =
@internal TensorError
Backend used when measuring matrix-multiplication throughput.
pub type TflopsBackend =
@internal Backend
Result returned by TFLOPS measurement helpers.
pub type TflopsResult =
@internal TflopsResult
Values
pub fn accelerated_backend(
t: AcceleratedTensor,
) -> AccelerationBackend
Inspect which backend was selected by matmul_auto.
pub fn accelerated_shape(t: AcceleratedTensor) -> List(Int)
Shape of an accelerated tensor without forcing a download.
pub fn accelerated_sync() -> Result(Nil, TensorError)
Wait for queued CUDA work to complete.
pub fn accelerated_to_tensor(
t: AcceleratedTensor,
) -> Result(Tensor, TensorError)
Download an accelerated tensor back to a regular CPU tensor.
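A minimal sketch of the accelerated round trip, assuming 128x256 and 256x64 inputs (any compatible shapes work): multiply on whichever backend matmul_auto selects, inspect it, wait for queued work, then download the result.
import gleam/result
import viva_tensor as t

pub fn accelerated_round_trip() -> Result(t.Tensor, t.TensorError) {
  let a = t.random_normal([128, 256], 0.0, 1.0)
  let b = t.random_normal([256, 64], 0.0, 1.0)
  // matmul_auto prefers the RTX 4090, then MKL/native CPU.
  use c <- result.try(t.matmul_auto(a, b))
  let _backend = t.accelerated_backend(c)
  let _shape = t.accelerated_shape(c)
  // Wait for queued CUDA work before reading the result back.
  use _ <- result.try(t.accelerated_sync())
  t.accelerated_to_tensor(c)
}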
pub fn add_broadcast(
a: Tensor,
b: Tensor,
) -> Result(Tensor, TensorError)
Element-wise addition with broadcasting.
pub fn add_into(
out: Tensor,
a: Tensor,
b: Tensor,
) -> Result(Nil, TensorError)
Write out = a + b into a preallocated native tensor.
pub fn avg_pool2d(
input: Tensor,
pool_h: Int,
pool_w: Int,
stride_h: Int,
stride_w: Int,
) -> Result(Tensor, TensorError)
2D average pooling.
pub fn broadcast_to(
t: Tensor,
target_shape: List(Int),
) -> Result(Tensor, TensorError)
Broadcast tensor to a target shape.
pub fn can_broadcast(a: List(Int), b: List(Int)) -> Bool
Check whether two shapes are broadcast-compatible.
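A small sketch of the broadcasting helpers, assuming a [1, 3] row broadcasts against a [2, 3] matrix:
import gleam/result
import viva_tensor as t

pub fn broadcast_example() -> Result(t.Tensor, t.TensorError) {
  use m <- result.try(t.from_list2d([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]))
  use row <- result.try(t.matrix(1, 3, [10.0, 20.0, 30.0]))
  // [2, 3] and [1, 3] are broadcast-compatible, so the row is
  // repeated along the first dimension.
  let assert True = t.can_broadcast([2, 3], [1, 3])
  t.add_broadcast(m, row)
}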
pub fn conv2d(
input: Tensor,
kernel: Tensor,
config: Conv2dConfig,
) -> Result(Tensor, TensorError)
2D convolution.
pub fn conv2d_config() -> Conv2dConfig
Default conv2d config (3x3 kernel, stride 1, no padding)
pub fn conv2d_same(kernel_h: Int, kernel_w: Int) -> Conv2dConfig
Conv2d config with “same” padding
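A sketch of a 3x3 “same” convolution over random data. The [batch, channels, height, width] layout used for the input and kernel here is an assumption; consult Conv2dConfig for the layout the library actually expects.
import viva_tensor as t

pub fn conv_example() -> Result(t.Tensor, t.TensorError) {
  let input = t.random_normal([1, 1, 28, 28], 0.0, 1.0)
  let kernel = t.random_normal([1, 1, 3, 3], 0.0, 0.1)
  // Output spatial size matches the input thanks to "same" padding.
  t.conv2d(input, kernel, t.conv2d_same(3, 3))
}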
pub fn detect_backends() -> List(TflopsBackend)
Detect the compute backends available on this machine.
pub fn from_list2d(
rows: List(List(Float)),
) -> Result(Tensor, TensorError)
Create a 2D tensor from a list of rows.
pub fn from_native_ref(
ref: NativeTensorRef,
shape: List(Int),
) -> Tensor
Wrap an existing native NIF tensor resource.
pub fn global_avg_pool2d(
input: Tensor,
) -> Result(Tensor, TensorError)
Global average pooling
pub fn gpu_workspace() -> Result(GpuWorkspace, TensorError)
Create an RTX 4090 FP16 workspace.
pub fn he_init(fan_in: Int, fan_out: Int) -> Tensor
He initialization (for ReLU networks)
pub fn is_native(t: Tensor) -> Bool
Check whether a tensor is backed by native NIF memory.
pub fn linear_gelu_accelerated_into(
out: AcceleratedTensor,
a: AcceleratedTensor,
b: AcceleratedTensor,
bias: AcceleratedTensor,
) -> Result(Nil, TensorError)
Write out = gelu(a @ b + bias) using the FP16 Tensor Core fused epilogue.
pub fn linear_gelu_forward_into(
out: AcceleratedTensor,
input: AcceleratedTensor,
layer: LinearLayer,
) -> Result(Nil, TensorError)
Run out = gelu(input @ layer.weight + layer.bias).
pub fn linear_layer(
workspace: GpuWorkspace,
weight: Tensor,
bias: Tensor,
) -> Result(LinearLayer, TensorError)
Create a persisted linear layer in workspace memory.
pub fn linear_layer_backend(
layer: LinearLayer,
) -> AccelerationBackend
Linear layer backend.
pub fn linear_layer_fp16(
weight: Tensor,
bias: Tensor,
) -> Result(LinearLayer, TensorError)
Create a persisted FP16 linear layer on the RTX.
pub fn linear_layer_input_features(layer: LinearLayer) -> Int
Linear layer input feature count.
pub fn linear_layer_output_features(layer: LinearLayer) -> Int
Linear layer output feature count.
pub fn linear_output(
workspace: GpuWorkspace,
layer: LinearLayer,
batch_size: Int,
) -> Result(AcceleratedTensor, TensorError)
Allocate a reusable output buffer for a persisted linear layer.
pub fn linear_relu(
a: Tensor,
b: Tensor,
bias: Tensor,
) -> Result(Tensor, TensorError)
Fused linear layer with ReLU: max(0, a @ b + bias).
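A sketch of a host-side dense layer using the fused kernel. The weight layout from he_init ([fan_in, fan_out]) and the bias shape are assumptions for illustration.
import viva_tensor as t

pub fn dense_relu(x: t.Tensor) -> Result(t.Tensor, t.TensorError) {
  let w = t.he_init(64, 32)
  let bias = t.zeros([32])
  // One fused call instead of matmul + add + relu.
  t.linear_relu(x, w, bias)
}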
pub fn linear_relu_accelerated_into(
out: AcceleratedTensor,
a: AcceleratedTensor,
b: AcceleratedTensor,
bias: AcceleratedTensor,
) -> Result(Nil, TensorError)
Write out = relu(a @ b + bias) using the FP16 Tensor Core fused epilogue.
pub fn linear_relu_forward_into(
out: AcceleratedTensor,
input: AcceleratedTensor,
layer: LinearLayer,
) -> Result(Nil, TensorError)
Run out = relu(input @ layer.weight + layer.bias).
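A sketch of the persisted-layer workflow: create a workspace, persist the layer, allocate one reusable output buffer, and run a forward pass into it. The sizes (256 inputs, 128 outputs, batch of 32) and the [fan_in, fan_out] weight layout are assumptions.
import gleam/result
import viva_tensor as t

pub fn persistent_forward() -> Result(t.Tensor, t.TensorError) {
  use workspace <- result.try(t.gpu_workspace())
  let weight = t.random_normal([256, 128], 0.0, 0.05)
  let bias = t.zeros([128])
  use layer <- result.try(t.linear_layer(workspace, weight, bias))
  // Allocate the output once; reuse it across forward passes.
  use out <- result.try(t.linear_output(workspace, layer, 32))
  let input = t.random_normal([32, 256], 0.0, 1.0)
  use x <- result.try(t.workspace_from_tensor(workspace, input))
  use _ <- result.try(t.linear_relu_forward_into(out, x, layer))
  t.accelerated_to_tensor(out)
}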
pub fn linear_relu_into(
out: Tensor,
a: Tensor,
b: Tensor,
bias: Tensor,
) -> Result(Nil, TensorError)
Write out = max(0, a @ b + bias) into a preallocated native tensor.
pub fn map2(
a: Tensor,
b: Tensor,
f: fn(Float, Float) -> Float,
) -> Result(Tensor, TensorError)
Apply a binary function element-wise over tensors with the same shape.
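For example, an element-wise maximum built from map2 and the standard library's float.max:
import gleam/float
import viva_tensor as t

pub fn elementwise_max(
  a: t.Tensor,
  b: t.Tensor,
) -> Result(t.Tensor, t.TensorError) {
  t.map2(a, b, float.max)
}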
pub fn matmul(
a: Tensor,
b: Tensor,
) -> Result(Tensor, TensorError)
Matrix-matrix multiplication
pub fn matmul_accelerated(
a: AcceleratedTensor,
b: AcceleratedTensor,
) -> Result(AcceleratedTensor, TensorError)
Matrix multiplication between persistent accelerated tensors.
pub fn matmul_accelerated_into(
out: AcceleratedTensor,
a: AcceleratedTensor,
b: AcceleratedTensor,
) -> Result(Nil, TensorError)
Write out = a @ b into a persistent accelerated output buffer.
pub fn matmul_auto(
a: Tensor,
b: Tensor,
) -> Result(AcceleratedTensor, TensorError)
Matrix multiplication with priority: RTX 4090 first, then MKL/native CPU.
pub fn matmul_gelu_accelerated_into(
out: AcceleratedTensor,
a: AcceleratedTensor,
b: AcceleratedTensor,
) -> Result(Nil, TensorError)
Write out = gelu(a @ b) using the FP16 Tensor Core fused epilogue.
pub fn matmul_into(
out: Tensor,
a: Tensor,
b: Tensor,
) -> Result(Nil, TensorError)
Write out = a @ b into a preallocated native tensor.
pub fn matmul_relu_accelerated_into(
out: AcceleratedTensor,
a: AcceleratedTensor,
b: AcceleratedTensor,
) -> Result(Nil, TensorError)
Write out = relu(a @ b) using the FP16 Tensor Core fused epilogue.
pub fn matmul_vec(
mat: Tensor,
vec: Tensor,
) -> Result(Tensor, TensorError)
Matrix-vector multiplication
pub fn matrix(
rows: Int,
cols: Int,
data: List(Float),
) -> Result(Tensor, TensorError)
Create a matrix (2D tensor) with the given dimensions from flat data.
pub fn max_pool2d(
input: Tensor,
pool_h: Int,
pool_w: Int,
stride_h: Int,
stride_w: Int,
) -> Result(Tensor, TensorError)
2D max pooling.
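A sketch combining 2x2 max pooling (stride 2) with a global average pool. The 4D [batch, channels, height, width] input layout is an assumption.
import gleam/result
import viva_tensor as t

pub fn pool_example() -> Result(t.Tensor, t.TensorError) {
  let input = t.random_normal([1, 8, 32, 32], 0.0, 1.0)
  // Halve the spatial dimensions, then reduce each feature map to one value.
  use pooled <- result.try(t.max_pool2d(input, 2, 2, 2, 2))
  t.global_avg_pool2d(pooled)
}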
pub fn measure_tflops(
backend: TflopsBackend,
m: Int,
n: Int,
k: Int,
) -> TflopsResult
Measure TFLOPS for a single matmul operation
pub fn measure_tflops_averaged(
backend: TflopsBackend,
m: Int,
n: Int,
k: Int,
iterations: Int,
) -> TflopsResult
Measure TFLOPS averaged over multiple iterations after a warmup pass.
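A sketch that benchmarks every detected backend on a 1024x1024x1024 matmul. TflopsResult fields are not documented here, so the results are returned unexamined.
import gleam/list
import viva_tensor as t

pub fn benchmark_all() -> List(t.TflopsResult) {
  t.detect_backends()
  |> list.map(fn(backend) {
    // 10 timed iterations per backend, after warmup.
    t.measure_tflops_averaged(backend, 1024, 1024, 1024, 10)
  })
}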
pub fn mul_broadcast(
a: Tensor,
b: Tensor,
) -> Result(Tensor, TensorError)
Element-wise multiplication with broadcasting.
pub fn mul_into(
out: Tensor,
a: Tensor,
b: Tensor,
) -> Result(Nil, TensorError)
Write out = a * b into a preallocated native tensor.
pub fn native_fill(
shape: List(Int),
value: Float,
) -> Result(Tensor, TensorError)
Create a native-backed tensor filled with a value.
pub fn native_from_list(
data: List(Float),
shape: List(Int),
) -> Result(Tensor, TensorError)
Create a native-backed tensor from row-major list data.
pub fn native_ones(
shape: List(Int),
) -> Result(Tensor, TensorError)
Create a native-backed tensor of ones.
pub fn native_ref(t: Tensor) -> Result(NativeTensorRef, Nil)
Extract the native NIF tensor resource when present.
pub fn native_zeros(
shape: List(Int),
) -> Result(Tensor, TensorError)
Create a native-backed tensor of zeros.
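A sketch of native-backed storage: build a tensor in NIF memory, confirm it is native, and re-wrap its raw resource.
import gleam/result
import viva_tensor as t

pub fn native_example() -> Result(t.Tensor, t.TensorError) {
  use nat <- result.try(t.native_from_list([1.0, 2.0, 3.0, 4.0], [2, 2]))
  let assert True = t.is_native(nat)
  // The raw resource can be carried around and re-wrapped later.
  let assert Ok(ref) = t.native_ref(nat)
  Ok(t.from_native_ref(ref, [2, 2]))
}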
pub fn pad2d(
t: Tensor,
pad_h: Int,
pad_w: Int,
) -> Result(Tensor, TensorError)
Pad a 2D tensor with zeros.
pub fn pad4d(
t: Tensor,
pad_h: Int,
pad_w: Int,
) -> Result(Tensor, TensorError)
Pad a 4D tensor with zeros.
pub fn random_normal(
shape: List(Int),
mean: Float,
std: Float,
) -> Tensor
Create a tensor of normally distributed random values with the given mean and standard deviation.
pub fn reshape(
t: Tensor,
new_shape: List(Int),
) -> Result(Tensor, TensorError)
Reshape a tensor; the total element count must match.
pub fn scale_into(
out: Tensor,
a: Tensor,
scalar: Float,
) -> Result(Nil, TensorError)
Write out = a * scalar into a preallocated native tensor.
pub fn sub_into(
out: Tensor,
a: Tensor,
b: Tensor,
) -> Result(Nil, TensorError)
Write out = a - b into a preallocated native tensor.
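A sketch of the out-parameter pattern shared by the *_into functions: preallocate native buffers once and write results into them. The [128, 128] shapes are an assumption; a and b must match the output buffers.
import gleam/result
import viva_tensor as t

pub fn inplace_pipeline(
  a: t.Tensor,
  b: t.Tensor,
) -> Result(t.Tensor, t.TensorError) {
  use sum <- result.try(t.native_zeros([128, 128]))
  use scaled <- result.try(t.native_zeros([128, 128]))
  // Each call writes into its preallocated buffer; no new allocations.
  use _ <- result.try(t.add_into(sum, a, b))
  use _ <- result.try(t.scale_into(scaled, sum, 0.5))
  Ok(scaled)
}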
pub fn to_accelerated(
t: Tensor,
) -> Result(AcceleratedTensor, TensorError)
Move a tensor to the best persistent backend: RTX 4090 first, then MKL/CPU.
pub fn to_rtx4090_fp16(
t: Tensor,
) -> Result(AcceleratedTensor, TensorError)
Upload a tensor to persistent RTX 4090 FP16 memory.
pub fn to_rtx4090_fp32(
t: Tensor,
) -> Result(AcceleratedTensor, TensorError)
Upload a tensor to persistent RTX 4090 FP32 memory.
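A sketch of keeping data resident on the GPU between calls. FP16 trades precision for Tensor Core throughput; to_rtx4090_fp32 is the higher-precision alternative. The input @ weights orientation is an assumption.
import gleam/result
import viva_tensor as t

pub fn resident_matmul(
  weights: t.Tensor,
  input: t.Tensor,
) -> Result(t.Tensor, t.TensorError) {
  // Upload once; both operands stay in GPU memory across calls.
  use w <- result.try(t.to_rtx4090_fp16(weights))
  use x <- result.try(t.to_rtx4090_fp16(input))
  use y <- result.try(t.matmul_accelerated(x, w))
  t.accelerated_to_tensor(y)
}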
pub fn transpose_strided(
t: Tensor,
) -> Result(Tensor, TensorError)
Zero-copy transpose backed by strided storage.
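A small sketch chaining reshape and the strided transpose:
import gleam/result
import viva_tensor as t

pub fn reshape_and_transpose() -> Result(t.Tensor, t.TensorError) {
  use m <- result.try(t.from_list2d([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]))
  // Same 6 elements viewed as [3, 2], then transposed without copying.
  use r <- result.try(t.reshape(m, [3, 2]))
  t.transpose_strided(r)
}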
pub fn workspace_backend(
workspace: GpuWorkspace,
) -> AccelerationBackend
Workspace backend.
pub fn workspace_from_tensor(
workspace: GpuWorkspace,
tensor: Tensor,
) -> Result(AcceleratedTensor, TensorError)
Move a tensor into workspace memory.
pub fn workspace_zeros(
workspace: GpuWorkspace,
shape: List(Int),
) -> Result(AcceleratedTensor, TensorError)
Allocate a reusable zero-filled output buffer in workspace memory.
pub fn xavier_init(fan_in: Int, fan_out: Int) -> Tensor
Xavier initialization for neural network weights
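A sketch of picking initializers for a small two-layer network; the 784/128/10 sizes are illustrative. He initialization suits ReLU layers, Xavier suits tanh or sigmoid layers.
import viva_tensor as t

pub fn init_weights() -> #(t.Tensor, t.Tensor) {
  let w1 = t.he_init(784, 128)
  let w2 = t.xavier_init(128, 10)
  #(w1, w2)
}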