# VoxCPMEx

Elixir wrapper for VoxCPM2 — a tokenizer-free, diffusion autoregressive Text-to-Speech model from OpenBMB.

2B parameters · 30 languages · 48kHz output · 2M+ hours training data.
## Features
- 🌍 30-Language Multilingual — Chinese, English, Japanese, Korean, Arabic, French, German, and 23+ more
- 🎨 Voice Design — Generate a novel voice from text description alone
- 🎛️ Controllable Cloning — Clone any voice from a short clip, with style guidance
- 🎙️ Ultimate Cloning — Audio-continuation cloning for maximum fidelity
- 🔊 48kHz Studio Output — AudioVAE V2 super-resolution
- ⚡ True Streaming — Get audio chunks as they're generated
- 🎓 LoRA Fine-Tuning — Adapt with 5–10 minutes of audio
## Protocol (v2)
VoxCPMEx uses MessagePack over binary-framed Erlang Ports. Audio is transmitted as raw bytes — no base64 encoding overhead.
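As a rough sketch of the wire layout (assuming the Port is opened in `{:packet, 4}` mode, so every MessagePack payload travels as a 4-byte big-endian length prefix followed by the raw bytes — the mode and prefix size are assumptions, not confirmed by this page):

```elixir
# Sketch of length-prefixed framing as used by {:packet, 4} Ports.
# The Erlang runtime normally does this itself; this only illustrates
# the byte layout. Audio bytes ride inside the MessagePack payload
# untouched, which is why there is no base64 overhead.
defmodule FramingSketch do
  # Wrap a payload in a 4-byte big-endian length-prefixed frame.
  def encode(payload) when is_binary(payload) do
    <<byte_size(payload)::unsigned-big-32, payload::binary>>
  end

  # Split one complete frame off the front of a buffer.
  def decode(<<len::unsigned-big-32, payload::binary-size(len), rest::binary>>) do
    {:ok, payload, rest}
  end

  # Not enough bytes yet for a full frame.
  def decode(_incomplete), do: :more
end
```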
## Quick Start

```elixir
{:ok, pid} = VoxCPMEx.start_link(device: "cuda")
:ok = VoxCPMEx.await_ready(pid)
{:ok, audio} = VoxCPMEx.generate(pid, "Hello, world!")
:ok = VoxCPMEx.save(audio, "output.wav")
```

## Voice Design
```elixir
{:ok, audio} = VoxCPMEx.generate(pid,
  "(A young woman, gentle and sweet voice) Welcome!"
)
```

## Voice Cloning
```elixir
{:ok, audio} = VoxCPMEx.generate(pid, "Hello in my voice!",
  audio_prompt: "reference.wav"
)
```

## Streaming (v2 — true chunk-by-chunk)
```elixir
{:ok, ref} = VoxCPMEx.generate_streaming_async(pid, "Long text...")
stream_loop(pid, ref)

defp stream_loop(pid, ref) do
  case VoxCPMEx.next_chunk(pid, ref) do
    {:ok, _chunk} ->
      IO.puts("got chunk")
      stream_loop(pid, ref)

    :eos ->
      IO.puts("done!")

    {:error, reason} ->
      IO.puts("error: #{inspect(reason)}")
  end
end
```

Or collect everything at once:
```elixir
{:ok, ref} = VoxCPMEx.generate_streaming_async(pid, "Long text...")
{:ok, audio} = VoxCPMEx.collect_stream(pid, ref)
```

## Requirements
- Python ≥ 3.10 with the `voxcpm` and `msgpack` pip packages
- CUDA GPU (8+ GB VRAM), Apple Silicon (MPS), or CPU
- Elixir ≥ 1.14
## Installation

```elixir
# mix.exs
{:voxcpmex, "~> 0.2.0"}
```

```shell
# Install Python deps
mix voxcpmex.setup
```
## Summary

### Functions

- `await_ready/2` — Waits for the model to finish loading. Returns `:ok` when ready.
- `collect_stream/2` — Collects all remaining chunks from a streaming session and returns the full concatenated audio as raw bytes.
- `generate/3` — Generates speech audio from text. Returns `{:ok, audio_wav}`.
- `generate_streaming_async/3` — Starts asynchronous streaming generation.
- `info/1` — Returns runtime model information.
- `load_lora/2` — Loads LoRA fine-tuning weights. Returns `{:ok, loaded, skipped}`.
- `next_chunk/2` — Returns the next chunk from an active streaming session.
- `save/2` — Saves audio binary to a WAV file.
- `start_link/1` — Starts a VoxCPMEx model server.
- `stop/1` — Gracefully stops the server and Python bridge.
- `unload_lora/1` — Resets all LoRA weights to zero.
## Types

```elixir
@type audio() :: binary()

@type generate_opt() ::
        {:audio_prompt, String.t()}
        | {:prompt_wav_path, String.t()}
        | {:prompt_text, String.t()}
        | {:cfg_value, float()}
        | {:inference_timesteps, pos_integer()}
        | {:min_len, pos_integer()}
        | {:max_len, pos_integer()}
        | {:normalize, boolean()}
        | {:denoise, boolean()}
```
## Functions
### await_ready/2

```elixir
@spec await_ready(GenServer.server(), timeout()) :: :ok | {:error, term()}
```

Waits for the model to finish loading. Returns `:ok` when ready.
### collect_stream/2

```elixir
@spec collect_stream(GenServer.server(), reference()) ::
        {:ok, binary()} | {:error, term()}
```

Collects all remaining chunks from a streaming session and returns the full concatenated audio as raw bytes.

Returns `{:ok, audio_bytes}` when all chunks are collected, or `{:error, reason}`.
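Since the collected bytes are raw PCM rather than a finished file, one way to make them playable on their own is to prepend a minimal WAV header. A sketch, under loud assumptions not confirmed by this page (mono, 48 kHz, little-endian IEEE-float samples, i.e. WAVE format code 3):

```elixir
# Sketch: wrap raw 32-bit float PCM in a minimal 44-byte WAV header.
# Assumptions (verify against your actual stream format): 1 channel,
# 48_000 Hz sample rate, little-endian float samples.
defmodule WavSketch do
  @rate 48_000
  @channels 1
  @bytes_per_sample 4

  def wrap(pcm) when is_binary(pcm) do
    data_len = byte_size(pcm)
    block_align = @channels * @bytes_per_sample
    byte_rate = @rate * block_align

    <<"RIFF", (36 + data_len)::unsigned-little-32, "WAVE",
      "fmt ", 16::unsigned-little-32,
      # format code 3 = IEEE float
      3::unsigned-little-16,
      @channels::unsigned-little-16,
      @rate::unsigned-little-32,
      byte_rate::unsigned-little-32,
      block_align::unsigned-little-16,
      # bits per sample
      32::unsigned-little-16,
      "data", data_len::unsigned-little-32,
      pcm::binary>>
  end
end
```

For audio returned by `generate/3` this is unnecessary, since that path already produces WAV bytes suitable for `save/2`.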
### generate/3

```elixir
@spec generate(GenServer.server(), String.t(), [generate_opt()]) ::
        {:ok, audio()} | {:error, term()}
```

Generates speech audio from text. Returns `{:ok, audio_wav}`.

#### Options

- `:audio_prompt` — Reference audio for voice cloning
- `:prompt_wav_path` + `:prompt_text` — Ultimate cloning
- `:cfg_value` — Guidance scale (1.0–3.0). Default: `2.0`
- `:inference_timesteps` — Diffusion steps (4–30). Default: `10`
- `:min_len` — Min audio length in tokens. Default: `2`
- `:max_len` — Max token length. Default: `4096`
- `:normalize` — Text normalization. Default: `false`
- `:denoise` — Denoise reference audio. Default: `false`

#### Examples

```elixir
# Basic
{:ok, audio} = VoxCPMEx.generate(pid, "Hello!")
:ok = VoxCPMEx.save(audio, "out.wav")

# Voice Design
{:ok, audio} = VoxCPMEx.generate(pid,
  "(warm male voice) Welcome to the demo."
)

# Voice Cloning
{:ok, audio} = VoxCPMEx.generate(pid, "Hello!",
  audio_prompt: "ref.wav"
)

# Quality tuning
{:ok, audio} = VoxCPMEx.generate(pid, "Quality matters.",
  inference_timesteps: 30, cfg_value: 3.0
)
```
### generate/4

```elixir
@spec generate(GenServer.server(), String.t(), [generate_opt()], timeout()) ::
        {:ok, audio()} | {:error, term()}
```

Same as `generate/3`, with an explicit call timeout.
### generate_streaming_async/3

```elixir
@spec generate_streaming_async(GenServer.server(), String.t(), [generate_opt()]) ::
        {:ok, reference()} | {:error, term()}
```

Starts asynchronous streaming generation.

Returns `{:ok, stream_ref}` immediately — the model generates in the background and chunks are delivered to the GenServer as they're produced.

Poll for chunks with `next_chunk/2`:

- `{:ok, chunk}` — raw float32 PCM bytes
- `:eos` — stream complete
- `{:error, reason}`

Or collect everything at once with `collect_stream/2`.
#### Example

```elixir
{:ok, ref} = VoxCPMEx.generate_streaming_async(pid, "Long text...")

# Poll for chunks
stream_loop(pid, ref)

defp stream_loop(pid, ref) do
  case VoxCPMEx.next_chunk(pid, ref) do
    {:ok, chunk} ->
      play_chunk(chunk)
      stream_loop(pid, ref)

    :eos ->
      :ok

    {:error, reason} ->
      Logger.error("Stream error: #{inspect(reason)}")
  end
end
```
### info/1

```elixir
@spec info(GenServer.server()) :: map()
```

Returns runtime model information.
### load_lora/2

```elixir
@spec load_lora(GenServer.server(), String.t()) ::
        {:ok, non_neg_integer(), non_neg_integer()} | {:error, term()}
```

Loads LoRA fine-tuning weights. Returns `{:ok, loaded, skipped}`.
### next_chunk/2

```elixir
@spec next_chunk(GenServer.server(), reference()) ::
        {:ok, binary()} | :eos | {:error, term()}
```

Returns the next chunk from an active streaming session.

Returns:

- `{:ok, chunk}` — raw float32 PCM bytes for this chunk
- `:eos` — stream is complete, no more chunks
- `{:error, reason}`
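Many playback pipelines expect signed 16-bit samples rather than float32. A conversion sketch (the little-endian byte order and the [-1.0, 1.0] sample range are assumptions; check them against your stream):

```elixir
# Sketch: convert a raw float32 PCM chunk to signed 16-bit PCM (s16le).
# Assumes little-endian float samples in [-1.0, 1.0]; both are
# assumptions about the chunk format, not documented guarantees.
defmodule PcmSketch do
  def f32_to_s16(chunk) when is_binary(chunk) do
    for <<sample::float-little-32 <- chunk>>, into: <<>> do
      # Clamp to [-1.0, 1.0] before scaling to the int16 range.
      s = sample |> max(-1.0) |> min(1.0)
      <<round(s * 32_767)::signed-little-16>>
    end
  end
end
```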
### save/2

Saves audio binary to a WAV file.
### start_link/1

```elixir
@spec start_link(VoxCPMEx.Server.start_opts()) :: GenServer.on_start()
```

Starts a VoxCPMEx model server.

#### Options

- `:model` — HuggingFace model ID. Default: `"openbmb/VoxCPM2"`
- `:device` — `"cuda"`, `"cpu"`, or `"mps"`. Default: `"cuda"`
- `:load_denoiser` — Load audio denoiser. Default: `false`
- `:optimize` — Enable `torch.compile`. Default: `true`
- `:name` — Optional GenServer name
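These options can also be supplied from a supervision tree. A minimal sketch, assuming VoxCPMEx provides the conventional `child_spec/1` that `use GenServer` modules get (`MyApp.TTS` is a hypothetical name, not part of the library):

```elixir
# Sketch: start the model server under an application supervisor.
# Assumption: {VoxCPMEx, opts} resolves via a standard child_spec/1;
# MyApp.TTS is a hypothetical registered name.
children = [
  {VoxCPMEx, model: "openbmb/VoxCPM2", device: "cuda", name: MyApp.TTS}
]

Supervisor.start_link(children, strategy: :one_for_one)
```

With a `:name` set, later calls can use `MyApp.TTS` in place of the pid.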
### stop/1

```elixir
@spec stop(GenServer.server()) :: :ok
```

Gracefully stops the server and Python bridge.
### unload_lora/1

```elixir
@spec unload_lora(GenServer.server()) :: :ok | {:error, term()}
```

Resets all LoRA weights to zero.