Vllm.Config.SpeculativeConfig (VLLM v0.3.0)

Copy Markdown View Source

Configuration for speculative decoding.

Summary

Functions

Python method SpeculativeConfig._validate_suffix_decoding.

Python method SpeculativeConfig._verify_args.

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Python method SpeculativeConfig.hf_config_override.

Constructs SpeculativeConfig.

Python method SpeculativeConfig.use_eagle.

Types

t()

@opaque t()

Functions

_maybe_override_draft_max_model_len(ref, speculative_max_model_len, draft_max_model_len, target_max_model_len, opts \\ [])

@spec _maybe_override_draft_max_model_len(
  SnakeBridge.Ref.t(),
  term(),
  integer(),
  integer(),
  keyword()
) :: {:ok, integer()} | {:error, Snakepit.Error.t()}

Determine the max sequence length for the draft model. This is usually the draft_max_model_len, but may be the target_max_model_len if it is less than the draft_max_model_len, or may be speculative_max_model_len if it is specified.

This is necessary so that sequences do not exceed the capacity of the draft model or the target model.

speculative_max_model_len is mainly used for testing that sequences can skip speculation.

Parameters

  • speculative_max_model_len (term())
  • draft_max_model_len (integer())
  • target_max_model_len (integer())

Returns

  • integer()

_validate_suffix_decoding(ref, opts \\ [])

@spec _validate_suffix_decoding(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

Python method SpeculativeConfig._validate_suffix_decoding.

Returns

  • term()

_verify_and_get_draft_tp(ref, target_parallel_config, speculative_draft_tensor_parallel_size, draft_hf_config, opts \\ [])

@spec _verify_and_get_draft_tp(SnakeBridge.Ref.t(), term(), term(), term(), keyword()) ::
  {:ok, integer()} | {:error, Snakepit.Error.t()}

Verifies and adjusts the tensor parallel size for a draft model specified using speculative_draft_tensor_parallel_size.

Parameters

  • target_parallel_config (term())
  • speculative_draft_tensor_parallel_size (term())
  • draft_hf_config (term())

Returns

  • integer()

_verify_args(ref, opts \\ [])

@spec _verify_args(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

Python method SpeculativeConfig._verify_args.

Returns

  • term()

code_revision(ref)

@spec code_revision(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

compute_hash(ref, opts \\ [])

@spec compute_hash(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, String.t()} | {:error, Snakepit.Error.t()}

WARNING: Whenever a new field is added to this config, ensure that it is included in the factors list if it affects the computation graph.

Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states.

Returns

  • String.t()

create_draft_parallel_config(ref, target_parallel_config, speculative_draft_tensor_parallel_size, opts \\ [])

@spec create_draft_parallel_config(SnakeBridge.Ref.t(), term(), integer(), keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

Create a parallel config for use by the draft worker.

This is mostly a copy of the target parallel config, except the tp_size.

Parameters

  • target_parallel_config (term())
  • speculative_draft_tensor_parallel_size (integer())

Returns

  • term()

disable_by_batch_size(ref)

@spec disable_by_batch_size(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

disable_padded_drafter_batch(ref)

@spec disable_padded_drafter_batch(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

draft_model_config(ref)

@spec draft_model_config(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

draft_parallel_config(ref)

@spec draft_parallel_config(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

draft_tensor_parallel_size(ref)

@spec draft_tensor_parallel_size(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

enforce_eager(ref)

@spec enforce_eager(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

hf_config_override(ref, hf_config, opts \\ [])

@spec hf_config_override(SnakeBridge.Ref.t(), term(), keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

Python method SpeculativeConfig.hf_config_override.

Parameters

  • hf_config (term())

Returns

  • term()

max_model_len(ref)

@spec max_model_len(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

method(ref)

@spec method(SnakeBridge.Ref.t()) :: {:ok, term()} | {:error, Snakepit.Error.t()}

model(ref)

@spec model(SnakeBridge.Ref.t()) :: {:ok, term()} | {:error, Snakepit.Error.t()}

new(dataclass_self__, args, kwargs, opts \\ [])

@spec new(term(), term(), term(), keyword()) ::
  {:ok, SnakeBridge.Ref.t()} | {:error, Snakepit.Error.t()}

Constructs SpeculativeConfig.

Parameters

  • dataclass_self__ (term())
  • args (term())
  • kwargs (term())

num_speculative_tokens(ref)

@spec num_speculative_tokens(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

prompt_lookup_max(ref)

@spec prompt_lookup_max(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

prompt_lookup_min(ref)

@spec prompt_lookup_min(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

quantization(ref)

@spec quantization(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

revision(ref)

@spec revision(SnakeBridge.Ref.t()) :: {:ok, term()} | {:error, Snakepit.Error.t()}

speculative_token_tree(ref)

@spec speculative_token_tree(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

suffix_decoding_max_cached_requests(ref)

@spec suffix_decoding_max_cached_requests(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

suffix_decoding_max_spec_factor(ref)

@spec suffix_decoding_max_spec_factor(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

suffix_decoding_max_tree_depth(ref)

@spec suffix_decoding_max_tree_depth(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

suffix_decoding_min_token_prob(ref)

@spec suffix_decoding_min_token_prob(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

target_model_config(ref)

@spec target_model_config(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

target_parallel_config(ref)

@spec target_parallel_config(SnakeBridge.Ref.t()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

use_eagle(ref, opts \\ [])

@spec use_eagle(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, boolean()} | {:error, Snakepit.Error.t()}

Python method SpeculativeConfig.use_eagle.

Returns

  • boolean()