Vllm.LLMEngine (VLLM v0.3.0)

Copy Markdown View Source

Legacy LLMEngine for backwards compatibility.

Summary

Functions

Remove request_ids from EngineCore and Detokenizer.

Load a new LoRA adapter into the engine for future requests.

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Log stats if logging is enabled.

Log stats when the time interval has passed.

Creates an LLM engine from the engine arguments.

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

List all registered adapters.

Initialize self. See help(type(self)) for accurate signature.

Prevent an adapter from being evicted.

Remove an already loaded LoRA adapter.

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Types

t()

@opaque t()

Functions

abort_request(ref, request_ids, args, opts \\ [])

@spec abort_request(SnakeBridge.Ref.t(), [String.t()], [term()], keyword()) ::
  {:ok, nil} | {:error, Snakepit.Error.t()}

Remove request_ids from EngineCore and Detokenizer.

Parameters

  • request_ids (list(String.t()))
  • internal (boolean() default: false)

Returns

  • nil

add_lora(ref, lora_request, opts \\ [])

@spec add_lora(SnakeBridge.Ref.t(), term(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

Load a new LoRA adapter into the engine for future requests.

Parameters

  • lora_request (term())

Returns

  • boolean()

add_request(ref, request_id, prompt, params, args, opts \\ [])

@spec add_request(
  SnakeBridge.Ref.t(),
  String.t(),
  term(),
  term(),
  [term()],
  keyword()
) ::
  {:ok, nil} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • request_id (String.t())
  • prompt (term())
  • params (term())
  • arrival_time (term() default: nil)
  • lora_request (term() default: nil)
  • tokenization_kwargs (term() default: nil)
  • trace_headers (term() default: nil)
  • priority (integer() default: 0)
  • prompt_text (term() default: nil)

Returns

  • nil

apply_model(ref, func, opts \\ [])

@spec apply_model(SnakeBridge.Ref.t(), term(), keyword()) ::
  {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • func (term())

Returns

  • list(term())

collective_rpc(ref, method, args, opts \\ [])

@spec collective_rpc(SnakeBridge.Ref.t(), term(), [term()], keyword()) ::
  {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • method (term())
  • timeout (term() default: nil)
  • args (tuple() default: {})
  • kwargs (term() default: nil)

Returns

  • list(term())

do_log_stats(ref, opts \\ [])

@spec do_log_stats(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, nil} | {:error, Snakepit.Error.t()}

Log stats if logging is enabled.

Returns

  • nil

do_log_stats_with_interval(ref, opts \\ [])

@spec do_log_stats_with_interval(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, nil} | {:error, Snakepit.Error.t()}

Log stats when the time interval has passed.

Returns

  • nil

from_engine_args(ref, engine_args, args, opts \\ [])

@spec from_engine_args(SnakeBridge.Ref.t(), term(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

Creates an LLM engine from the engine arguments.

Parameters

  • engine_args (term())
  • usage_context (term() default: UsageContext.ENGINE_CONTEXT)
  • stat_loggers (term() default: nil)
  • enable_multiprocessing (boolean() default: false)

Returns

  • term()

from_vllm_config(ref, vllm_config, args, opts \\ [])

@spec from_vllm_config(SnakeBridge.Ref.t(), term(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • vllm_config (term())
  • usage_context (term() default: UsageContext.ENGINE_CONTEXT)
  • stat_loggers (term() default: nil)
  • disable_log_stats (boolean() default: false)

Returns

  • term()

get_metrics(ref, opts \\ [])

@spec get_metrics(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • list(term())

get_num_unfinished_requests(ref, opts \\ [])

@spec get_num_unfinished_requests(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, integer()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • integer()

get_supported_tasks(ref, opts \\ [])

@spec get_supported_tasks(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, {term(), term()}} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • {term(), term()}

get_tokenizer(ref, opts \\ [])

@spec get_tokenizer(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

has_unfinished_requests(ref, opts \\ [])

@spec has_unfinished_requests(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • boolean()

has_unfinished_requests_dp(ref, has_unfinished, opts \\ [])

@spec has_unfinished_requests_dp(SnakeBridge.Ref.t(), boolean(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • has_unfinished (boolean())

Returns

  • boolean()

is_sleeping(ref, opts \\ [])

@spec is_sleeping(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • boolean()

list_loras(ref, opts \\ [])

@spec list_loras(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, MapSet.t(integer())} | {:error, Snakepit.Error.t()}

List all registered adapters.

Returns

  • MapSet.t(integer())

new(vllm_config, executor_class, log_stats, args, opts \\ [])

@spec new(term(), term(), boolean(), [term()], keyword()) ::
  {:ok, SnakeBridge.Ref.t()} | {:error, Snakepit.Error.t()}

Initialize self. See help(type(self)) for accurate signature.

Parameters

  • vllm_config (term())
  • executor_class (term())
  • log_stats (boolean())
  • aggregate_engine_logging (boolean() default: false)
  • usage_context (term() default: UsageContext.ENGINE_CONTEXT)
  • stat_loggers (term() default: nil)
  • mm_registry (term() default: the shared MultiModalRegistry instance)
  • use_cached_outputs (boolean() default: false)
  • multiprocess_mode (boolean() default: false)

pin_lora(ref, lora_id, opts \\ [])

@spec pin_lora(SnakeBridge.Ref.t(), integer(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

Prevent an adapter from being evicted.

Parameters

  • lora_id (integer())

Returns

  • boolean()

remove_lora(ref, lora_id, opts \\ [])

@spec remove_lora(SnakeBridge.Ref.t(), integer(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

Remove an already loaded LoRA adapter.

Parameters

  • lora_id (integer())

Returns

  • boolean()

reset_mm_cache(ref, opts \\ [])

@spec reset_mm_cache(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

reset_prefix_cache(ref, args, opts \\ [])

@spec reset_prefix_cache(SnakeBridge.Ref.t(), [term()], keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • reset_running_requests (boolean() default: false)
  • reset_connector (boolean() default: false)

Returns

  • boolean()

sleep(ref, args, opts \\ [])

@spec sleep(SnakeBridge.Ref.t(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • level (integer() default: 1)

Returns

  • term()

start_profile(ref, opts \\ [])

@spec start_profile(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

step(ref, opts \\ [])

@spec step(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • list(term())

stop_profile(ref, opts \\ [])

@spec stop_profile(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

tokenizer(ref)

@spec tokenizer(SnakeBridge.Ref.t()) :: {:ok, term()} | {:error, Snakepit.Error.t()}

validate_outputs(ref, outputs, output_type, opts \\ [])

@spec validate_outputs(SnakeBridge.Ref.t(), term(), term(), keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • outputs (term())
  • output_type (term())

Returns

  • term()

wake_up(ref, args, opts \\ [])

@spec wake_up(SnakeBridge.Ref.t(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • tags (term() default: nil)

Returns

  • term()