Vllm.LLMEngine (VLLM v0.3.0)

Copy Markdown View Source

Legacy LLMEngine for backwards compatibility.

Summary

Functions

Remove request_ids from EngineCore and Detokenizer.

Load a new LoRA adapter into the engine for future requests.

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Log stats if logging is enabled.

Log stats when the time interval has passed.

Creates an LLM engine from the engine arguments.

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

List all registered adapters.

Initialize self. See help(type(self)) for accurate signature.

Prevent an adapter from being evicted.

Remove an already loaded LoRA adapter.

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Types

t()

@opaque t()

Functions

abort_request(ref, request_ids, args, opts \\ [])

@spec abort_request(SnakeBridge.Ref.t(), [String.t()], [term()], keyword()) ::
  {:ok, nil} | {:error, Snakepit.Error.t()}

Remove request_ids from EngineCore and Detokenizer.

Parameters

  • request_ids (list(String.t()))
  • internal (boolean() default: false)

Returns

  • nil

add_lora(ref, lora_request, opts \\ [])

@spec add_lora(SnakeBridge.Ref.t(), term(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

Load a new LoRA adapter into the engine for future requests.

Parameters

  • lora_request (term())

Returns

  • boolean()

add_request(ref, request_id, prompt, params, args, opts \\ [])

@spec add_request(
  SnakeBridge.Ref.t(),
  String.t(),
  term(),
  term(),
  [term()],
  keyword()
) ::
  {:ok, nil} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • request_id (String.t())
  • prompt (term())
  • params (term())
  • arrival_time (term() default: nil)
  • lora_request (term() default: nil)
  • tokenization_kwargs (term() default: nil)
  • trace_headers (term() default: nil)
  • priority (integer() default: 0)
  • prompt_text (term() default: nil)

Returns

  • nil

apply_model(ref, func, opts \\ [])

@spec apply_model(SnakeBridge.Ref.t(), term(), keyword()) ::
  {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • func (term())

Returns

  • list(term())

collective_rpc(ref, method, args, opts \\ [])

@spec collective_rpc(SnakeBridge.Ref.t(), term(), [term()], keyword()) ::
  {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • method (term())
  • timeout (term() default: nil)
  • args (tuple() default: {})
  • kwargs (term() default: nil)

Returns

  • list(term())

do_log_stats(ref, opts \\ [])

@spec do_log_stats(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, nil} | {:error, Snakepit.Error.t()}

Log stats if logging is enabled.

Returns

  • nil

do_log_stats_with_interval(ref, opts \\ [])

@spec do_log_stats_with_interval(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, nil} | {:error, Snakepit.Error.t()}

Log stats when the time interval has passed.

Returns

  • nil

from_engine_args(ref, engine_args, args, opts \\ [])

@spec from_engine_args(SnakeBridge.Ref.t(), term(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

Creates an LLM engine from the engine arguments.

Parameters

  • engine_args (term())
  • usage_context (term() default: UsageContext.ENGINE_CONTEXT)
  • stat_loggers (term() default: nil)
  • enable_multiprocessing (boolean() default: false)

Returns

  • term()

from_vllm_config(ref, vllm_config, args, opts \\ [])

@spec from_vllm_config(SnakeBridge.Ref.t(), term(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • vllm_config (term())
  • usage_context (term() default: UsageContext.ENGINE_CONTEXT)
  • stat_loggers (term() default: nil)
  • disable_log_stats (boolean() default: false)

Returns

  • term()

get_metrics(ref, opts \\ [])

@spec get_metrics(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • list(term())

get_num_unfinished_requests(ref, opts \\ [])

@spec get_num_unfinished_requests(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, integer()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • integer()

get_supported_tasks(ref, opts \\ [])

@spec get_supported_tasks(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, {term(), term()}} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • {term(), term()}

get_tokenizer(ref, opts \\ [])

@spec get_tokenizer(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

has_unfinished_requests(ref, opts \\ [])

@spec has_unfinished_requests(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • boolean()

has_unfinished_requests_dp(ref, has_unfinished, opts \\ [])

@spec has_unfinished_requests_dp(SnakeBridge.Ref.t(), boolean(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • has_unfinished (boolean())

Returns

  • boolean()

is_sleeping(ref, opts \\ [])

@spec is_sleeping(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • boolean()

list_loras(ref, opts \\ [])

@spec list_loras(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, MapSet.t(integer())} | {:error, Snakepit.Error.t()}

List all registered adapters.

Returns

  • MapSet.t(integer())

new(vllm_config, executor_class, log_stats, args, opts \\ [])

@spec new(term(), term(), boolean(), [term()], keyword()) ::
  {:ok, SnakeBridge.Ref.t()} | {:error, Snakepit.Error.t()}

Initialize self. See help(type(self)) for accurate signature.

Parameters

  • vllm_config (term())
  • executor_class (term())
  • log_stats (boolean())
  • aggregate_engine_logging (boolean() default: false)
  • usage_context (term() default: UsageContext.ENGINE_CONTEXT)
  • stat_loggers (term() default: nil)
  • mm_registry (term() default: the shared MultiModalRegistry instance)
  • use_cached_outputs (boolean() default: false)
  • multiprocess_mode (boolean() default: false)

pin_lora(ref, lora_id, opts \\ [])

@spec pin_lora(SnakeBridge.Ref.t(), integer(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

Prevent an adapter from being evicted.

Parameters

  • lora_id (integer())

Returns

  • boolean()

remove_lora(ref, lora_id, opts \\ [])

@spec remove_lora(SnakeBridge.Ref.t(), integer(), keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

Remove an already loaded LoRA adapter.

Parameters

  • lora_id (integer())

Returns

  • boolean()

reset_mm_cache(ref, opts \\ [])

@spec reset_mm_cache(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

reset_prefix_cache(ref, args, opts \\ [])

@spec reset_prefix_cache(SnakeBridge.Ref.t(), [term()], keyword()) ::
  {:ok, boolean()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • reset_running_requests (boolean() default: false)
  • reset_connector (boolean() default: false)

Returns

  • boolean()

sleep(ref, args, opts \\ [])

@spec sleep(SnakeBridge.Ref.t(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • level (integer() default: 1)

Returns

  • term()

start_profile(ref, opts \\ [])

@spec start_profile(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

step(ref, opts \\ [])

@spec step(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, [term()]} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • list(term())

stop_profile(ref, opts \\ [])

@spec stop_profile(
  SnakeBridge.Ref.t(),
  keyword()
) :: {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Returns

  • term()

tokenizer(ref)

@spec tokenizer(SnakeBridge.Ref.t()) :: {:ok, term()} | {:error, Snakepit.Error.t()}

validate_outputs(ref, outputs, output_type, opts \\ [])

@spec validate_outputs(SnakeBridge.Ref.t(), term(), term(), keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • outputs (term())
  • output_type (term())

Returns

  • term()

wake_up(ref, args, opts \\ [])

@spec wake_up(SnakeBridge.Ref.t(), [term()], keyword()) ::
  {:ok, term()} | {:error, Snakepit.Error.t()}

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Parameters

  • tags (term() default: nil)

Returns

  • term()