Gemini.RateLimiter.Manager (GeminiEx v0.10.0)


Central rate limiter manager that coordinates request submission.

Wraps outbound requests with:

  • Rate limit checking and enforcement
  • Concurrency gating
  • Token budgeting
  • Retry handling with backoff

Enabled by default. Use disable_rate_limiter: true to opt out.

Features

  • ETS-based state for cross-process visibility
  • Per-model/location/metric tracking
  • Configurable concurrency limits with adaptive mode
  • Token budget estimation and tracking
  • Telemetry event emission

Usage

# Execute a request through the rate limiter
{:ok, response} = Manager.execute(
  fn -> HTTP.post(path, body, opts) end,
  "gemini-flash-lite-latest",
  opts
)

# Non-blocking mode returns immediately if rate limited
case Manager.execute(fn -> ... end, model, non_blocking: true) do
  {:ok, response} -> handle_response(response)
  {:error, {:rate_limited, retry_at, details}} -> schedule_retry(retry_at)
end

Configuration

Configure via application environment or per-request options:

config :gemini_ex, :rate_limiter,
  max_concurrency_per_model: 4,
  max_attempts: 3,
  base_backoff_ms: 1000,
  profile: :prod

Per-request overrides:

Gemini.generate("Hello", [
  disable_rate_limiter: true,  # Bypass rate limiter
  non_blocking: true,          # Return immediately if rate limited
  max_concurrency_per_model: 8 # Override concurrency
])

Summary

Functions

check_status(model, opts \\ [])
  Check if a request would be rate limited without executing it.

child_spec(init_arg)
  Returns a specification to start this module under a supervisor.

execute(request_fn, model, opts \\ [])
  Execute a request through the rate limiter.

execute_streaming(start_fn, model, opts \\ [])
  Execute a long-lived streaming request through the rate limiter.

execute_with_usage_tracking(request_fn, model, opts \\ [])
  Execute a request, extracting and recording usage from the response.

get_retry_state(model, opts \\ [])
  Get the current retry state for a model.

get_usage(model, opts \\ [])
  Get current token usage for a model.

reset_all()
  Reset all rate limiter state (useful for testing).

start_link(opts \\ [])
  Start the rate limiter manager.

Types

execute_opts()

@type execute_opts() :: keyword()

request_fn()

@type request_fn() :: (-> {:ok, term()} | {:error, term()})

streaming_release_fn()

@type streaming_release_fn() :: (atom(), map() | nil -> :ok)

Functions

check_status(model, opts \\ [])

@spec check_status(String.t(), execute_opts()) ::
  :ok
  | {:rate_limited, DateTime.t(), map()}
  | {:over_budget, map()}
  | {:no_permits, non_neg_integer()}

Check if a request would be rate limited without executing it.

Returns

  • :ok - Request can proceed
  • {:rate_limited, retry_at, details} - Currently rate limited
  • {:over_budget, usage} - Would exceed token budget
  • {:no_permits, available} - No concurrency permits available

child_spec(init_arg)

Returns a specification to start this module under a supervisor.

See Supervisor.

execute(request_fn, model, opts \\ [])

@spec execute(request_fn(), String.t(), execute_opts()) ::
  {:ok, term()} | {:error, term()}

Execute a request through the rate limiter.

Parameters

  • request_fn - Zero-arity function that makes the actual HTTP request
  • model - Model name for rate limit tracking
  • opts - Options for rate limiting and the underlying request

Options

  • :location - Location for rate limit tracking (default: "us-central1")
  • :disable_rate_limiter - Bypass all rate limiting (default: false)
  • :non_blocking - Return immediately if rate limited (default: false)
  • :max_concurrency_per_model - Override concurrency limit
  • :estimated_input_tokens - Estimated tokens for budget checking
  • :estimated_cached_tokens - Estimated cached-context tokens for budget checking
  • :token_budget_per_window - Maximum tokens per window (nil = no limit)

Returns

  • {:ok, response} - Request succeeded
  • {:error, {:rate_limited, retry_at, details}} - Rate limited
  • {:error, {:transient_failure, attempts, last_error}} - Transient failure
  • {:error, term()} - Other error
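Handling each return shape explicitly keeps rate-limit errors distinguishable from ordinary request failures. A sketch, reusing the HTTP.post/3 call from the usage example above; path, body, model, and opts are assumed to be in scope:

```elixir
require Logger
alias Gemini.RateLimiter.Manager

case Manager.execute(fn -> HTTP.post(path, body, opts) end, model, opts) do
  {:ok, response} ->
    {:ok, response}

  {:error, {:rate_limited, retry_at, _details}} ->
    # Still rate limited after any blocking wait; surface the retry time.
    {:error, {:retry_after, retry_at}}

  {:error, {:transient_failure, attempts, last_error}} ->
    # Retries with backoff were exhausted without a successful response.
    Logger.error("gave up after #{attempts} attempts: #{inspect(last_error)}")
    {:error, last_error}

  {:error, reason} ->
    # Non-retryable error from the underlying request.
    {:error, reason}
end
```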

execute_streaming(start_fn, model, opts \\ [])

@spec execute_streaming(request_fn(), String.t(), execute_opts()) ::
  {:ok, {term(), streaming_release_fn()}} | {:error, term()}

Execute a long-lived streaming request through the rate limiter.

Returns the start result together with a release function. The release function must be called exactly once — when the stream completes, errors, or is stopped — so the manager can reconcile the token budget and release the concurrency permit.
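Because the permit is held for the lifetime of the stream, the release function should be invoked on every exit path. A sketch; start_stream/0 and consume/1 are hypothetical, and the :completed/:error status atoms and the usage-map shape passed to the release function are assumptions, not documented values:

```elixir
alias Gemini.RateLimiter.Manager

{:ok, {stream_ref, release}} =
  Manager.execute_streaming(fn -> start_stream() end, "gemini-flash-lite-latest", [])

try do
  usage = consume(stream_ref)
  # Report the final outcome and observed usage so the budget is reconciled.
  release.(:completed, usage)
rescue
  e ->
    # No usage data on failure; still release the concurrency permit.
    release.(:error, nil)
    reraise e, __STACKTRACE__
end
```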

execute_with_usage_tracking(request_fn, model, opts \\ [])

@spec execute_with_usage_tracking(request_fn(), String.t(), execute_opts()) ::
  {:ok, term()} | {:error, term()}

Execute a request, extracting and recording usage from the response.

Similar to execute/3 but also records token usage from successful responses.
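Pairing a pre-flight token estimate with post-hoc usage recording lets the limiter correct its budget against what the API actually charged. A sketch; the estimate and budget numbers are illustrative:

```elixir
alias Gemini.RateLimiter.Manager

{:ok, response} =
  Manager.execute_with_usage_tracking(
    fn -> HTTP.post(path, body, opts) end,
    "gemini-flash-lite-latest",
    estimated_input_tokens: 800,
    token_budget_per_window: 250_000
  )

# Recorded usage for the model is then visible via get_usage/2.
Manager.get_usage("gemini-flash-lite-latest")
```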

get_retry_state(model, opts \\ [])

@spec get_retry_state(
  String.t(),
  keyword()
) :: Gemini.RateLimiter.State.retry_state() | nil

Get the current retry state for a model.

get_usage(model, opts \\ [])

@spec get_usage(
  String.t(),
  keyword()
) :: Gemini.RateLimiter.State.usage_window() | nil

Get current token usage for a model.

reset_all()

@spec reset_all() :: :ok

Reset all rate limiter state (useful for testing).

start_link(opts \\ [])

@spec start_link(keyword()) :: GenServer.on_start()

Start the rate limiter manager.