PhoenixAI includes a guardrails pipeline that runs safety policies before your AI call.
Policies execute sequentially and halt on the first violation, following the same
railway-oriented pattern used in PhoenixAI.Pipeline.
Quick Start
alias PhoenixAI.Guardrails.{Pipeline, Request}
alias PhoenixAI.Message
# Build a request from user messages
request = %Request{
messages: [%Message{role: :user, content: "Hello, how are you?"}]
}
# Use a preset or build your own policy list
policies = Pipeline.preset(:default)
case Pipeline.run(policies, request) do
{:ok, request} ->
# Safe — proceed with AI call
AI.chat(request.messages, provider: :openai)
{:error, violation} ->
# Blocked — return a safe response
IO.puts("Blocked: #{violation.reason}")
end
Presets
Three built-in presets cover common configurations:
# Minimal — jailbreak detection only
Pipeline.preset(:default)
# => [{JailbreakDetection, []}]
# Maximum protection — all three policies
Pipeline.preset(:strict)
# => [{JailbreakDetection, []}, {ContentFilter, []}, {ToolPolicy, []}]
# Reduced false positives — higher jailbreak threshold
Pipeline.preset(:permissive)
# => [{JailbreakDetection, [threshold: 0.9]}]
Configuration with from_config/1
Build a policy list from a keyword config validated by NimbleOptions:
# From a preset
{:ok, policies} = Pipeline.from_config(preset: :strict)
# Preset with overrides
{:ok, policies} = Pipeline.from_config(
preset: :default,
jailbreak_threshold: 0.5,
jailbreak_scope: :all_user_messages
)
# Explicit policy list
{:ok, policies} = Pipeline.from_config(
policies: [
{JailbreakDetection, [threshold: 0.6]},
{ContentFilter, [pre: &MyApp.sanitize/1]},
{ToolPolicy, [allow: ["search", "calculate"]]}
]
)
# Invalid config returns a descriptive error
{:error, %NimbleOptions.ValidationError{}} =
Pipeline.from_config(preset: :unknown)
Config Options
| Key | Type | Description |
|---|---|---|
| :preset | :default \| :strict \| :permissive | Named preset |
| :policies | [{module, keyword}] | Explicit policy list (overrides preset) |
| :jailbreak_threshold | float | Override jailbreak score threshold |
| :jailbreak_scope | :last_message \| :all_user_messages | Override jailbreak scan scope |
| :jailbreak_detector | atom | Override jailbreak detector module |
Built-in Policies
Jailbreak Detection
Detects jailbreak attempts using pattern matching against known attack categories: role overrides, instruction overrides, DAN patterns, and base64 encoding evasion.
# Default threshold (0.7), scans last user message
policies = [{JailbreakDetection, []}]
# Custom threshold and scope
policies = [{JailbreakDetection, [
threshold: 0.5,
scope: :all_user_messages
]}]
# Custom detector module
policies = [{JailbreakDetection, [
detector: MyApp.MLJailbreakDetector
]}]
Custom Detector
Implement the JailbreakDetector behaviour to use your own detection logic:
defmodule MyApp.MLJailbreakDetector do
@behaviour PhoenixAI.Guardrails.JailbreakDetector
alias PhoenixAI.Guardrails.JailbreakDetector.DetectionResult
@impl true
def detect(content, _opts) do
score = MyApp.ML.score_jailbreak(content)
result = %DetectionResult{
score: score,
patterns: if(score > 0.5, do: ["ml_detected"], else: []),
details: %{model: "jailbreak-v2"}
}
if score > 0.5, do: {:detected, result}, else: {:safe, result}
end
end
Content Filter
Applies user-defined function hooks for content inspection. The :pre hook runs first,
then :post. If :pre rejects, :post is skipped.
# Sanitize messages before AI call
pre_hook = fn request ->
cleaned = Enum.map(request.messages, fn msg ->
%{msg | content: String.replace(msg.content, ~r/<[^>]+>/, "")}
end)
{:ok, %{request | messages: cleaned}}
end
# Validate after all other policies pass
post_hook = fn request ->
if has_pii?(request.messages) do
{:error, "PII detected in messages"}
else
{:ok, request}
end
end
policies = [{ContentFilter, [pre: pre_hook, post: post_hook]}]
Hooks must return {:ok, %Request{}} to continue or {:error, reason} to halt.
Tool Policy
Enforces allowlists or denylists for tool calls. Only one mode can be active at a time.
# Allowlist — only these tools are permitted
policies = [{ToolPolicy, [allow: ["search", "calculate"]]}]
# Denylist — these tools are blocked, all others pass
policies = [{ToolPolicy, [deny: ["delete_all", "drop_table"]]}]
When no tool calls are present in the request, the policy passes through.
Custom Policies
Implement the Policy behaviour to create your own:
defmodule MyApp.Guardrails.RateLimitPolicy do
@behaviour PhoenixAI.Guardrails.Policy
alias PhoenixAI.Guardrails.{PolicyViolation, Request}
@impl true
def check(%Request{} = request, opts) do
user_id = request.user_id
limit = Keyword.get(opts, :limit, 100)
if MyApp.RateLimiter.within_limit?(user_id, limit) do
{:ok, request}
else
{:halt, %PolicyViolation{
policy: __MODULE__,
reason: "Rate limit exceeded for user #{user_id}",
metadata: %{user_id: user_id, limit: limit}
}}
end
end
end
# Use it in a pipeline
policies = [
{MyApp.Guardrails.RateLimitPolicy, [limit: 50]},
{JailbreakDetection, []}
]
Inter-Policy Communication
Use the assigns field on Request to pass data between policies:
# First policy adds data
def check(request, _opts) do
{:ok, %{request | assigns: Map.put(request.assigns, :sanitized, true)}}
end
# Later policy reads it
def check(request, _opts) do
if request.assigns[:sanitized] do
{:ok, request}
else
{:halt, %PolicyViolation{policy: __MODULE__, reason: "Not sanitized"}}
end
end
Telemetry
The guardrails pipeline emits telemetry events for observability:
| Event | Metadata |
|---|---|
| [:phoenix_ai, :guardrails, :check, :start] | %{policy_count: integer} |
| [:phoenix_ai, :guardrails, :check, :stop] | %{policy_count: integer} + duration |
| [:phoenix_ai, :guardrails, :check, :exception] | %{policy_count: integer} + exception info |
| [:phoenix_ai, :guardrails, :policy, :start] | %{policy: module} |
| [:phoenix_ai, :guardrails, :policy, :stop] | %{policy: module, result: :pass \| :violation} + duration |
| [:phoenix_ai, :guardrails, :jailbreak, :detected] | %{score: float, threshold: float, patterns: [String.t()]} |
Example: Logging policy execution
:telemetry.attach(
"guardrails-logger",
[:phoenix_ai, :guardrails, :policy, :stop],
fn _event, measurements, metadata, _config ->
duration_ms = System.convert_time_unit(measurements.duration, :native, :millisecond)
Logger.info(
"Guardrails policy #{inspect(metadata.policy)} " <>
"result=#{metadata.result} duration=#{duration_ms}ms"
)
end,
nil
)
Example: Alerting on jailbreak detection
:telemetry.attach(
"jailbreak-alert",
[:phoenix_ai, :guardrails, :jailbreak, :detected],
fn _event, _measurements, metadata, _config ->
Logger.warning(
"Jailbreak detected: score=#{metadata.score} " <>
"threshold=#{metadata.threshold} patterns=#{inspect(metadata.patterns)}"
)
end,
nil
)
Full Example: Integrating with AI.chat/2
defmodule MyApp.SafeChat do
alias PhoenixAI.Guardrails.{Pipeline, Request}
alias PhoenixAI.Message
def chat(user_input, opts \\ []) do
request = %Request{
messages: [
%Message{role: :system, content: "You are a helpful assistant."},
%Message{role: :user, content: user_input}
],
user_id: opts[:user_id]
}
with {:ok, policies} <- Pipeline.from_config(preset: :strict, jailbreak_threshold: 0.5),
{:ok, safe_request} <- Pipeline.run(policies, request) do
AI.chat(safe_request.messages, provider: :openai)
else
{:error, %PhoenixAI.Guardrails.PolicyViolation{} = v} ->
{:error, {:blocked, v.reason}}
{:error, reason} ->
{:error, reason}
end
end
end