Error Handling Guide
Chainex provides comprehensive error handling capabilities to build resilient AI applications that gracefully handle failures, timeouts, and unexpected conditions.
Core Error Handling Features
Chainex offers three main error handling mechanisms:
- Retry Mechanism - Automatic retry with configurable backoff
- Timeout Protection - Prevent long-running operations from hanging
- Fallback Handling - Graceful degradation with static or dynamic fallbacks
Retry Mechanism
Basic Retry Configuration
# Retry up to 3 times with 1 second delay between attempts
chain = Chainex.Chain.new("{{message}}")
|> Chainex.Chain.with_retry(max_attempts: 3, delay: 1000)
|> Chainex.Chain.llm(:openai)
case Chainex.Chain.run(chain, %{message: "Hello"}) do
{:ok, result} ->
# Success (possibly after retries)
IO.puts("Got response: #{result}")
{:error, reason} ->
# Failed even after retries
IO.puts("Failed: #{inspect(reason)}")
end
Smart Error Detection
The retry mechanism automatically detects retryable errors:
# These errors will trigger retries:
# - Network timeouts
# - Rate limiting (HTTP 429)
# - Server errors (HTTP 5xx)
# - Connection failures
# These errors will NOT trigger retries:
# - Authentication errors (HTTP 401)
# - Invalid requests (HTTP 400)
# - Not found errors (HTTP 404)
Exponential Backoff
# Delay increases with each retry: 1s, 2s, 4s, 8s...
chain = Chainex.Chain.new("{{input}}")
|> Chainex.Chain.with_retry(
max_attempts: 4,
delay: 1000,
backoff: :exponential,
max_delay: 10_000 # Cap at 10 seconds
)
|> Chainex.Chain.llm(:openai)
Timeout Protection
Global Chain Timeout
# Entire chain must complete within 30 seconds
chain = Chainex.Chain.new("{{query}}")
|> Chainex.Chain.with_timeout(30_000)
|> Chainex.Chain.llm(:openai)
|> Chainex.Chain.transform(fn result ->
# Long processing step
perform_analysis(result)
end)
|> Chainex.Chain.llm(:anthropic)
case Chainex.Chain.run(chain, %{query: "Analyze this data"}) do
{:ok, result} -> result
{:error, :timeout} -> "Operation timed out"
{:error, reason} -> "Error: #{inspect(reason)}"
end
Step-Level Timeouts
# Different timeouts for different operations
chain = Chainex.Chain.new("{{input}}")
|> Chainex.Chain.llm(:openai, timeout: 10_000) # 10s for LLM
|> Chainex.Chain.transform(fn result ->
# This transform has the global timeout
expensive_computation(result)
end)
|> Chainex.Chain.llm(:anthropic, timeout: 5_000) # 5s for second LLM
Fallback Handling
Static Fallback
# Return a fixed response on any error
chain = Chainex.Chain.new("{{question}}")
|> Chainex.Chain.with_fallback("I apologize, but I'm experiencing technical difficulties. Please try again later.")
|> Chainex.Chain.llm(:openai)
# Always returns {:ok, result} - never {:error, reason}
{:ok, response} = Chainex.Chain.run(chain, %{question: "What is AI?"})
Dynamic Fallback
# Generate fallback response based on the error
chain = Chainex.Chain.new("{{user_request}}")
|> Chainex.Chain.with_fallback(fn error ->
case error do
{:llm_error, :rate_limit} ->
"I'm currently receiving high traffic. Please try again in a few minutes."
{:llm_error, :timeout} ->
"That request is taking too long to process. Please try a simpler question."
{:network_error, _} ->
"I'm having trouble connecting to my services. Please check your connection."
_ ->
"Something unexpected happened. Please try again or contact support."
end
end)
|> Chainex.Chain.llm(:openai)
Provider Fallback
# Automatically fallback to different LLM providers
chain = Chainex.Chain.new("{{prompt}}")
|> Chainex.Chain.llm(:openai, model: "gpt-4")
|> Chainex.Chain.on_error(fn error ->
case error do
{:llm_error, :rate_limit} ->
# Switch to Anthropic if OpenAI is rate limited
{:fallback_to, :anthropic}
{:llm_error, :model_overloaded} ->
# Use a smaller model
{:fallback_to, {:openai, model: "gpt-3.5-turbo"}}
_ ->
{:continue_error, error}
end
end)
Combined Error Handling
Comprehensive Error Strategy
# Combine retry, timeout, and fallback for maximum resilience
robust_chain = Chainex.Chain.new(
system: "You are a helpful assistant",
user: "{{user_message}}"
)
|> Chainex.Chain.with_retry(max_attempts: 3, delay: 1000)
|> Chainex.Chain.with_timeout(30_000)
|> Chainex.Chain.with_fallback(fn error, context ->
# Context includes retry attempts, elapsed time, etc.
Logger.warning("Chain failed after #{context.retry_attempts} attempts: #{inspect(error)}")
case error do
{:error, :timeout} ->
"Your request is taking longer than expected. The system might be under heavy load."
{:llm_error, :rate_limit} ->
"Our AI service is currently at capacity. Please try again in a few minutes."
_ ->
"I encountered an unexpected issue. Our team has been notified."
end
end)
|> Chainex.Chain.llm(:openai)
Multi-Step Chain Error Handling
# Different error handling for different steps
analysis_chain = Chainex.Chain.new("{{data}}")
|> Chainex.Chain.transform(fn data ->
# Critical step - no fallback, let it fail
validate_input_data(data)
end)
|> Chainex.Chain.llm(:openai, retries: 2)
|> Chainex.Chain.transform(fn analysis ->
# Optional enhancement - fallback to basic analysis
try do
enhanced_analysis(analysis)
rescue
_ -> analysis # Return basic analysis if enhancement fails
end
end)
|> Chainex.Chain.llm(:anthropic)
|> Chainex.Chain.with_fallback("Analysis completed with limited features due to service issues.")
Real-World Error Handling Patterns
Customer Support Bot
def create_support_bot do
Chainex.Chain.new(
system: "You are a customer support agent. Be helpful and understanding.",
user: "Customer: {{message}}"
)
|> Chainex.Chain.with_retry(max_attempts: 2, delay: 500)
|> Chainex.Chain.with_timeout(15_000) # 15 second timeout for customer experience
|> Chainex.Chain.with_fallback(fn error, context ->
# Log for monitoring
CustomerSupport.log_ai_failure(error, context)
# Escalate to human agent
case CustomerSupport.create_ticket(context.variables) do
{:ok, ticket_id} ->
"I apologize for the technical difficulty. I've created ticket ##{ticket_id} and a human agent will assist you shortly."
{:error, _} ->
"I'm experiencing technical issues. Please call our support line at 1-800-SUPPORT for immediate assistance."
end
end)
|> Chainex.Chain.llm(:openai)
end
Data Processing Pipeline
def create_data_processor do
Chainex.Chain.new("{{raw_data}}")
# Step 1: Data validation (fail fast)
|> Chainex.Chain.transform(fn data ->
case DataValidator.validate(data) do
{:ok, clean_data} -> clean_data
{:error, reason} -> raise "Invalid data: #{reason}"
end
end)
# Step 2: AI analysis (with retry and fallback)
|> Chainex.Chain.llm(:openai, retries: 3)
|> Chainex.Chain.on_error(fn
  {:llm_error, _}, context ->
    # Fallback to rule-based analysis of the original input
    # (the callback receives a context carrying the chain variables)
    {:fallback_result, RuleBasedAnalyzer.analyze(context.variables)}
  error, _context ->
    {:continue_error, error}
end)
# Step 3: Report generation (timeout protection)
|> Chainex.Chain.with_timeout(60_000) # 1 minute for report
|> Chainex.Chain.transform(fn analysis ->
ReportGenerator.create(analysis)
end)
|> Chainex.Chain.with_fallback("Analysis completed but report generation failed. Raw results available on request.")
end
Rate Limiting Handler
def create_rate_aware_chain do
Chainex.Chain.new("{{query}}")
|> Chainex.Chain.with_retry(
max_attempts: 5,
delay: 2000,
backoff: :exponential,
max_delay: 30_000
)
|> Chainex.Chain.llm(:openai)
|> Chainex.Chain.on_error(fn error, context ->
case error do
{:llm_error, :rate_limit} ->
# Check if we can switch to a different provider
case RateLimiter.get_available_provider() do
{:ok, provider} ->
Logger.info("Switching to provider: #{provider}")
{:fallback_to, provider}
:none_available ->
# Queue the request for later
QueueManager.enqueue_for_retry(context.variables, delay: 300_000)
{:fallback_result, "Your request has been queued and will be processed when capacity is available."}
end
_ ->
{:continue_error, error}
end
end)
end
Error Monitoring and Logging
Structured Error Logging
# Add error context to all chains
monitored_chain = fn base_chain ->
base_chain
|> Chainex.Chain.with_metadata(%{
request_id: generate_request_id(),
user_id: get_current_user_id(),
timestamp: DateTime.utc_now()
})
|> Chainex.Chain.on_error(fn error, context ->
# Structured logging for monitoring
Logger.error("Chain execution failed", %{
error: inspect(error),
request_id: context.metadata.request_id,
user_id: context.metadata.user_id,
retry_attempts: context.retry_attempts,
elapsed_time_ms: context.elapsed_time,
chain_steps: length(context.executed_steps)
})
# Send to error tracking service
ErrorTracker.report(error, context)
{:continue_error, error}
end)
end
Health Checks
def perform_health_check do
health_chain = Chainex.Chain.new("Health check: respond with 'OK'")
|> Chainex.Chain.with_timeout(5000)
|> Chainex.Chain.llm(:openai)
case Chainex.Chain.run(health_chain) do
{:ok, response} when response =~ ~r/ok/i ->
{:healthy, "LLM services operational"}
{:ok, _unexpected} ->
{:degraded, "LLM services responding but may be impaired"}
{:error, :timeout} ->
{:unhealthy, "LLM services not responding"}
{:error, reason} ->
{:unhealthy, "LLM services error: #{inspect(reason)}"}
end
end
Testing Error Handling
Unit Tests
defmodule MyApp.ErrorHandlingTest do
use ExUnit.Case
test "handles rate limiting with retry" do
chain = Chainex.Chain.new("{{message}}")
|> Chainex.Chain.with_retry(max_attempts: 2, delay: 10)
|> Chainex.Chain.with_fallback("Rate limited fallback")
|> Chainex.Chain.llm(:mock, mock_error: {:rate_limit, "Too many requests"})
{:ok, result} = Chainex.Chain.run(chain, %{message: "Hello"})
assert result == "Rate limited fallback"
end
test "timeout with fallback" do
chain = Chainex.Chain.new("Test")
|> Chainex.Chain.with_timeout(50)
|> Chainex.Chain.with_fallback("Timeout fallback")
|> Chainex.Chain.transform(fn _input ->
Process.sleep(100) # Simulate slow operation
"Should timeout"
end)
{:ok, result} = Chainex.Chain.run(chain)
assert result == "Timeout fallback"
end
end
Best Practices
1. Layered Error Handling
Apply error handling at appropriate levels:
- Transport level: Network timeouts, connection failures
- Service level: Rate limiting, service unavailability
- Application level: Business logic errors, validation failures
- User level: Friendly error messages, graceful degradation
2. Error Context
Always provide context in error handling:
- Include request IDs for tracing
- Log user actions that led to errors
- Track error patterns for system health monitoring
3. Graceful Degradation
Design fallback strategies that maintain user experience:
- Reduced functionality over complete failure
- Clear communication about limitations
- Alternative paths to achieve user goals
4. Testing
Test error conditions thoroughly:
- Network failures and timeouts
- Rate limiting scenarios
- Invalid responses and parsing errors
- Concurrent error conditions
Error handling is critical for production LLM applications. Chainex's comprehensive error handling features help you build resilient systems that gracefully handle the inherent unpredictability of AI services.