# Quick Start Guide
Get up and running with DeepEvalEx in 5 minutes.
## Prerequisites
- Elixir 1.15+
- An OpenAI API key (or other LLM provider)
## Installation

Add the dependency to `mix.exs`:
```elixir
def deps do
  [{:deep_eval_ex, "~> 0.1.0"}]
end
```

Then fetch dependencies:

```shell
mix deps.get
```
## Configuration

Set your API key:

```shell
export OPENAI_API_KEY="sk-..."
```
Or in `config/config.exs`:

```elixir
config :deep_eval_ex,
  openai_api_key: System.get_env("OPENAI_API_KEY")
```
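If you deploy releases, the environment variable may not exist at compile time, so reading it in `config/runtime.exs` (evaluated at boot) is the usual Elixir pattern. A minimal sketch using the same config key:

```elixir
# config/runtime.exs: runs when the app boots, so the env var is read at runtime
import Config

config :deep_eval_ex,
  openai_api_key: System.fetch_env!("OPENAI_API_KEY")
```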
## Your First Evaluation

### 1. Simple Exact Match
```elixir
alias DeepEvalEx.{TestCase, Metrics}

# Create a test case
test_case = TestCase.new!(
  input: "What is 2 + 2?",
  actual_output: "4",
  expected_output: "4"
)

# Evaluate with ExactMatch
{:ok, result} = Metrics.ExactMatch.measure(test_case)

IO.puts("Score: #{result.score}")    # => 1.0
IO.puts("Passed: #{result.success}") # => true
```
### 2. LLM-as-Judge with GEval

```elixir
alias DeepEvalEx.{TestCase, Metrics.GEval}

# Define evaluation criteria
metric = GEval.new(
  name: "Helpfulness",
  criteria: "Is the response helpful and addresses the user's question?",
  evaluation_params: [:input, :actual_output]
)

# Create a test case
test_case = TestCase.new!(
  input: "How do I center a div in CSS?",
  actual_output: "Use flexbox: display: flex; justify-content: center; align-items: center;"
)

# Evaluate
{:ok, result} = GEval.evaluate(metric, test_case)

IO.puts("Score: #{result.score}")   # => 0.85
IO.puts("Reason: #{result.reason}") # => "The response provides a correct and concise solution..."
```
### 3. RAG Evaluation

```elixir
alias DeepEvalEx.{TestCase, Metrics.GEval}

# Evaluate whether the response is grounded in the retrieved context
metric = GEval.new(
  name: "Groundedness",
  criteria: "Is the response accurately based on the provided context?",
  evaluation_params: [:input, :actual_output, :retrieval_context]
)

test_case = TestCase.new!(
  input: "What is the company's remote work policy?",
  actual_output: "Employees can work remotely up to 3 days per week.",
  retrieval_context: [
    "Policy 4.2: Full-time employees may work from home up to 3 days per week.",
    "Policy 4.3: Remote work requests must be approved by the direct manager."
  ]
)

{:ok, result} = GEval.evaluate(metric, test_case)
```
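A quick way to sanity-check a groundedness metric is to score an answer that contradicts the context; a reasonable judge should return a low score. A sketch reusing the `metric` above:

```elixir
# An answer that contradicts the retrieved policy should score poorly
bad_case = TestCase.new!(
  input: "What is the company's remote work policy?",
  actual_output: "Employees may work fully remote with no approval required.",
  retrieval_context: [
    "Policy 4.2: Full-time employees may work from home up to 3 days per week.",
    "Policy 4.3: Remote work requests must be approved by the direct manager."
  ]
)

# Expect a low score with a reason citing the contradiction
{:ok, result} = GEval.evaluate(metric, bad_case)
```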
## Batch Evaluation

Evaluate multiple test cases concurrently:
```elixir
alias DeepEvalEx.{TestCase, Metrics}

test_cases = [
  TestCase.new!(input: "Q1", actual_output: "A1", expected_output: "A1"),
  TestCase.new!(input: "Q2", actual_output: "A2", expected_output: "A2"),
  TestCase.new!(input: "Q3", actual_output: "A3", expected_output: "A3")
]

results = DeepEvalEx.evaluate_batch(test_cases, [Metrics.ExactMatch])

# `results` is a list of result lists (one inner list per test case)
Enum.each(results, fn [result] ->
  IO.puts("#{result.metric}: #{result.score}")
end)
```
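Because results are plain data, summarizing a run is ordinary `Enum` work. For example, computing a pass rate across the whole batch:

```elixir
# Flatten the per-test-case result lists and count passing metrics
flat_results = List.flatten(results)
passed = Enum.count(flat_results, & &1.success)
IO.puts("Pass rate: #{passed}/#{length(flat_results)}")
```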
## Using in Tests

```elixir
defmodule MyApp.LLMTest do
  use ExUnit.Case

  alias DeepEvalEx.{TestCase, Metrics}
test "LLM returns accurate responses" do
test_case = TestCase.new!(
input: "What is the capital of France?",
actual_output: get_llm_response("What is the capital of France?"),
expected_output: "Paris"
)
{:ok, result} = Metrics.ExactMatch.measure(test_case,
case_sensitive: false
)
assert result.success, "Expected pass but got: #{result.reason}"
end
test "LLM responses are helpful" do
metric = Metrics.GEval.new(
name: "Helpfulness",
criteria: "The response should be helpful and actionable",
evaluation_params: [:input, :actual_output],
threshold: 0.7
)
test_case = TestCase.new!(
input: "How do I learn Elixir?",
actual_output: get_llm_response("How do I learn Elixir?")
)
{:ok, result} = Metrics.GEval.evaluate(metric, test_case)
assert result.success, """
Helpfulness score too low: #{result.score}
Reason: #{result.reason}
"""
end
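
  # NOTE: get_llm_response/1 used above is assumed to be your own helper
  # that calls the model under test. A hypothetical stub for illustration;
  # replace MyApp.LLM.complete/1 with your real client:
  defp get_llm_response(prompt) do
    MyApp.LLM.complete(prompt)
  end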
end
```
## Next Steps

- Metrics Overview - Learn about all available metrics
- GEval Guide - Deep dive into LLM-as-judge evaluation
- Configuration - Advanced configuration options
- Custom Metrics - Build your own metrics