Kreuzberg (kreuzberg v4.9.5)

High-performance document extraction for Elixir.

Examples

# Extract from binary with MIME type
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf")

# With configuration
config = %Kreuzberg.ExtractionConfig{force_ocr: true}
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf", config)

# Bang variant
result = Kreuzberg.extract!(pdf_binary, "application/pdf")

Summary

Functions

batch_extract_bytes(data_list, mime_types, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_bytes/3.

batch_extract_bytes!(data_list, mime_types, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_bytes!/3.

batch_extract_bytes_async(data_list, mime_types, config \\ nil)

See Kreuzberg.AsyncAPI.batch_extract_bytes_async/3.

batch_extract_files(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_files/3.

batch_extract_files!(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_files!/3.

batch_extract_files_async(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.AsyncAPI.batch_extract_files_async/3.

cache_stats()

See Kreuzberg.CacheAPI.cache_stats/0.

cache_stats!()

See Kreuzberg.CacheAPI.cache_stats!/0.

classify_error(reason)

See Kreuzberg.UtilityAPI.classify_error/1.

clear_cache()

See Kreuzberg.CacheAPI.clear_cache/0.

clear_cache!()

See Kreuzberg.CacheAPI.clear_cache!/0.

detect_mime_type(data)

See Kreuzberg.UtilityAPI.detect_mime_type/1.

detect_mime_type_from_path(path)

See Kreuzberg.UtilityAPI.detect_mime_type_from_path/1.

discover_extraction_config()

See Kreuzberg.ExtractionConfig.discover/0.

do_embed(texts, config \\ nil)

Generate text embeddings for a list of strings.

do_embed!(texts, config \\ nil)

Generate text embeddings, raising on error.

embed(texts, config \\ nil)

See Kreuzberg.do_embed/2.

embed!(texts, config \\ nil)

See Kreuzberg.do_embed!/2.

extract(input, mime_type, config \\ nil)

Extract content from binary document data.

extract!(input, mime_type, config \\ nil)

Extract content, raising on error.

extract_async(input, mime_type, config \\ nil)

See Kreuzberg.AsyncAPI.extract_async/3.

extract_file(path, mime_type \\ nil, config \\ nil)

Extract content from a file at the given path.

extract_file!(path, mime_type \\ nil, config \\ nil)

Extract content from a file, raising on error.

extract_file_async(path, mime_type \\ nil, config \\ nil)

See Kreuzberg.AsyncAPI.extract_file_async/3.

extract_with_plugins(input, mime_type, config \\ nil, plugin_opts \\ [])

Extract content with plugin processing support.

get_embedding_preset(name)

See Kreuzberg.UtilityAPI.get_embedding_preset/1.

get_error_details()

See Kreuzberg.UtilityAPI.get_error_details/0.

get_extensions_for_mime(mime_type)

See Kreuzberg.UtilityAPI.get_extensions_for_mime/1.

list_embedding_presets()

See Kreuzberg.UtilityAPI.list_embedding_presets/0.

render_pdf_page(path, page_index, opts \\ [])

Render a single PDF page as a PNG image.

render_pdf_pages_stream(path, opts \\ [])

Return a lazy Stream that yields {page_index, png_binary} tuples.

validate_binarization_method(method)

See Kreuzberg.Validators.validate_binarization_method/1.

validate_chunking_params(params)

See Kreuzberg.Validators.validate_chunking_params/1.

validate_confidence(confidence)

See Kreuzberg.Validators.validate_confidence/1.

validate_dpi(dpi)

See Kreuzberg.Validators.validate_dpi/1.

validate_language_code(code)

See Kreuzberg.Validators.validate_language_code/1.

validate_mime_type(mime_type)

See Kreuzberg.UtilityAPI.validate_mime_type/1.

validate_ocr_backend(backend)

See Kreuzberg.Validators.validate_ocr_backend/1.

validate_tesseract_oem(oem)

See Kreuzberg.Validators.validate_tesseract_oem/1.

validate_tesseract_psm(psm)

See Kreuzberg.Validators.validate_tesseract_psm/1.

Functions

batch_extract_bytes(data_list, mime_types, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_bytes/3.

batch_extract_bytes!(data_list, mime_types, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_bytes!/3.

batch_extract_bytes_async(data_list, mime_types, config \\ nil)

See Kreuzberg.AsyncAPI.batch_extract_bytes_async/3.

batch_extract_files(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_files/3.

batch_extract_files!(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_files!/3.

batch_extract_files_async(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.AsyncAPI.batch_extract_files_async/3.

cache_stats()

See Kreuzberg.CacheAPI.cache_stats/0.

cache_stats!()

See Kreuzberg.CacheAPI.cache_stats!/0.

classify_error(reason)

See Kreuzberg.UtilityAPI.classify_error/1.

clear_cache()

See Kreuzberg.CacheAPI.clear_cache/0.

clear_cache!()

See Kreuzberg.CacheAPI.clear_cache!/0.

detect_mime_type(data)

See Kreuzberg.UtilityAPI.detect_mime_type/1.

detect_mime_type_from_path(path)

See Kreuzberg.UtilityAPI.detect_mime_type_from_path/1.

discover_extraction_config()

See Kreuzberg.ExtractionConfig.discover/0.

do_embed(texts, config \\ nil)

Generate text embeddings for a list of strings.

Parameters

texts - List of strings to embed
config - EmbeddingConfig struct or nil

Returns

{:ok, [[float()]]} - List of embedding vectors
{:error, reason} - Embedding failed

Examples

# Embed with default config (balanced preset)
iex> {:ok, embeddings} = Kreuzberg.embed(["Hello world", "How are you?"])
iex> length(embeddings) == 2
true

# Embed with a specific preset
iex> config = %Kreuzberg.EmbeddingConfig{model: {:preset, "fast"}}
iex> {:ok, embeddings} = Kreuzberg.embed(["Hello world"], config)
iex> is_list(hd(embeddings))
true

do_embed!(texts, config \\ nil)

Generate text embeddings, raising on error.

Same as do_embed/2 but raises a Kreuzberg.Error on failure.

Examples

# Embed and get results directly
iex> embeddings = Kreuzberg.embed!(["Hello world"])
iex> is_list(embeddings)
true

# Each embedding is a list of floats
iex> [vector | _rest] = Kreuzberg.embed!(["Test sentence"])
iex> is_float(hd(vector))
true

embed(texts, config \\ nil)

See Kreuzberg.do_embed/2.

embed!(texts, config \\ nil)

See Kreuzberg.do_embed!/2.

extract(input, mime_type, config \\ nil)

@spec extract(
  binary(),
  String.t(),
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil
) ::
  {:ok, Kreuzberg.ExtractionResult.t()} | {:error, String.t()}

Extract content from binary document data.

Performs document extraction on binary input with support for various file formats. Returns extracted content including text, metadata, tables, images, and more. If no configuration is provided, uses default extraction settings.

Parameters

input - Binary document data to extract from
mime_type - MIME type of the document (e.g., "application/pdf", "text/plain")
config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)

Returns

{:ok, ExtractionResult.t()} - Successfully extracted content with metadata
{:error, reason} - Extraction failed with error message

Examples

# Extract from binary with MIME type
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf")
result.content

# Extract with configuration
config = %Kreuzberg.ExtractionConfig{ocr: %{"enabled" => true}}
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf", config)

# With keyword list configuration
{:ok, result} = Kreuzberg.extract(
  pdf_binary,
  "application/pdf",
  ocr: %{"enabled" => true}
)

extract!(input, mime_type, config \\ nil)

@spec extract!(
  binary(),
  String.t(),
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil
) ::
  Kreuzberg.ExtractionResult.t()

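Extract content, raising on error.

Same as extract/3 but returns the ExtractionResult directly instead of an {:ok, result} tuple, and raises if extraction fails.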

extract_async(input, mime_type, config \\ nil)

See Kreuzberg.AsyncAPI.extract_async/3.

extract_file(path, mime_type \\ nil, config \\ nil)

@spec extract_file(
  String.t() | Path.t(),
  String.t() | nil,
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil
) :: {:ok, Kreuzberg.ExtractionResult.t()} | {:error, String.t()}

Extract content from a file at the given path.

Accepts a file path and optional MIME type, returning extracted content. If no MIME type is provided, the library will attempt to detect it from the file.

Parameters

path - File path (String or Path.t())
mime_type - MIME type of the file (optional, defaults to nil for auto-detection)
config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)

Returns

{:ok, ExtractionResult.t()} - Successfully extracted content
{:error, reason} - Extraction failed with error message

Examples

# Extract with explicit MIME type
{:ok, result} = Kreuzberg.extract_file("document.pdf", "application/pdf")
result.content

# Extract with auto-detection
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# With configuration
config = %Kreuzberg.ExtractionConfig{force_ocr: true}
{:ok, result} = Kreuzberg.extract_file("document.pdf", "application/pdf", config)

# With keyword list configuration
{:ok, result} = Kreuzberg.extract_file(
  "document.pdf",
  "application/pdf",
  ocr: %{"enabled" => true}
)

extract_file!(path, mime_type \\ nil, config \\ nil)

@spec extract_file!(
  String.t() | Path.t(),
  String.t() | nil,
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil
) :: Kreuzberg.ExtractionResult.t()

Extract content from a file, raising on error.

Same as extract_file/3 but raises a Kreuzberg.Error exception if extraction fails.

Parameters

path - File path (String or Path.t())
mime_type - MIME type of the file (optional, defaults to nil for auto-detection)
config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)

Returns

ExtractionResult.t() - Successfully extracted content

Raises

Kreuzberg.Error - If extraction fails

Examples

# Extract with explicit MIME type, raising on error
result = Kreuzberg.extract_file!("document.pdf", "application/pdf")
result.content

# Extract with auto-detection, raising on error
result = Kreuzberg.extract_file!("document.pdf")
result.content

# With configuration
config = %Kreuzberg.ExtractionConfig{ocr: %{"enabled" => true}}
result = Kreuzberg.extract_file!("document.pdf", "application/pdf", config)

extract_file_async(path, mime_type \\ nil, config \\ nil)

See Kreuzberg.AsyncAPI.extract_file_async/3.

extract_with_plugins(input, mime_type, config \\ nil, plugin_opts \\ [])

@spec extract_with_plugins(
  binary(),
  String.t(),
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil,
  keyword()
) :: {:ok, Kreuzberg.ExtractionResult.t()} | {:error, String.t()}

Extract content with plugin processing support.

Performs document extraction with additional processing through registered plugins. Applies validators before extraction, post-processors by stage (early, middle, late) after extraction, and optional final validators to the result.

Plugins are retrieved from the Plugin.Registry if not explicitly provided in plugin_opts.

Parameters

input - Binary document data to extract from
mime_type - MIME type of the document (e.g., "application/pdf")
config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)
plugin_opts - Keyword list of plugin options (optional):
- :validators - List of validator modules to run before extraction
- :post_processors - Map of stage atoms to lists of post-processor modules
  - :early - Applied first to extraction result
  - :middle - Applied after early processors
  - :late - Applied last before final validators
- :final_validators - List of validator modules to run after post-processing

Returns

{:ok, ExtractionResult.t()} - Successfully extracted and processed content
{:error, reason} - Extraction or processing failed with error message

Plugin Processing Flow

1. Validators - If specified, run input validators to check extraction preconditions
2. Extraction - Call extract/3 to get the initial result
3. Post-Processors - Apply by stage in order (early → middle → late)
   - Each processor receives the extraction result or the output from the previous processor
   - Processors should return the modified result or data
4. Final Validators - If specified, validate the processed result
5. Return - Return the enhanced extraction result

Examples

# Extract with registered validators and post-processors
{:ok, result} = Kreuzberg.extract_with_plugins(
  pdf_binary,
  "application/pdf",
  nil,
  validators: [MyApp.InputValidator],
  post_processors: %{
    early: [MyApp.EarlyProcessor],
    middle: [MyApp.MiddleProcessor],
    late: [MyApp.FinalProcessor]
  },
  final_validators: [MyApp.ResultValidator]
)

# Extract with only post-processors
{:ok, result} = Kreuzberg.extract_with_plugins(
  pdf_binary,
  "application/pdf",
  %{use_cache: true},
  post_processors: %{
    early: [MyApp.Processor1, MyApp.Processor2]
  }
)

# Extract with configuration and validators only
config = %Kreuzberg.ExtractionConfig{ocr: %{"enabled" => true}}
{:ok, result} = Kreuzberg.extract_with_plugins(
  pdf_binary,
  "application/pdf",
  config,
  validators: [MyApp.Validator]
)

# Extract with no plugins (standard extraction)
{:ok, result} = Kreuzberg.extract_with_plugins(pdf_binary, "application/pdf")

get_embedding_preset(name)

See Kreuzberg.UtilityAPI.get_embedding_preset/1.

get_error_details()

See Kreuzberg.UtilityAPI.get_error_details/0.

get_extensions_for_mime(mime_type)

See Kreuzberg.UtilityAPI.get_extensions_for_mime/1.

list_embedding_presets()

See Kreuzberg.UtilityAPI.list_embedding_presets/0.

render_pdf_page(path, page_index, opts \\ [])

@spec render_pdf_page(String.t(), non_neg_integer(), keyword()) ::
  {:ok, binary()} | {:error, String.t()}

Render a single PDF page as a PNG image.

Parameters

path - Path to the PDF file
page_index - Zero-based page index
opts - Keyword list of options:
- :dpi - Rendering resolution (default 150)

Returns

{:ok, binary()} - PNG-encoded binary
{:error, reason} - Rendering failed

Examples

{:ok, png} = Kreuzberg.render_pdf_page("document.pdf", 0)
{:ok, png} = Kreuzberg.render_pdf_page("document.pdf", 2, dpi: 300)

render_pdf_pages_stream(path, opts \\ [])

@spec render_pdf_pages_stream(
  String.t(),
  keyword()
) :: Enumerable.t()

Return a lazy Stream that yields {page_index, png_binary} tuples.

Pages are rendered one at a time via the native PDF page iterator, so only one page's worth of PNG bytes is in memory at a time.

Parameters

path - Path to the PDF file
opts - Keyword list of options:
- :dpi - Rendering resolution (default 150)

Returns

Enumerable.t() - A Stream of {non_neg_integer(), binary()} tuples

Examples

Kreuzberg.render_pdf_pages_stream("document.pdf")
|> Enum.each(fn {page_index, png} ->
  File.write!("page_#{page_index}.png", png)
end)

validate_binarization_method(method)

See Kreuzberg.Validators.validate_binarization_method/1.

validate_chunking_params(params)

See Kreuzberg.Validators.validate_chunking_params/1.

validate_confidence(confidence)

See Kreuzberg.Validators.validate_confidence/1.

validate_dpi(dpi)

See Kreuzberg.Validators.validate_dpi/1.

validate_language_code(code)

See Kreuzberg.Validators.validate_language_code/1.

validate_mime_type(mime_type)

See Kreuzberg.UtilityAPI.validate_mime_type/1.

validate_ocr_backend(backend)

See Kreuzberg.Validators.validate_ocr_backend/1.

validate_tesseract_oem(oem)

See Kreuzberg.Validators.validate_tesseract_oem/1.

validate_tesseract_psm(psm)

See Kreuzberg.Validators.validate_tesseract_psm/1.