Kreuzberg (kreuzberg v4.9.5)

Copy Markdown View Source

High-performance document extraction for Elixir.

Examples

# Extract from binary with MIME type
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf")

# With configuration
config = %Kreuzberg.ExtractionConfig{force_ocr: true}
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf", config)

# Bang variant
result = Kreuzberg.extract!(pdf_binary, "application/pdf")

Summary

Functions

Generate text embeddings for a list of strings.

Generate text embeddings, raising on error.

Extract content from binary document data.

Extract content, raising on error.

Extract content from a file at the given path.

Extract content from a file, raising on error.

Extract content with plugin processing support.

Render a single PDF page as a PNG image.

Return a lazy Stream that yields {page_index, png_binary} tuples.

Functions

batch_extract_bytes(data_list, mime_types, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_bytes/3.

batch_extract_bytes!(data_list, mime_types, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_bytes!/3.

batch_extract_bytes_async(data_list, mime_types, config \\ nil)

See Kreuzberg.AsyncAPI.batch_extract_bytes_async/3.

batch_extract_files(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_files/3.

batch_extract_files!(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.BatchAPI.batch_extract_files!/3.

batch_extract_files_async(paths, mime_type \\ nil, config \\ nil)

See Kreuzberg.AsyncAPI.batch_extract_files_async/3.

cache_stats()

See Kreuzberg.CacheAPI.cache_stats/0.

cache_stats!()

See Kreuzberg.CacheAPI.cache_stats!/0.

classify_error(reason)

See Kreuzberg.UtilityAPI.classify_error/1.

clear_cache()

See Kreuzberg.CacheAPI.clear_cache/0.

clear_cache!()

See Kreuzberg.CacheAPI.clear_cache!/0.

detect_mime_type(data)

See Kreuzberg.UtilityAPI.detect_mime_type/1.

detect_mime_type_from_path(path)

See Kreuzberg.UtilityAPI.detect_mime_type_from_path/1.

discover_extraction_config()

See Kreuzberg.ExtractionConfig.discover/0.

do_embed(texts, config \\ nil)

Generate text embeddings for a list of strings.

Parameters

  • texts - List of strings to embed
  • config - EmbeddingConfig struct or nil

Returns

  • {:ok, [[float()]]} - List of embedding vectors
  • {:error, reason} - Embedding failed

Examples

# Embed with default config (balanced preset)
iex> {:ok, embeddings} = Kreuzberg.embed(["Hello world", "How are you?"])
iex> length(embeddings) == 2
true

# Embed with a specific preset
iex> config = %Kreuzberg.EmbeddingConfig{model: {:preset, "fast"}}
iex> {:ok, embeddings} = Kreuzberg.embed(["Hello world"], config)
iex> is_list(hd(embeddings))
true

do_embed!(texts, config \\ nil)

Generate text embeddings, raising on error.

Same as do_embed/2 but raises a Kreuzberg.Error on failure.

Examples

# Embed and get results directly
iex> embeddings = Kreuzberg.embed!(["Hello world"])
iex> is_list(embeddings)
true

# Each embedding is a list of floats
iex> [vector | _rest] = Kreuzberg.embed!(["Test sentence"])
iex> is_float(hd(vector))
true

embed(texts, config \\ nil)

See Kreuzberg.do_embed/2.

embed!(texts, config \\ nil)

See Kreuzberg.do_embed!/2.

extract(input, mime_type, config \\ nil)

@spec extract(
  binary(),
  String.t(),
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil
) ::
  {:ok, Kreuzberg.ExtractionResult.t()} | {:error, String.t()}

Extract content from binary document data.

Performs document extraction on binary input with support for various file formats. Returns extracted content including text, metadata, tables, images, and more. If no configuration is provided, uses default extraction settings.

Parameters

  • input - Binary document data to extract from
  • mime_type - MIME type of the document (e.g., "application/pdf", "text/plain")
  • config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)

Returns

  • {:ok, ExtractionResult.t()} - Successfully extracted content with metadata
  • {:error, reason} - Extraction failed with error message

Examples

# Extract from binary with MIME type
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf")
result.content

# Extract with configuration
config = %Kreuzberg.ExtractionConfig{ocr: %{"enabled" => true}}
{:ok, result} = Kreuzberg.extract(pdf_binary, "application/pdf", config)

# With keyword list configuration
{:ok, result} = Kreuzberg.extract(
  pdf_binary,
  "application/pdf",
  ocr: %{"enabled" => true}
)

extract!(input, mime_type, config \\ nil)

Extract content, raising on error.

extract_async(input, mime_type, config \\ nil)

See Kreuzberg.AsyncAPI.extract_async/3.

extract_file(path, mime_type \\ nil, config \\ nil)

@spec extract_file(
  String.t() | Path.t(),
  String.t() | nil,
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil
) :: {:ok, Kreuzberg.ExtractionResult.t()} | {:error, String.t()}

Extract content from a file at the given path.

Accepts a file path and optional MIME type, returning extracted content. If no MIME type is provided, the library will attempt to detect it from the file.

Parameters

  • path - File path (String or Path.t())
  • mime_type - MIME type of the file (optional, defaults to nil for auto-detection)
  • config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)

Returns

  • {:ok, ExtractionResult.t()} - Successfully extracted content
  • {:error, reason} - Extraction failed with error message

Examples

# Extract with explicit MIME type
{:ok, result} = Kreuzberg.extract_file("document.pdf", "application/pdf")
result.content

# Extract with auto-detection
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# With configuration
config = %Kreuzberg.ExtractionConfig{force_ocr: true}
{:ok, result} = Kreuzberg.extract_file("document.pdf", "application/pdf", config)

# With keyword list configuration
{:ok, result} = Kreuzberg.extract_file(
  "document.pdf",
  "application/pdf",
  ocr: %{"enabled" => true}
)

extract_file!(path, mime_type \\ nil, config \\ nil)

@spec extract_file!(
  String.t() | Path.t(),
  String.t() | nil,
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil
) :: Kreuzberg.ExtractionResult.t()

Extract content from a file, raising on error.

Same as extract_file/3 but raises a Kreuzberg.Error exception if extraction fails.

Parameters

  • path - File path (String or Path.t())
  • mime_type - MIME type of the file (optional, defaults to nil for auto-detection)
  • config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)

Returns

  • ExtractionResult.t() - Successfully extracted content

Raises

  • Kreuzberg.Error - If extraction fails

Examples

# Extract with explicit MIME type, raising on error
result = Kreuzberg.extract_file!("document.pdf", "application/pdf")
result.content

# Extract with auto-detection, raising on error
result = Kreuzberg.extract_file!("document.pdf")
result.content

# With configuration
config = %Kreuzberg.ExtractionConfig{ocr: %{"enabled" => true}}
result = Kreuzberg.extract_file!("document.pdf", "application/pdf", config)

extract_file_async(path, mime_type \\ nil, config \\ nil)

See Kreuzberg.AsyncAPI.extract_file_async/3.

extract_with_plugins(input, mime_type, config \\ nil, plugin_opts \\ [])

@spec extract_with_plugins(
  binary(),
  String.t(),
  Kreuzberg.ExtractionConfig.t() | map() | keyword() | nil,
  keyword()
) :: {:ok, Kreuzberg.ExtractionResult.t()} | {:error, String.t()}

Extract content with plugin processing support.

Performs document extraction with additional processing through registered plugins. Applies validators before extraction, post-processors by stage (early, middle, late) after extraction, and optional final validators to the result.

Plugins are retrieved from the Plugin.Registry if not explicitly provided in plugin_opts.

Parameters

  • input - Binary document data to extract from
  • mime_type - MIME type of the document (e.g., "application/pdf")
  • config - ExtractionConfig struct, map, keyword list, or nil (optional, defaults to nil)
  • plugin_opts - Keyword list of plugin options (optional):
    • :validators - List of validator modules to run before extraction
    • :post_processors - Map of stage atoms to lists of post-processor modules
      • :early - Applied first to extraction result
      • :middle - Applied after early processors
      • :late - Applied last before final validators
    • :final_validators - List of validator modules to run after post-processing

Returns

  • {:ok, ExtractionResult.t()} - Successfully extracted and processed content
  • {:error, reason} - Extraction or processing failed with error message

Plugin Processing Flow

  1. Validators - If specified, run input validators to check extraction preconditions
  2. Extraction - Call extract/3 to get initial result
  3. Post-Processors - Apply by stage in order (early → middle → late)
    • Each processor receives the extraction result or the output of the previous processor
    • Each processor should return the modified result or data
  4. Final Validators - If specified, validate the processed result
  5. Return - Return enhanced extraction result

Examples

# Extract with registered validators and post-processors
{:ok, result} = Kreuzberg.extract_with_plugins(
  pdf_binary,
  "application/pdf",
  nil,
  validators: [MyApp.InputValidator],
  post_processors: %{
    early: [MyApp.EarlyProcessor],
    middle: [MyApp.MiddleProcessor],
    late: [MyApp.FinalProcessor]
  },
  final_validators: [MyApp.ResultValidator]
)

# Extract with only post-processors
{:ok, result} = Kreuzberg.extract_with_plugins(
  pdf_binary,
  "application/pdf",
  %{use_cache: true},
  post_processors: %{
    early: [MyApp.Processor1, MyApp.Processor2]
  }
)

# Extract with configuration and validators only
config = %Kreuzberg.ExtractionConfig{ocr: %{"enabled" => true}}
{:ok, result} = Kreuzberg.extract_with_plugins(
  pdf_binary,
  "application/pdf",
  config,
  validators: [MyApp.Validator]
)

# Extract with no plugins (standard extraction)
{:ok, result} = Kreuzberg.extract_with_plugins(pdf_binary, "application/pdf")

get_embedding_preset(name)

See Kreuzberg.UtilityAPI.get_embedding_preset/1.

get_error_details()

See Kreuzberg.UtilityAPI.get_error_details/0.

get_extensions_for_mime(mime_type)

See Kreuzberg.UtilityAPI.get_extensions_for_mime/1.

list_embedding_presets()

See Kreuzberg.UtilityAPI.list_embedding_presets/0.

render_pdf_page(path, page_index, opts \\ [])

@spec render_pdf_page(String.t(), non_neg_integer(), keyword()) ::
  {:ok, binary()} | {:error, String.t()}

Render a single PDF page as a PNG image.

Parameters

  • path - Path to the PDF file
  • page_index - Zero-based page index
  • opts - Keyword list of options:
    • :dpi - Rendering resolution (default 150)

Returns

  • {:ok, binary()} - PNG-encoded binary
  • {:error, reason} - Rendering failed

Examples

{:ok, png} = Kreuzberg.render_pdf_page("document.pdf", 0)
{:ok, png} = Kreuzberg.render_pdf_page("document.pdf", 2, dpi: 300)

render_pdf_pages_stream(path, opts \\ [])

@spec render_pdf_pages_stream(
  String.t(),
  keyword()
) :: Enumerable.t()

Return a lazy Stream that yields {page_index, png_binary} tuples.

Pages are rendered one at a time via the native PDF page iterator, so only one page's worth of PNG bytes is in memory at a time.

Parameters

  • path - Path to the PDF file
  • opts - Keyword list of options:
    • :dpi - Rendering resolution (default 150)

Returns

  • Enumerable.t() - A Stream of {non_neg_integer(), binary()} tuples

Examples

Kreuzberg.render_pdf_pages_stream("document.pdf")
|> Enum.each(fn {page_index, png} ->
  File.write!("page_#{page_index}.png", png)
end)

validate_binarization_method(method)

See Kreuzberg.Validators.validate_binarization_method/1.

validate_chunking_params(params)

See Kreuzberg.Validators.validate_chunking_params/1.

validate_confidence(confidence)

See Kreuzberg.Validators.validate_confidence/1.

validate_dpi(dpi)

See Kreuzberg.Validators.validate_dpi/1.

validate_language_code(code)

See Kreuzberg.Validators.validate_language_code/1.

validate_mime_type(mime_type)

See Kreuzberg.UtilityAPI.validate_mime_type/1.

validate_ocr_backend(backend)

See Kreuzberg.Validators.validate_ocr_backend/1.

validate_tesseract_oem(oem)

See Kreuzberg.Validators.validate_tesseract_oem/1.

validate_tesseract_psm(psm)

See Kreuzberg.Validators.validate_tesseract_psm/1.