Performance Tuning

View Source

Optimizing TantivyEx for production requires understanding how indexing, searching, and memory usage affect performance.

Index Design for Performance

Choose the Right Field Options

Different field options have different performance characteristics:

# Index-only: searchable, but the original value cannot be retrieved
Schema.add_text_field(schema, "content", :text)

# Indexed and stored: searchable and also retrievable in search results
Schema.add_text_field(schema, "title", :text_stored)

# Fast field: optimized for range filtering, sorting, and aggregation
Schema.add_u64_field(schema, "timestamp", :fast)

# Fast and stored: filter/sort on it and also return it in results
Schema.add_f64_field(schema, "price", :fast_stored)

Performance Guidelines:

  • Use :text for content you only search, not retrieve
  • Use :fast for fields used in range queries or sorting
  • Use _stored variants only when you need to retrieve the original value
  • Avoid storing large text fields if you don't need them in results

Optimize Your Schema

# ❌ Poor performance - storing large content unnecessarily
schema = Schema.add_text_field(schema, "full_content", :text_stored)

# ✅ Better - only index for search; the raw text is not kept in the store
schema = Schema.add_text_field(schema, "full_content", :text)

# Store a separate, small summary field for display in result lists
schema = Schema.add_text_field(schema, "summary", :text_stored)

Field Type Selection Impact

defmodule SchemaOptimizer do
  # Example schema tuned for performance: index-only text for search,
  # :fast fields for filtering/sorting, and _stored variants only where
  # the value must appear in results. Returns {:ok, schema}.
  def create_optimized_schema() do
    schema = Schema.new()

    # Text fields - choose based on use case
    schema = Schema.add_text_field(schema, "title", :text_stored)     # Search + display
    schema = Schema.add_text_field(schema, "content", :text)          # Search only
    schema = Schema.add_text_field(schema, "summary", :stored)        # Display only

    # Numeric fields - optimize for operations
    schema = Schema.add_u64_field(schema, "timestamp", :fast)         # Filtering/sorting
    schema = Schema.add_f64_field(schema, "price", :fast_stored)      # Filter + display
    schema = Schema.add_u64_field(schema, "view_count", :stored)      # Display only

    # Facet fields - for navigation
    # NOTE(review): this call matches {:ok, schema} while every other
    # Schema call above uses the bare return value — confirm which return
    # convention the Schema API actually follows and make them consistent.
    {:ok, schema} = Schema.add_facet_field(schema, "category", :facet)

    {:ok, schema}
  end
end

Indexing Performance

Batch Operations

Always prefer a single batched commit over committing after each individual document:

# ❌ Slow - committing after every single document
{:ok, writer} = TantivyEx.IndexWriter.new(index)
Enum.each(documents, fn doc ->
  TantivyEx.IndexWriter.add_document(writer, doc)
  TantivyEx.IndexWriter.commit(writer)  # Don't do this!
end)

# ✅ Fast - add all documents first, then commit once
{:ok, writer} = TantivyEx.IndexWriter.new(index)
Enum.each(documents, fn doc ->
  TantivyEx.IndexWriter.add_document(writer, doc)
end)
TantivyEx.IndexWriter.commit(writer)  # Single commit at the end

Optimize Commit Frequency

defmodule BulkIndexer do
  @moduledoc """
  Indexes documents in fixed-size batches with one commit per batch,
  avoiding the per-document commit anti-pattern.
  """

  # Logger.warning/1 is a macro, so the module must `require Logger`
  # before calling it (omitting this is a compile error).
  require Logger

  @batch_size 1000

  # Indexes `documents` into `index` in batches of `@batch_size`,
  # committing once per batch.
  def index_documents(index, documents) do
    {:ok, writer} = TantivyEx.IndexWriter.new(index)

    documents
    |> Enum.chunk_every(@batch_size)
    |> Enum.each(fn batch ->
      add_batch(writer, batch)
      TantivyEx.IndexWriter.commit(writer)

      # Optional: brief pause to prevent overwhelming the system
      Process.sleep(100)
    end)
  end

  # Adds each document, logging (not raising) on individual failures so
  # one bad document does not abort the whole batch.
  defp add_batch(writer, documents) do
    Enum.each(documents, fn doc ->
      case TantivyEx.IndexWriter.add_document(writer, doc) do
        :ok -> :ok
        {:error, reason} ->
          Logger.warning("Failed to add document: #{inspect(reason)}")
      end
    end)
  end
end

Parallel Indexing

defmodule ParallelIndexer do
  @moduledoc """
  Splits a document list across `num_workers` concurrent indexing tasks.
  """

  # Indexes documents concurrently; returns the list of task results
  # from Task.async_stream/3.
  #
  # NOTE(review): each task opens its own IndexWriter on the same index;
  # confirm the underlying engine supports concurrent writers before
  # relying on this in production.
  def index_documents_parallel(index, documents, num_workers \\ 4) do
    # Clamp the chunk size to at least 1: with fewer documents than
    # workers, div/2 yields 0 and Enum.chunk_every/2 would raise.
    chunk_size = max(div(length(documents), num_workers), 1)

    documents
    |> Enum.chunk_every(chunk_size)
    |> Task.async_stream(fn chunk ->
      {:ok, writer} = TantivyEx.IndexWriter.new(index)

      Enum.each(chunk, fn doc ->
        TantivyEx.IndexWriter.add_document(writer, doc)
      end)

      TantivyEx.IndexWriter.commit(writer)
    end, timeout: 60_000)
    |> Enum.to_list()
  end
end

Search Performance

Query Optimization

# ❌ Slow - overly broad queries: "*" matches every document in the index
{:ok, searcher} = TantivyEx.Searcher.new(index)
TantivyEx.Searcher.search(searcher, "*", 10000)

# ✅ Fast - specific queries with reasonable result limits
TantivyEx.Searcher.search(searcher, "specific terms", 50)

# ❌ Slow - complex boolean queries without field targeting
TantivyEx.Searcher.search(searcher, "(a OR b OR c) AND (d OR e OR f)", 100)

# ✅ Fast - field-specific queries narrow the candidate set early
TantivyEx.Searcher.search(searcher, "title:(important terms) AND category:specific", 100)

Result Limiting and Pagination

defmodule SearchOptimizer do
  @moduledoc """
  Helpers that keep result sets small: capped limits, shallow offset
  pagination, and a cursor-based pattern for deep pagination.
  """

  @max_offset_page 100

  # Runs `query`, returning at most `limit` hits (default 20).
  def search_with_limit(index, query, limit \\ 20) do
    {:ok, searcher} = TantivyEx.Searcher.new(index)
    TantivyEx.Searcher.search(searcher, query, limit)
  end

  # Offset pagination for moderate depths: fetch the first
  # `page * per_page` hits and slice out the requested page.
  #
  # Guarded to page >= 1: page 0 or a negative page would yield a
  # negative slice offset, which Enum.slice/3 interprets as counting
  # from the END of the list, silently returning the wrong page.
  def paginated_search(index, query, page, per_page)
      when page >= 1 and page <= @max_offset_page do
    limit = page * per_page
    {:ok, searcher} = TantivyEx.Searcher.new(index)

    case TantivyEx.Searcher.search(searcher, query, limit) do
      {:ok, all_results} ->
        start_index = (page - 1) * per_page
        {:ok, Enum.slice(all_results, start_index, per_page)}

      error -> error
    end
  end

  # Explicit error instead of a FunctionClauseError for out-of-range pages.
  def paginated_search(_index, _query, _page, _per_page),
    do: {:error, :page_out_of_range}

  # For deep pagination, use a cursor (e.g. the last seen timestamp or
  # ID) so the engine never materializes all preceding pages.
  def cursor_based_search(index, query, cursor, per_page) do
    enhanced_query = "#{query} AND timestamp:>#{cursor}"
    search_with_limit(index, enhanced_query, per_page)
  end
end

Query Caching

defmodule QueryCache do
  @moduledoc """
  In-memory cache for frequent search queries, keyed by `{query, limit}`.

  Entries are tagged with a monotonically increasing insertion counter so
  eviction can actually drop the oldest entries (plain maps are unordered,
  so taking N entries without an ordering would evict arbitrary ones).
  """

  use GenServer

  @max_entries 1000
  @keep_entries 500

  # Cache state shape: %{cache_key => {results, insertion_counter}}
  def start_link(_opts) do
    GenServer.start_link(__MODULE__, %{}, name: __MODULE__)
  end

  # Returns cached results for `{query, limit}`, or runs the search and
  # caches the outcome. Raises (MatchError) if the search itself fails —
  # callers needing graceful error handling should use the searcher
  # directly.
  def search_cached(index, query, limit) do
    cache_key = {query, limit}

    case GenServer.call(__MODULE__, {:get, cache_key}) do
      nil ->
        {:ok, results} = search_and_cache(index, query, limit, cache_key)
        results

      cached_results ->
        cached_results
    end
  end

  defp search_and_cache(index, query, limit, cache_key) do
    {:ok, searcher} = TantivyEx.Searcher.new(index)

    case TantivyEx.Searcher.search(searcher, query, limit) do
      {:ok, results} = success ->
        # Cast so the caller never blocks on cache maintenance.
        GenServer.cast(__MODULE__, {:put, cache_key, results})
        success

      error -> error
    end
  end

  # GenServer callbacks

  @impl true
  def init(state), do: {:ok, state}

  @impl true
  def handle_call({:get, key}, _from, cache) do
    # Stored values are {results, counter}; reply with the results only.
    reply =
      case Map.get(cache, key) do
        {results, _inserted_at} -> results
        nil -> nil
      end

    {:reply, reply, cache}
  end

  @impl true
  def handle_cast({:put, key, value}, cache) do
    entry = {value, System.unique_integer([:monotonic])}

    new_cache =
      cache
      |> Map.put(key, entry)
      |> maybe_evict_old_entries()

    {:noreply, new_cache}
  end

  # Once the cache exceeds @max_entries, keep only the @keep_entries most
  # recently inserted entries, ordered by their insertion counters.
  defp maybe_evict_old_entries(cache) when map_size(cache) > @max_entries do
    cache
    |> Enum.sort_by(fn {_key, {_value, inserted_at}} -> inserted_at end, :desc)
    |> Enum.take(@keep_entries)
    |> Map.new()
  end

  defp maybe_evict_old_entries(cache), do: cache
end

Memory Management

Index Size Monitoring

defmodule IndexMonitor do
  @moduledoc """
  Reports and monitors on-disk index size.
  """

  require Logger

  # Returns {:ok, size_in_megabytes} for the index at `index_path`.
  #
  # An on-disk index is typically a DIRECTORY of segment files, and
  # File.stat/1 on a directory reports only the directory entry's own
  # size — not its contents. We therefore walk the tree and sum the
  # sizes of the files inside it.
  def check_index_stats(index_path) do
    case total_size(index_path) do
      {:ok, size} ->
        size_mb = size / (1024 * 1024)
        Logger.info("Index size: #{Float.round(size_mb, 2)} MB")
        {:ok, size_mb}

      {:error, reason} ->
        Logger.error("Could not get index stats: #{reason}")
        {:error, reason}
    end
  end

  # Returns :threshold_exceeded (and logs a warning) when the index is
  # larger than `threshold_mb`, :ok otherwise; passes stat errors through.
  def monitor_index_growth(index_path, threshold_mb \\ 1000) do
    case check_index_stats(index_path) do
      {:ok, size_mb} when size_mb > threshold_mb ->
        Logger.warning("Index size (#{size_mb} MB) exceeds threshold (#{threshold_mb} MB)")
        :threshold_exceeded

      {:ok, _size_mb} ->
        :ok

      error -> error
    end
  end

  # Sums the byte size of `path`, recursing into directories. Unreadable
  # children are skipped (best effort) rather than failing the whole walk.
  defp total_size(path) do
    case File.stat(path) do
      {:ok, %{type: :directory}} ->
        size =
          path
          |> File.ls!()
          |> Enum.reduce(0, fn entry, acc ->
            case total_size(Path.join(path, entry)) do
              {:ok, child_size} -> acc + child_size
              {:error, _} -> acc
            end
          end)

        {:ok, size}

      {:ok, %{size: size}} ->
        {:ok, size}

      {:error, reason} ->
        {:error, reason}
    end
  end
end

RAM vs Disk Indexes

defmodule IndexStrategy do
  @moduledoc """
  Picks RAM vs on-disk index placement from dataset size and available
  memory, and creates the index accordingly.
  """

  # Returns {:ram_index | :disk_index, reason_string}.
  def choose_index_type(dataset_size_mb, available_ram_mb) do
    cond do
      dataset_size_mb < 100 and available_ram_mb > 1000 ->
        {:ram_index, "Small dataset, use RAM for speed"}

      dataset_size_mb < available_ram_mb * 0.5 ->
        {:ram_index, "Dataset fits comfortably in RAM"}

      true ->
        {:disk_index, "Dataset too large for RAM or limited memory"}
    end
  end

  # Creates the index using the strategy tuple from choose_index_type/2.
  # For :disk_index, a fresh path is generated when none is given.
  def create_optimized_index(schema, strategy, path \\ nil) do
    case strategy do
      {:ram_index, _reason} ->
        Index.create_in_ram(schema)

      {:disk_index, _reason} ->
        path = path || generate_temp_path()
        Index.create_in_dir(path, schema)
    end
  end

  # Uses the OS temp dir (hard-coding "/tmp" is not portable) and a
  # unique integer (a second-resolution timestamp collides when two
  # indexes are created within the same second).
  defp generate_temp_path do
    unique = System.unique_integer([:positive])
    Path.join(System.tmp_dir!(), "tantivy_index_#{unique}")
  end
end

Performance Benchmarking

defmodule PerformanceBenchmark do
  @moduledoc """
  Micro-benchmarks for indexing throughput (per batch size) and query
  latency.
  """

  # Indexes `documents` once per batch size and reports wall time (ms)
  # and throughput (docs/second) for each.
  def benchmark_indexing(documents, batch_sizes \\ [100, 500, 1000, 5000]) do
    schema = create_test_schema()

    Enum.map(batch_sizes, fn batch_size ->
      {time_us, _result} = :timer.tc(fn ->
        index_with_batch_size(documents, schema, batch_size)
      end)

      # :timer.tc can report 0µs for trivially small runs; clamp to 1µs
      # so the throughput division below cannot divide by zero.
      time_ms = max(time_us, 1) / 1000
      docs_per_second = length(documents) / (time_ms / 1000)

      %{
        batch_size: batch_size,
        time_ms: time_ms,
        docs_per_second: Float.round(docs_per_second, 2)
      }
    end)
  end

  # Runs each query once (limit 100) and reports latency and hit count.
  def benchmark_queries(index, queries) do
    {:ok, searcher} = TantivyEx.Searcher.new(index)

    Enum.map(queries, fn query ->
      {time_us, result} = :timer.tc(fn ->
        TantivyEx.Searcher.search(searcher, query, 100)
      end)

      result_count =
        case result do
          {:ok, results} -> length(results)
          _ -> 0
        end

      %{
        query: query,
        time_ms: time_us / 1000,
        result_count: result_count
      }
    end)
  end

  # NOTE(review): these Schema calls match on {:ok, _} while other
  # examples in this guide use the bare-return form — confirm which
  # convention the Schema API actually follows.
  defp create_test_schema do
    {:ok, schema} = Schema.new()
    {:ok, schema} = Schema.add_text_field(schema, "title", :text_stored)
    {:ok, schema} = Schema.add_text_field(schema, "content", :text)
    {:ok, schema} = Schema.add_u64_field(schema, "timestamp", :fast)
    schema
  end

  # Builds a fresh RAM index and indexes `documents` with one commit per
  # batch of `batch_size`.
  defp index_with_batch_size(documents, schema, batch_size) do
    {:ok, index} = Index.create_in_ram(schema)
    {:ok, writer} = TantivyEx.IndexWriter.new(index)

    documents
    |> Enum.chunk_every(batch_size)
    |> Enum.each(fn batch ->
      Enum.each(batch, &TantivyEx.IndexWriter.add_document(writer, &1))
      TantivyEx.IndexWriter.commit(writer)
    end)

    index
  end
end

Performance Best Practices Summary

Do's ✅

  • Batch document operations
  • Use appropriate field types and options
  • Monitor index size and performance
  • Cache frequent queries
  • Use specific, targeted queries
  • Profile your application's search patterns

Don'ts ❌

  • Don't commit after every document
  • Don't store fields you don't need to retrieve
  • Don't use overly broad queries (*)
  • Don't request more results than needed
  • Don't ignore memory usage patterns
  • Don't skip performance testing

Monitoring in Production

defmodule ProductionMonitoring do
  @moduledoc """
  Periodically samples the index size and publishes it as a telemetry
  measurement; stat failures are logged instead of crashing the monitor.
  """

  use GenServer
  require Logger

  @interval_ms 60_000

  # Client API

  def start_link(index_path) do
    GenServer.start_link(__MODULE__, %{index_path: index_path}, name: __MODULE__)
  end

  # Server callbacks

  @impl true
  def init(state) do
    schedule_monitoring()
    {:ok, state}
  end

  @impl true
  def handle_info(:monitor, %{index_path: index_path} = state) do
    report_index_size(index_path)
    schedule_monitoring()
    {:noreply, state}
  end

  # Emits the current index size as a telemetry event, logging on failure.
  defp report_index_size(index_path) do
    case IndexMonitor.check_index_stats(index_path) do
      {:ok, size_mb} ->
        :telemetry.execute([:tantivy_ex, :index, :size], %{megabytes: size_mb})

      {:error, reason} ->
        Logger.error("Index monitoring failed: #{inspect(reason)}")
    end
  end

  # Re-arms the periodic sampling timer (one message per interval).
  defp schedule_monitoring do
    Process.send_after(self(), :monitor, @interval_ms)
  end
end