Sudoku Puzzle Extractor

Mix.install([
  {:evision, "~> 0.2"},
  {:kino, "~> 0.7"},
  {:req, "~> 0.3"},
  {:torchx, "~> 0.4"},
  {:nx, "~> 0.4"},
  {:scidata, "~> 0.1"},
  {:axon, "~> 0.3.0"}
], system_env: [
  # optional, defaults to `true`
  # set `EVISION_PREFER_PRECOMPILED` to `false`
  # if you prefer `:evision` to be compiled from source
  # note that to compile from source, you may need at least 1GB RAM
  {"EVISION_PREFER_PRECOMPILED", true},

  # optional, defaults to `true`
  # set `EVISION_ENABLE_CONTRIB` to `false`
  # if you don't need modules from `opencv_contrib`
  {"EVISION_ENABLE_CONTRIB", true},

  # optional, defaults to `false`
  # set `EVISION_ENABLE_CUDA` to `true`
  # if you wish to use CUDA related functions
  # note that `EVISION_ENABLE_CONTRIB` also has to be `true`
  # because cuda related modules come from the `opencv_contrib` repo
  {"EVISION_ENABLE_CUDA", false},

  # required when 
  # - `EVISION_ENABLE_CUDA` is `true`
  # - and `EVISION_PREFER_PRECOMPILED` is `true`
  #
  # set `EVISION_CUDA_VERSION` to the version that matches 
  # your local CUDA runtime version
  #
  # current available versions are
  # - 118
  # - 121
  {"EVISION_CUDA_VERSION", "118"},

  # required for Windows users when
  # - `EVISION_ENABLE_CUDA` is `true`
  # set `EVISION_CUDA_RUNTIME_DIR` to the directory that contains
  # CUDA runtime libraries
  {"EVISION_CUDA_RUNTIME_DIR", "C:/PATH/TO/CUDA/RUNTIME"}
])
:ok

References

Some of the code in this example is essentially a literal translation of the code in OpenCV Sudoku Solver and OCR - PyImageSearch. The sample input image is also taken from that post.

The code of the neural network was taken from https://github.com/elixir-nx/axon/blob/main/examples/vision/mnist.exs with some minor changes.

Define Some Helper Functions

defmodule Helper do
  def download!(url, save_as, overwrite? \\ false) do
    # skip the download when the file already exists,
    # unless the caller explicitly asks to overwrite it
    if overwrite? or not File.exists?(save_as) do
      Req.get!(url, http_errors: :raise, output: save_as, cache: not overwrite?)
    end

    :ok
  end
end
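
Passing true as the third argument forces a fresh download even when the file is already on disk; for example (the URL below is just a placeholder):

# force a re-download of an already-saved file
Helper.download!("https://example.com/image.webp", "image.webp", true)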

Download the test image.

Helper.download!(
  "https://raw.githubusercontent.com/cocoa-xu/evision/main/test/testdata/sudoku_puzzle.webp",
  "sudoku_puzzle.webp"
)

Load the Original Image and Convert It to Grayscale

# read the original image
original = Evision.imread("sudoku_puzzle.webp")
%Evision.Mat{
  channels: 3,
  dims: 2,
  type: {:u, 8},
  raw_type: 16,
  shape: {1024, 962, 3},
  ref: #Reference<0.736436033.3583639578.183370>
}
# convert it to grayscale
gray = Evision.cvtColor(original, Evision.Constant.cv_COLOR_BGR2GRAY())
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {1024, 962},
  ref: #Reference<0.736436033.3588096019.145887>
}
# apply some Gaussian blur to the image
# we are doing so to reduce the noise and prepare it for the next step
blurred = Evision.gaussianBlur(gray, {7, 7}, 3)
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {1024, 962},
  ref: #Reference<0.736436033.3588096019.145890>
}
# binarization with Evision.adaptiveThreshold
bw =
  Evision.adaptiveThreshold(
    blurred,
    255,
    Evision.Constant.cv_ADAPTIVE_THRESH_GAUSSIAN_C(),
    Evision.Constant.cv_THRESH_BINARY(),
    11,
    2
  )

# invert the binary image (bitwise NOT) so the digits and grid lines become white on black
threshold = Evision.Mat.bitwise_not(bw)
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {1024, 962},
  ref: #Reference<0.736436033.3588096019.145894>
}

Find the Sudoku Puzzle

# First things first, we need to find all the contours in the thresholded image
{contours, _} =
  Evision.findContours(
    threshold,
    Evision.Constant.cv_RETR_EXTERNAL(),
    Evision.Constant.cv_CHAIN_APPROX_SIMPLE()
  )

IO.puts("Find #{Enum.count(contours)} contour(s)")
Find 377 contour(s)
:ok
# and our assumptions are that
#   1. the contour that contains the puzzle should be fairly large:
#      hence we sort the contours by their area in descending order
contours =
  Enum.sort_by(contours, fn c ->
    -Evision.contourArea(c)
  end)

IO.puts("area of the largest contour: #{Evision.contourArea(Enum.at(contours, 0))}")
area of the largest contour: 430559.5
:ok
# 2. the puzzle should be rectangular,
#    which means its contour should be approximately a rectangle,
#    which means the approximated polygon of the contour should have 4 corners (keypoints);
#    hence we need to find the contour whose approximated polygon has shape {4, 1, 2}
puzzle_keypoints =
  Enum.reduce_while(contours, nil, fn c, _acc ->
    peri = Evision.arcLength(c, true)
    approx = Evision.approxPolyDP(c, 0.02 * peri, true)

    case approx.shape do
      {4, 1, 2} ->
        {:halt, approx}

      _ ->
        {:cont, nil}
    end
  end)

if puzzle_keypoints do
  IO.puts("Found puzzle")
  Evision.drawContours(original, [puzzle_keypoints], -1, {0, 255, 0}, thickness: 2)
else
  IO.puts("""
  Could not find Sudoku puzzle outline.
  Try debugging your thresholding and contour steps.
  """)
end
Found puzzle
%Evision.Mat{
  channels: 3,
  dims: 2,
  type: {:u, 8},
  raw_type: 16,
  shape: {1024, 962, 3},
  ref: #Reference<0.736436033.3588096019.146280>
}

Extract the Puzzle

To extract the puzzle, we need to apply a perspective transformation.

To compute this transformation, we first need to know the four corner points, namely,

  1. Top left
  2. Top right
  3. Bottom right
  4. Bottom left

And these four points have to be arranged in the order above -- so that when you connect them 1 => 2 => 3 => 4 => 1, they form a closed rectangle.

# this function will arrange the keypoints in the order discussed above
order_points = fn pts ->
  # the top-left point will have the smallest sum, whereas
  # the bottom-right point will have the largest sum
  sum = Nx.sum(pts, axes: [1])
  tl = pts[Nx.argmin(sum)]
  br = pts[Nx.argmax(sum)]

  # now, compute the difference between the points, the
  # top-right point will have the smallest difference,
  # whereas the bottom-left will have the largest difference
  diff = Nx.subtract(pts[[0..3, 1]], pts[[0..3, 0]])
  tr = pts[Nx.argmin(diff)]
  bl = pts[Nx.argmax(diff)]
  {tl, tr, br, bl}
end
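
As a quick sanity check (with made-up coordinates, not points taken from the puzzle), order_points should recover the conventional ordering no matter how the input rows are shuffled:

# hypothetical corner points, deliberately shuffled:
# bottom-right, top-left, bottom-left, top-right
shuffled =
  Nx.tensor(
    [
      [100.0, 100.0],
      [0.0, 0.0],
      [0.0, 100.0],
      [100.0, 0.0]
    ],
    type: :f32
  )

# expected result: tl = [0, 0], tr = [100, 0], br = [100, 100], bl = [0, 100]
order_points.(shuffled)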

input =
  Evision.Mat.as_shape(puzzle_keypoints, {4, 2})
  |> Evision.Mat.to_nx(Nx.BinaryBackend)
  |> Nx.as_type(:f32)

{tl, tr, br, bl} = order_points.(input)
rect = Nx.stack([tl, tr, br, bl])
#Nx.Tensor<
  f32[4][2]
  [
    [163.0, 206.0],
    [809.0, 180.0],
    [877.0, 785.0],
    [106.0, 809.0]
  ]
>

After that, we can calculate the expected output height and width.

point_distance = fn p1, p2 ->
  round(
    Nx.to_number(
      Nx.sqrt(
        Nx.add(
          Nx.power(Nx.subtract(p1[[0]], p2[[0]]), 2),
          Nx.power(Nx.subtract(p1[[1]], p2[[1]]), 2)
        )
      )
    )
  )
end
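
As a throwaway check (not part of the pipeline), the distance between the points (0, 0) and (3, 4) should come out as the familiar 5:

# a 3-4-5 right triangle, so this returns 5
point_distance.(Nx.tensor([0.0, 0.0]), Nx.tensor([3.0, 4.0]))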

# compute the width of the new image, which will be the
# maximum distance between the bottom-right and bottom-left
# x-coordinates or the top-right and top-left x-coordinates
output_width =
  Nx.to_number(
    Nx.max(
      point_distance.(br, bl),
      point_distance.(tr, tl)
    )
  )

# compute the height of the new image, which will be the
# maximum distance between the top-right and bottom-right
# y-coordinates or the top-left and bottom-left y-coordinates
output_height =
  Nx.to_number(
    Nx.max(
      point_distance.(tr, br),
      point_distance.(tl, bl)
    )
  )

{output_height, output_width}
{609, 771}

Then we specify the output coordinates for the corners of the puzzle, in the order [top-left, top-right, bottom-right, bottom-left].

output =
  Nx.tensor(
    [
      [0, 0],
      [output_width - 1, 0],
      [output_width - 1, output_height - 1],
      [0, output_height - 1]
    ],
    type: :f32
  )
#Nx.Tensor<
  f32[4][2]
  Torchx.Backend(cpu)
  [
    [0.0, 0.0],
    [770.0, 0.0],
    [770.0, 608.0],
    [0.0, 608.0]
  ]
>

Get the perspective transformation matrix and warp the perspective for both the original image and the grayscale one.

matrix = Evision.getPerspectiveTransform(rect, output)

puzzle =
  Evision.warpPerspective(
    original,
    matrix,
    {output_width, output_height},
    flags: Evision.Constant.cv_INTER_LINEAR(),
    borderMode: Evision.Constant.cv_BORDER_CONSTANT(),
    borderValue: {0, 0, 0}
  )

puzzle_gray =
  Evision.warpPerspective(
    gray,
    matrix,
    {output_width, output_height},
    flags: Evision.Constant.cv_INTER_LINEAR(),
    borderMode: Evision.Constant.cv_BORDER_CONSTANT(),
    borderValue: {0, 0, 0}
  )
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {609, 771},
  ref: #Reference<0.736436033.3588096019.146349>
}
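
Livebook renders the %Evision.Mat{} returned by a cell's last expression on its own; to display an intermediate image explicitly, a minimal sketch using the :kino dependency installed above could look like this (assuming Evision.imencode/2 returns the encoded image as a binary, which it does on success):

# encode the warped color puzzle as PNG and render it inline
Evision.imencode(".png", puzzle)
|> Kino.Image.new(:png)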

Since the Sudoku puzzle is 9x9, we can resize it to a square.

{h, w} = puzzle_gray.shape
len = max(h, w)
puzzle_gray = Evision.resize(puzzle_gray, {len, len})
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {771, 771},
  ref: #Reference<0.736436033.3588096019.146352>
}

Extract Digits in Each Cell of the Puzzle

Let's start with the basic case, the top-left one.

step = len / 9
current_cell = puzzle_gray[[0..round(step), 0..round(step)]]
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {87, 87},
  ref: #Reference<0.736436033.3588096019.146355>
}
# import Bitwise so that we can use `|||` (bitwise or)
import Bitwise

extract_digit = fn cell ->
  # get a binary image of the cell
  {_, threshold} =
    Evision.threshold(
      cell,
      10,
      255,
      Evision.Constant.cv_THRESH_BINARY_INV() ||| Evision.Constant.cv_THRESH_OTSU()
    )

  # cut off border
  {h, w} = threshold.shape
  threshold = threshold[[8..(h - 8), 8..(w - 8)]]

  # find contours in the thresholded cell
  {contours, _} =
    Evision.findContours(
      threshold,
      Evision.Constant.cv_RETR_EXTERNAL(),
      Evision.Constant.cv_CHAIN_APPROX_SIMPLE()
    )

  # if no contours were found then this is an empty cell
  unless Enum.count(contours) == 0 do
    # otherwise, find the largest contour in the cell and create a
    # mask for the contour
    c = Enum.max_by(contours, &Evision.contourArea/1)
    mask = Evision.Mat.zeros(threshold.shape, :u8)
    mask = Evision.drawContours(mask, [c], -1, {255}, thickness: 2)

    # compute the percentage of masked pixels relative to the total
    # area of the image
    {h, w} = threshold.shape
    percent_filled = Evision.countNonZero(mask) / (w * h)

    # if less than 5% of the mask is filled then we are looking at
    # noise and can safely ignore the contour
    unless percent_filled < 0.05 do
      threshold
    end
  end
end
#Function<42.3316493/1 in :erl_eval.expr/6>

Let's try this for the top-left cell.

current_cell = extract_digit.(current_cell)
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {72, 72},
  ref: #Reference<0.736436033.3588096019.146359>
}

And it looks great!

Train A Neural Network that Recognizes Digits

# Here I'm using :torchx as the backend
# you can use :exla or any other Nx backend
Nx.default_backend(Torchx.Backend)
{Torchx.Backend, []}

Define the neural network module, MNIST.

Code from https://github.com/elixir-nx/axon/blob/main/examples/vision/mnist.exs with minor changes.

defmodule MNIST do
  require Axon

  def transform_images({bin, type, shape}) do
    bin
    |> Nx.from_binary(type, backend: Torchx.Backend)
    # flatten each 28x28 image into a 784-element vector
    |> Nx.reshape({elem(shape, 0), 784})
    |> Nx.as_type(:f32)
    |> Nx.divide(255.0)
    |> Nx.to_batched(500, leftover: :discard)
    |> Enum.to_list()
    # Test split
    |> Enum.split(-3)
  end

  def transform_labels({bin, type, _}) do
    bin
    |> Nx.from_binary(type, backend: Torchx.Backend)
    |> Nx.new_axis(-1)
    |> Nx.equal(Nx.tensor(Enum.to_list(0..9)))
    |> Nx.to_batched(500, leftover: :discard)
    |> Enum.to_list()
    # Test split
    |> Enum.split(-3)
  end

  def build_model(input_shape) do
    Axon.input("input", shape: input_shape)
    |> Axon.dense(128, activation: :relu)
    |> Axon.dropout()
    |> Axon.dense(10, activation: :softmax)
  end

  def train_model(model, train_images, train_labels, epochs) do
    model
    |> Axon.Loop.trainer(:categorical_cross_entropy, Axon.Optimizers.adamw(0.005))
    |> Axon.Loop.metric(:accuracy, "Accuracy")
    |> Axon.Loop.run(Stream.zip(train_images, train_labels), %{}, epochs: epochs)
  end

  def test_model(model, model_state, test_images, test_labels) do
    model
    |> Axon.Loop.evaluator()
    |> Axon.Loop.metric(:accuracy, "Accuracy")
    |> Axon.Loop.run(Stream.zip(test_images, test_labels), model_state)
  end
end
{:module, MNIST, <<70, 79, 82, 49, 0, 0, 16, ...>>, {:test_model, 4}}
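
Note the Nx.equal/2 call in transform_labels: broadcasting an {n, 1} column of labels against the {10} row vector 0..9 yields an {n, 10} one-hot matrix. A tiny standalone illustration with made-up labels:

# labels 3 and 0 as a {2, 1} column; broadcasting against 0..9 produces
# the one-hot rows [0,0,0,1,0,0,0,0,0,0] and [1,0,0,0,0,0,0,0,0,0]
Nx.tensor([[3], [0]])
|> Nx.equal(Nx.tensor(Enum.to_list(0..9)))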

Get the MNIST dataset and prepare the model.

{images, labels} = Scidata.MNIST.download()

{train_images, test_images} = MNIST.transform_images(images)
{train_labels, test_labels} = MNIST.transform_labels(labels)

model = MNIST.build_model({nil, 784})
#Axon<
  inputs: %{"input" => {nil, 784}}
  outputs: "softmax_0"
  nodes: 6
>

Train and Test the MNIST Model

IO.puts("Training Model...")
num_epoch = 5
model_state = MNIST.train_model(model, train_images, train_labels, num_epoch)
Training Model...
Epoch: 0, Batch: 100, Accuracy: 0.8545348 loss: 0.4818023
Epoch: 1, Batch: 100, Accuracy: 0.9332873 loss: 0.3481621
Epoch: 2, Batch: 100, Accuracy: 0.9500000 loss: 0.2871155
Epoch: 3, Batch: 100, Accuracy: 0.9596242 loss: 0.2489783
Epoch: 4, Batch: 100, Accuracy: 0.9664556 loss: 0.2216917
%{
  "dense_0" => %{
    "bias" => #Nx.Tensor<
      f32[128]
      Torchx.Backend(cpu)
      [-0.17988327145576477, 0.1716635376214981, 0.0018325354903936386, 0.1798122078180313, 0.09567301720380783, 0.10200180858373642, 0.008338148705661297, 0.10609600692987442, -0.0847468227148056, 0.08587154000997543, -7.477949257008731e-4, -0.03447800129652023, 0.03325527906417847, 0.029068496078252792, -0.1250336617231369, -0.030752241611480713, 0.08610563725233078, -0.03146626055240631, -0.09594535827636719, -0.02172447182238102, 0.058523084968328476, -0.03135503828525543, -0.002279756823554635, -0.027246493846178055, 0.05487725883722305, -0.28020963072776794, 0.1807883381843567, 0.039404697716236115, 0.17562659084796906, 0.10994528979063034, 0.11735761165618896, 0.19422295689582825, -0.027378687635064125, 0.0191771499812603, -0.031338877975940704, 0.06676501035690308, 0.10198114812374115, -0.0012608487159013748, 0.09346475452184677, 0.05980100482702255, -0.029884351417422295, 0.12892872095108032, 0.23887237906455994, 0.020879346877336502, -0.142303928732872, 0.1269598752260208, 0.11833193153142929, -0.04432889074087143, ...]
    >,
    "kernel" => #Nx.Tensor<
      f32[784][128]
      Torchx.Backend(cpu)
      [
        [-0.004997961223125458, -0.056927893310785294, 0.012524336576461792, -0.04678211361169815, -0.07006479054689407, 0.03179212659597397, 0.05232561379671097, -0.060292698442935944, 0.07966595143079758, -0.05596722662448883, 0.007738783955574036, 0.024696074426174164, -0.0667371079325676, -0.02877001464366913, -0.03603668883442879, -0.05976931005716324, 0.03925761580467224, -0.056572359055280685, -0.056350238621234894, 0.03452708572149277, 0.07506700605154037, 0.01048319786787033, 0.027114614844322205, -0.05841222405433655, 0.01039694994688034, -0.058553546667099, -0.02925030142068863, 0.03921818733215332, -0.009020909667015076, 0.0156695693731308, -0.021413378417491913, -0.03247242793440819, 0.04008222371339798, -0.05680907890200615, 0.06702771037817001, -0.03168267011642456, 0.04423173516988754, 0.06032135337591171, 0.0627024844288826, -0.018609225749969482, -0.07648104429244995, -0.015578106045722961, 0.06719689816236496, 1.7232447862625122e-4, -0.022565998136997223, 0.04633399099111557, 0.05553805083036423, ...],
        ...
      ]
    >
  },
  "dense_1" => %{
    "bias" => #Nx.Tensor<
      f32[10]
      Torchx.Backend(cpu)
      [-0.052650559693574905, 0.04494607821106911, -0.2200334221124649, -0.08993491530418396, -0.02432980388402939, -0.005615420173853636, -0.038794878870248795, 0.0233138520270586, 0.19364574551582336, 0.10583429038524628]
    >,
    "kernel" => #Nx.Tensor<
      f32[128][10]
      Torchx.Backend(cpu)
      [
        [-0.033295780420303345, 0.04774130508303642, 0.20219796895980835, -0.1599399298429489, -0.4607354402542114, 0.04164411500096321, 0.31128883361816406, -0.18477322161197662, 0.195823535323143, -0.2146347016096115],
        [-0.47767552733421326, 0.2479187399148941, 0.3272024393081665, -0.3602463901042938, -0.021782109513878822, 0.37739643454551697, -0.08139888197183609, 0.11731932312250137, -0.21786180138587952, -0.2060997635126114],
        [-0.29257020354270935, -0.27153488993644714, 0.3574492633342743, 0.25863510370254517, -0.7012501955032349, 0.09472404420375824, -0.376261442899704, 0.3126865029335022, 0.030163684859871864, 0.11671186238527298],
        [-0.1575060784816742, 0.35154038667678833, -0.5979138016700745, -0.21251395344734192, 0.08020816743373871, 0.47368210554122925, -0.09995336085557938, 0.17019565403461456, -0.10945288836956024, 0.027018314227461815],
        [0.2525313198566437, -0.153570294380188, 0.20948101580142975, 0.1662921905517578, -0.054898977279663086, 0.22147339582443237, ...],
        ...
      ]
    >
  }
}
IO.puts("Testing Model...")
MNIST.test_model(model, model_state, test_images, test_labels)
Testing Model...
Batch: 2, Accuracy: 0.9813333
%{
  0 => %{
    "Accuracy" => #Nx.Tensor<
      f32
      Torchx.Backend(cpu)
      0.981333315372467
    >
  }
}

Apply the Trained Neural Network to the Digits Extracted from the Puzzle

# Get the predict function
{_init_fn, predict_fn} = Axon.build(model)

# a helper function for our use case -- predict the digit in a given cell
predict_cell = fn predict_fn, model_state, input_cell ->
  input_cell =
    Nx.as_type(Evision.Mat.to_nx(Evision.resize(input_cell, {28, 28}), Torchx.Backend), :f32)
    |> Nx.reshape({1, 784})
    |> Nx.divide(255.0)

  pred = predict_fn.(model_state, input_cell)
  Nx.to_number(Nx.argmax(pred))
end
#Function<40.3316493/3 in :erl_eval.expr/6>

Let's use the top-left one as input.

pred = predict_cell.(predict_fn, model_state, current_cell)

IO.puts("Prediction: #{pred}")
current_cell
Prediction: 8
%Evision.Mat{
  channels: 1,
  dims: 2,
  type: {:u, 8},
  raw_type: 0,
  shape: {72, 72},
  ref: #Reference<0.736436033.3588096019.146359>
}

Looks good! Now if we do this for each cell in the puzzle, we can get what we want.

step = trunc(step)

digits =
  for x <- 0..8, y <- 0..8 do
    x1 = step * x
    y1 = step * y

    # remember that extract_digit returns nil when it thinks there is
    # no digit present in the cell
    current_cell = extract_digit.(puzzle_gray[[x1..(x1 + step), y1..(y1 + step)]])

    if current_cell do
      # if there is a digit, we use the neural network to predict its value
      predict_cell.(predict_fn, model_state, current_cell)
    else
      # otherwise we use `0` for empty cells
      # this should be okay since a normal sudoku puzzle only uses the digits 1-9
      0
    end
  end

extracted_puzzle = Nx.reshape(Nx.tensor(digits), {9, 9})
IO.inspect(extracted_puzzle, limit: :infinity)
:ok
#Nx.Tensor<
  s64[9][9]
  Torchx.Backend(cpu)
  [
    [8, 0, 0, 0, 1, 0, 0, 0, 8],
    [0, 5, 0, 8, 0, 7, 0, 7, 0],
    [0, 0, 4, 0, 9, 0, 7, 0, 0],
    [0, 8, 0, 2, 0, 7, 0, 2, 0],
    [5, 0, 8, 0, 8, 0, 7, 0, 3],
    [0, 7, 0, 5, 0, 2, 0, 8, 0],
    [0, 0, 2, 0, 4, 0, 8, 0, 0],
    [0, 8, 0, 3, 0, 8, 0, 4, 0],
    [3, 0, 0, 0, 5, 0, 0, 0, 8]
  ]
>
:ok

Visualise the Results

{h, w, _} = puzzle.shape
len = max(h, w)
puzzle = Evision.resize(puzzle, {len, len})

for x <- 0..8, y <- 0..8, reduce: puzzle do
  vis ->
    # compute the coordinates of where the digit will be drawn
    # on the output puzzle image
    {x1, y1} = {step * x, step * y}
    {x2, y2} = {step + x1, step + y1}
    textX = trunc((x2 - x1) * 0.33) + x1
    textY = trunc((y2 - y1) * -0.2) + y2

    text = to_string(Nx.to_number(extracted_puzzle[[y, x]]))

    Evision.putText(
      vis,
      text,
      {textX, textY},
      Evision.Constant.cv_FONT_HERSHEY_SIMPLEX(),
      0.9,
      {0, 255, 255},
      thickness: 2
    )
end
%Evision.Mat{
  channels: 3,
  dims: 2,
  type: {:u, 8},
  raw_type: 16,
  shape: {771, 771, 3},
  ref: #Reference<0.736436033.3588620308.228888>
}
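
If you want to keep the annotated image, bind the result of the for expression above to a variable (say annotated) and write it to disk with Evision.imwrite/2 (the filename below is just a placeholder):

# assuming the `for ... reduce:` result above was bound to `annotated`
Evision.imwrite("sudoku_annotated.png", annotated)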