TXTOCR

Third place solution for "Text OCR"

At a high level, there are two steps involved:

1. Detect word boundaries and trim the image.

2. Train a deep learning model with LSTM layers and a CTC layer.

In [ ]:

# Inspired from
# https://keras.io/examples/vision/captcha_ocr/

In [ ]:

# from google.colab import drive
# drive.mount('/content/drive')

Setup¶

In [ ]:

import os
import numpy as np
import matplotlib.pyplot as plt

import math
import cv2

import pandas as pd

from scipy.stats import mode

from pathlib import Path

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from google.colab import files

from tqdm.notebook import tnrange, tqdm

In [ ]:

%%time

!pip install git+https://gitlab.aicrowd.com/yoogottamk/aicrowd-cli.git > /dev/null
API_KEY = """SECRET_KEY""" #Input your API key here, you can get it from your profile page.
!aicrowd login --api-key $API_KEY > /dev/null

  Running command git clone -q https://gitlab.aicrowd.com/yoogottamk/aicrowd-cli.git /tmp/pip-req-build-w80c7itu
CPU times: user 27 ms, sys: 13.9 ms, total: 40.9 ms
Wall time: 9.28 s

In [ ]:

%%time
!aicrowd dataset download -c txtocr >/dev/null

CPU times: user 33.3 ms, sys: 8.27 ms, total: 41.5 ms
Wall time: 21.2 s

In [ ]:

%%time

!rm -rf data
!mkdir data

!mv train.csv data/train.csv
!mv val.csv data/val.csv

!unzip train.zip -d data/ > /dev/null
!unzip val.zip -d data/ > /dev/null
!unzip test.zip -d data/ > /dev/null

CPU times: user 28.1 ms, sys: 42.4 ms, total: 70.5 ms
Wall time: 4.34 s

In [ ]:

train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")

In [ ]:

# Adding full image path
train_df['image_id'] = "data/train/"+train_df['image_id'].astype(str)+".png"
train_df

Out[ ]:

	image_id	label
0	data/train/0.png	inventory
1	data/train/1.png	directories letter
2	data/train/2.png	growth splints
3	data/train/3.png	kicks
4	data/train/4.png	seventies
...	...	...
39995	data/train/39995.png	output dioxide
39996	data/train/39996.png	cruises fellow
39997	data/train/39997.png	turn
39998	data/train/39998.png	drift search
39999	data/train/39999.png	handler

40000 rows × 2 columns

In [ ]:

# Image paths as strings, one per training row
images = train_df['image_id'].astype(str).values

# Right-pad every label with spaces to the length of the longest one, so that
# all labels have the same number of characters.
# Background: https://github.com/tensorflow/tensorflow/issues/40919
raw_labels = train_df['label'].astype(str).values
max_length = max([len(label) for label in raw_labels])
labels = [label.ljust(max_length) for label in raw_labels]

# Vocabulary: every distinct character that occurs in the padded labels
# (this includes the space character used for padding)
characters = set(char for label in labels for char in label)

In [ ]:

print("Number of images found: ", len(images))
print("Number of labels found: ", len(labels))
print("Number of unique characters: ", len(characters))
print("Characters present: ", characters)

Number of images found:  40000
Number of labels found:  40000
Number of unique characters:  28
Characters present:  {'o', 'j', '.', 'f', 'u', 'x', 'q', ' ', 'b', 'z', 'p', 'w', 'v', 'l', 'h', 't', 'm', 'a', 'r', 's', 'y', 'g', 'd', 'n', 'c', 'k', 'e', 'i'}

In [ ]:

# Batch size for training and validation
batch_size = 32

# Desired image dimensions (pixels)
img_width = 235
img_height = 25

# Length of the longest label. The labels were space-padded to a common
# length earlier, so every entry of `labels` has exactly this length.
max_length = max([len(label) for label in labels])
max_length

Out[ ]:

Preprocessing¶

In [ ]:

# https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup

# num_oov_indices
# The number of out-of-vocabulary tokens to use
# if this value is 0, passing an OOV input will result in a '-1' being returned for that value in the output tensor. 

# mask_token
# A token that represents masked values, and which is mapped to index 0.
# If set to None, no mask term will be added and the OOV tokens, if any, will be indexed from (0...num_oov_indices) instead of (1...num_oov_indices+1).

# invert
# If true, this layer will map indices to vocabulary items instead of mapping vocabulary items to indices.

# Mapping characters to integers.
# `characters` is a set, and the iteration order of a set of strings can differ
# between Python sessions. Sorting pins the character -> index assignment so
# that weights trained in one session still decode correctly in another.
char_to_num = layers.experimental.preprocessing.StringLookup(vocabulary=sorted(characters), num_oov_indices=0, mask_token=None)

# Mapping integers back to original characters
# NOTE(review): unlike char_to_num, this layer keeps the default
# num_oov_indices -- confirm on the installed TensorFlow version that
# num_to_char(char_to_num(c)) returns c for every character.
num_to_char = layers.experimental.preprocessing.StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)

In [ ]:

def split_data(images, labels, train_size=0.9, shuffle=True):
    """Split parallel arrays of images and labels into train and validation parts.

    Args:
        images: NumPy array of samples (indexed with an integer index array).
        labels: NumPy array of labels, aligned with `images`.
        train_size: fraction of the samples that goes into the training part.
        shuffle: when true, the sample order is permuted with
            `np.random.shuffle` before the split.

    Returns:
        Tuple `(x_train, x_valid, y_train, y_valid)`.
    """
    total = len(images)

    # Work on an index permutation so images and labels stay paired
    order = np.arange(total)
    if shuffle:
        np.random.shuffle(order)

    # The first `cut` positions of the permutation form the training part
    cut = int(total * train_size)
    train_idx, valid_idx = order[:cut], order[cut:]

    return images[train_idx], images[valid_idx], labels[train_idx], labels[valid_idx]

In [ ]:

# Splitting data into training and validation sets
x_train, x_valid, y_train, y_valid = split_data(np.array(images), np.array(labels))

Word Segmentation¶

Use word segmentation to locate the words in each image, then trim the image down to a detected word region.

In [ ]:

# https://github.com/githubharald/WordDetector/blob/master/src/WordSegmentation.py
def wordSegmentation(img, kernelSize=25, sigma=11, theta=7, minArea=0):
    """Segment a text-line image into word candidates.

    Scale space technique for word segmentation proposed by R. Manmatha:
    http://ciir.cs.umass.edu/pubfiles/mm-27.pdf

    Args:
        img: grayscale uint8 image of the text-line to be segmented.
        kernelSize: size of filter kernel, must be an odd integer.
        sigma: standard deviation of Gaussian function used for filter kernel.
        theta: approximated width/height ratio of words, filter function is
            distorted by this factor.
        minArea: ignore word candidates smaller than specified area.

    Returns:
        List of `(bounding_box, word_image)` tuples sorted by the x-coordinate
        of the box, where `bounding_box` is `(x, y, w, h)` and `word_image`
        is the matching crop of `img`.
    """
    # Filter with the anisotropic kernel, then binarise with Otsu and invert
    blob_kernel = createKernel(kernelSize, sigma, theta)
    filtered = cv2.filter2D(img, -1, blob_kernel, borderType=cv2.BORDER_REPLICATE).astype(np.uint8)
    _, binary = cv2.threshold(filtered, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    binary = 255 - binary

    # OpenCV 3.x returns three values from findContours, other versions two.
    # The two branches also use different retrieval modes (RETR_EXTERNAL vs
    # RETR_LIST); that difference is kept exactly as it was.
    if cv2.__version__.startswith('3.'):
        _, contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    else:
        contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

    words = []
    for contour in contours:
        # Candidates below the area threshold are dropped
        if cv2.contourArea(contour) < minArea:
            continue
        box = cv2.boundingRect(contour)  # (x, y, w, h)
        left, top, width, height = box
        words.append((box, img[top:top + height, left:left + width]))

    # Left-to-right reading order
    words.sort(key=lambda entry: entry[0][0])
    return words


def prepareImg(img, height):
  """Return `img` as a grayscale image scaled to the requested height.

  A 3-dimensional input is converted from BGR to grayscale first; a
  2-dimensional input is used as is. Width and height are scaled by the same
  factor, `height / img.shape[0]`.
  """
  assert img.ndim in (2, 3)
  gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img
  scale = height / gray.shape[0]
  return cv2.resize(gray, dsize=None, fx=scale, fy=scale)


def createKernel(kernelSize, sigma, theta):
    """Create the anisotropic filter kernel used for word segmentation.

    Args:
        kernelSize: side length of the square kernel, must be odd.
        sigma: spread used along the first kernel axis.
        theta: factor by which the spread along the second axis is enlarged.

    Returns:
        `kernelSize` x `kernelSize` float array, scaled so its entries sum to 1.
    """
    assert kernelSize % 2  # must be odd size
    half = kernelSize // 2

    sigma_x = sigma
    sigma_y = sigma * theta

    # Denominators of the two second-derivative terms (constant over the grid)
    denom_x = 2 * math.pi * sigma_x**5 * sigma_y
    denom_y = 2 * math.pi * sigma_y**5 * sigma_x

    kernel = np.zeros([kernelSize, kernelSize])
    for row, col in np.ndindex(kernelSize, kernelSize):
        # Offsets relative to the kernel centre
        dx = row - half
        dy = col - half

        envelope = np.exp(-dx**2 / (2 * sigma_x) - dy**2 / (2 * sigma_y))
        x_part = (dx**2 - sigma_x**2) / denom_x
        y_part = (dy**2 - sigma_y**2) / denom_y

        kernel[row, col] = (x_part + y_part) * envelope

    return kernel / np.sum(kernel)

In [ ]:

def trim_image(img_path, label):
  """Replace the image at `img_path` with a small crop of one detected word.

  Images whose height is already at most `img_height` are left untouched, so
  running this twice on the same file is harmless. Otherwise the background
  colour is whitened, words are located with `wordSegmentation`, and the first
  word box that does not start at the image origin is pasted into the top-left
  corner of a white `img_height` x `img_width` canvas, which is then written
  back to `img_path`.

  Args:
    img_path: path of the image file; it is overwritten in place.
    label: unused, kept so the function can be called with (path, label) pairs.

  Raises:
    FileNotFoundError: if the image cannot be read.
    ValueError: if the selected word box does not fit on the canvas (raised by
      the NumPy assignment); the file is not rewritten in that case.
  """
  test_img = cv2.imread(img_path)
  if test_img is None:
    # cv2.imread signals an unreadable/missing file by returning None
    raise FileNotFoundError(f"could not read image: {img_path}")
  if test_img.shape[0] <= img_height:
    return  # already trimmed

  src_h, src_w = test_img.shape[:2]
  pixels = test_img.reshape((-1, 3))

  # Background colour = most frequent value of each channel. Flattening the
  # result makes this work whether or not scipy keeps the reduced axis.
  mx = np.asarray(mode(pixels, axis=0)[0]).reshape(-1)
  background = (pixels[:, 0] == mx[0]) & (pixels[:, 1] == mx[1]) & (pixels[:, 2] == mx[2])
  pixels[background] = [255, 255, 255]

  img = pixels.reshape((src_h, src_w, 3))
  # Target height equals the source height, so this only converts to grayscale
  img = prepareImg(img, src_h)

  res = wordSegmentation(img, kernelSize=11, sigma=11, theta=7, minArea=100)

  # White canvas in the size the network expects; uint8 is what gets written
  canvas = np.full((img_height, img_width, 1), 255, dtype=np.uint8)
  for (x, y, box_w, box_h), word_img in res:
    # Boxes anchored at the image origin are skipped
    if x > 0 or y > 0:
      canvas[:box_h, :box_w, 0] = word_img
      break
  cv2.imwrite(img_path, canvas)

In [ ]:

# Trim every training image in place. A failure on one image is reported and
# skipped; catching Exception (not a bare except) keeps Ctrl-C working.
for i in tnrange(x_train.shape[0]):
  try:
    trim_image(x_train[i], y_train[i])
  except Exception as err:
    print(f"error {i} ({x_train[i]}): {err!r}")

In [ ]:

# Trim every validation image in place. A failure on one image is reported and
# skipped; catching Exception (not a bare except) keeps Ctrl-C working.
for i in tnrange(x_valid.shape[0]):
  try:
    trim_image(x_valid[i], y_valid[i])
  except Exception as err:
    print(f"error {i} ({x_valid[i]}): {err!r}")

In [ ]:

def encode_single_sample(img_path, label):
  """Load one PNG and its label and convert both into model inputs.

  Args:
    img_path: path of the PNG file.
    label: label text for the image.

  Returns:
    Dict with "image" (float32 tensor in [0, 1], resized to
    `img_height` x `img_width` and transposed so that width comes first) and
    "label" (the label's characters mapped through `char_to_num`).
  """
  # Decode the file as a single-channel image
  png_bytes = tf.io.read_file(img_path)
  pixels = tf.io.decode_png(png_bytes, channels=1)

  # Scale to float32 in [0, 1] and bring to the size the model expects
  pixels = tf.image.convert_image_dtype(pixels, tf.float32)
  pixels = tf.image.resize(pixels, [img_height, img_width])

  # Width becomes the leading axis: it acts as the time dimension for the RNN
  pixels = tf.transpose(pixels, perm=[1, 0, 2])

  # Characters -> integer ids
  chars = tf.strings.unicode_split(label, input_encoding="UTF-8")
  encoded_label = char_to_num(chars)

  # The model has two named inputs, hence the dict
  return {"image": pixels, "label": encoded_label}

Create `Dataset` objects¶

In [ ]:

def _make_dataset(paths, texts):
    """Build a batched, prefetched dataset of encoded (image, label) samples."""
    autotune = tf.data.experimental.AUTOTUNE
    samples = tf.data.Dataset.from_tensor_slices((paths, texts))
    encoded = samples.map(encode_single_sample, num_parallel_calls=autotune)
    return encoded.batch(batch_size).prefetch(buffer_size=autotune)


train_dataset = _make_dataset(x_train, y_train)
validation_dataset = _make_dataset(x_valid, y_valid)

Visualize the data¶

In [ ]:

# Show the first 16 training samples with their labels.
# Only one batch is taken: the grid has 16 cells, so a second batch would just
# be drawn on top of the first. The batch is bound to its own names so the
# module-level `images` / `labels` are not overwritten.
_, ax = plt.subplots(4, 4, figsize=(15, 10))
for batch in train_dataset.take(1):
    batch_images = batch["image"]
    batch_labels = batch["label"]
    for i in range(16):
        cell = ax[i // 4, i % 4]
        img = (batch_images[i] * 255).numpy().astype("uint8")
        label = tf.strings.reduce_join(num_to_char(batch_labels[i])).numpy().decode("utf-8")
        # Samples are stored width-first; transpose back for display
        cell.imshow(img[:, :, 0].T, cmap="gray")
        cell.set_title(label)
plt.show()

In [ ]:

# Show the first 16 validation samples with their labels.
# The batch is bound to its own names so the module-level `images` / `labels`
# are not overwritten.
_, ax = plt.subplots(4, 4, figsize=(15, 10))
for batch in validation_dataset.take(1):
    batch_images = batch["image"]
    batch_labels = batch["label"]
    for i in range(16):
        cell = ax[i // 4, i % 4]
        img = (batch_images[i] * 255).numpy().astype("uint8")
        label = tf.strings.reduce_join(num_to_char(batch_labels[i])).numpy().decode("utf-8")
        # Samples are stored width-first; transpose back for display
        cell.imshow(img[:, :, 0].T, cmap="gray")
        cell.set_title(label)
plt.show()

Model¶

In [ ]:

class CTCLayer(layers.Layer):
    """Layer that attaches the CTC loss to the model and passes predictions through."""

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Register the CTC loss for this batch and return `y_pred` unchanged."""
        true_shape = tf.shape(y_true)
        n_samples = tf.cast(true_shape[0], dtype="int64")
        n_label_steps = tf.cast(true_shape[1], dtype="int64")
        n_time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")

        # Every sample is given the full prediction length and the full label
        # width as its sequence lengths
        per_sample = tf.ones(shape=(n_samples, 1), dtype="int64")
        input_length = n_time_steps * per_sample
        label_length = n_label_steps * per_sample

        self.add_loss(self.loss_fn(y_true, y_pred, input_length, label_length))

        # The predictions themselves are the layer output
        return y_pred


def build_model():
    """Build and compile the Conv + bidirectional-LSTM model with a CTC loss layer.

    Reads the module-level `img_width`, `img_height` and `characters`.

    Returns:
        Compiled Keras model named "ocr_model_v1" with two inputs, "image" and
        "label". Its output is the per-time-step softmax over
        `len(characters) + 1` classes; the loss is added by the "ctc_loss" layer.
    """
    n_filters = 64

    # Inputs to the model
    input_img = layers.Input(
        shape=(img_width, img_height, 1), name="image", dtype="float32"
    )
    labels = layers.Input(name="label", shape=(None,), dtype="float32")

    # First conv block
    x = layers.Conv2D(
        n_filters,
        (3, 3),
        activation="relu",
        padding="same",
        name="Conv1",
    )(input_img)
    x = layers.MaxPooling2D((2, 2), name="pool1")(x)

    # The 2x2 pooling halves (rounding down) both spatial dimensions. Derive
    # the sequence shape from the image size instead of hard-coding it, so a
    # change to img_width / img_height does not break the reshape.
    new_shape = (img_width // 2, (img_height // 2) * n_filters)
    x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
    x = layers.Dense(128, activation="relu", name="dense1")(x)
    x = layers.Dropout(0.2)(x)

    # RNNs
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)

    # Output layer: one class per character plus one extra class
    x = layers.Dense(len(characters) + 1, activation="softmax", name="dense2")(x)

    # Add CTC layer for calculating CTC loss at each step
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
    )
    # No loss is passed to compile(): it comes from the CTC layer's add_loss()
    opt = keras.optimizers.Adam()
    model.compile(optimizer=opt)
    return model


# Get the model
model = build_model()
model.summary()

Model: "ocr_model_v1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
image (InputLayer)              [(None, 235, 25, 1)] 0                                            
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (None, 235, 25, 64)  640         image[0][0]                      
__________________________________________________________________________________________________
pool1 (MaxPooling2D)            (None, 117, 12, 64)  0           Conv1[0][0]                      
__________________________________________________________________________________________________
reshape (Reshape)               (None, 117, 768)     0           pool1[0][0]                      
__________________________________________________________________________________________________
dense1 (Dense)                  (None, 117, 128)     98432       reshape[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 117, 128)     0           dense1[0][0]                     
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 117, 256)     263168      dropout[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 117, 128)     164352      bidirectional[0][0]              
__________________________________________________________________________________________________
label (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
dense2 (Dense)                  (None, 117, 29)      3741        bidirectional_1[0][0]            
__________________________________________________________________________________________________
ctc_loss (CTCLayer)             (None, 117, 29)      0           label[0][0]                      
                                                                 dense2[0][0]                     
==================================================================================================
Total params: 530,333
Trainable params: 530,333
Non-trainable params: 0
__________________________________________________________________________________________________

Training¶

In [ ]:

# Upper bound on the number of epochs; early stopping (below) may end
# training sooner.
epochs = 400
# epochs = 3  (presumably a quick smoke-test setting)

# Add early stopping: stop once val_loss has not improved for 30 consecutive
# epochs, and roll the model back to the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=30, restore_best_weights=True
)

In [ ]:

%%time

# Train the model. `history` keeps the object returned by fit(); validation
# loss is evaluated on validation_dataset and drives the early-stopping callback.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=[early_stopping],
)

Epoch 1/400
1125/1125 [==============================] - 95s 77ms/step - loss: 46.8700 - val_loss: 36.9004
Epoch 2/400
1125/1125 [==============================] - 83s 74ms/step - loss: 37.7375 - val_loss: 35.5993
Epoch 3/400
1125/1125 [==============================] - 89s 79ms/step - loss: 35.5441 - val_loss: 35.4597
Epoch 4/400
1125/1125 [==============================] - 86s 77ms/step - loss: 35.0401 - val_loss: 33.9017
Epoch 5/400
1125/1125 [==============================] - 87s 77ms/step - loss: 35.8359 - val_loss: 33.0631
Epoch 6/400
1125/1125 [==============================] - 87s 77ms/step - loss: 33.1114 - val_loss: 34.3963
Epoch 7/400
1125/1125 [==============================] - 89s 79ms/step - loss: 31.8982 - val_loss: 27.9485
Epoch 8/400
1125/1125 [==============================] - 89s 79ms/step - loss: 28.4396 - val_loss: 24.4276
Epoch 9/400
1125/1125 [==============================] - 89s 79ms/step - loss: 27.0137 - val_loss: 19.4449
Epoch 10/400
1125/1125 [==============================] - 90s 80ms/step - loss: 20.6134 - val_loss: 15.3185
Epoch 11/400
1125/1125 [==============================] - 90s 80ms/step - loss: 16.9711 - val_loss: 12.6856
Epoch 12/400
1125/1125 [==============================] - 90s 80ms/step - loss: 14.5089 - val_loss: 11.3033
Epoch 13/400
1125/1125 [==============================] - 89s 79ms/step - loss: 12.9571 - val_loss: 10.3907
Epoch 14/400
1125/1125 [==============================] - 89s 79ms/step - loss: 11.8991 - val_loss: 9.6273
Epoch 15/400
1125/1125 [==============================] - 89s 79ms/step - loss: 11.2281 - val_loss: 9.2238
Epoch 16/400
1125/1125 [==============================] - 89s 79ms/step - loss: 10.7809 - val_loss: 8.7400
Epoch 17/400
1125/1125 [==============================] - 89s 79ms/step - loss: 10.0599 - val_loss: 8.3878
Epoch 18/400
1125/1125 [==============================] - 89s 79ms/step - loss: 9.6996 - val_loss: 8.3386
Epoch 19/400
1125/1125 [==============================] - 89s 79ms/step - loss: 9.2963 - val_loss: 7.8502
Epoch 20/400
1125/1125 [==============================] - 89s 79ms/step - loss: 9.0327 - val_loss: 7.7680
Epoch 21/400
1125/1125 [==============================] - 88s 78ms/step - loss: 8.7109 - val_loss: 7.6727
Epoch 22/400
1125/1125 [==============================] - 88s 78ms/step - loss: 8.5610 - val_loss: 7.5941
Epoch 23/400
1125/1125 [==============================] - 88s 78ms/step - loss: 8.3549 - val_loss: 7.3699
Epoch 24/400
1125/1125 [==============================] - 88s 79ms/step - loss: 8.1578 - val_loss: 7.4173
Epoch 25/400
1125/1125 [==============================] - 89s 79ms/step - loss: 8.0239 - val_loss: 7.1488
Epoch 26/400
1125/1125 [==============================] - 89s 79ms/step - loss: 8.7627 - val_loss: 7.3471
Epoch 27/400
1125/1125 [==============================] - 89s 79ms/step - loss: 8.0096 - val_loss: 7.1765
Epoch 28/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.7866 - val_loss: 6.9899
Epoch 29/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.6530 - val_loss: 7.0307
Epoch 30/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.5798 - val_loss: 7.0151
Epoch 31/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.5237 - val_loss: 7.0099
Epoch 32/400
1125/1125 [==============================] - 89s 79ms/step - loss: 7.4506 - val_loss: 6.9158
Epoch 33/400
1125/1125 [==============================] - 89s 79ms/step - loss: 7.3844 - val_loss: 7.0288
Epoch 34/400
1125/1125 [==============================] - 89s 79ms/step - loss: 7.4062 - val_loss: 6.8837
Epoch 35/400
1125/1125 [==============================] - 88s 79ms/step - loss: 7.3170 - val_loss: 6.8047
Epoch 36/400
1125/1125 [==============================] - 89s 79ms/step - loss: 7.2100 - val_loss: 6.8833
Epoch 37/400
1125/1125 [==============================] - 87s 77ms/step - loss: 7.2245 - val_loss: 6.7532
Epoch 38/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.1663 - val_loss: 6.8330
Epoch 39/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.2074 - val_loss: 6.6692
Epoch 40/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.0731 - val_loss: 6.6811
Epoch 41/400
1125/1125 [==============================] - 88s 79ms/step - loss: 7.5129 - val_loss: 6.7899
Epoch 42/400
1125/1125 [==============================] - 88s 79ms/step - loss: 7.1186 - val_loss: 6.6455
Epoch 43/400
1125/1125 [==============================] - 88s 79ms/step - loss: 6.9863 - val_loss: 6.6346
Epoch 44/400
1125/1125 [==============================] - 88s 78ms/step - loss: 6.9285 - val_loss: 6.7346
Epoch 45/400
1125/1125 [==============================] - 88s 78ms/step - loss: 7.0492 - val_loss: 6.6364
Epoch 46/400
1125/1125 [==============================] - 87s 78ms/step - loss: 6.9148 - val_loss: 6.6134
Epoch 47/400
1125/1125 [==============================] - 87s 78ms/step - loss: 6.8988 - val_loss: 6.6835
Epoch 48/400
1125/1125 [==============================] - 87s 77ms/step - loss: 6.9926 - val_loss: 6.7123
Epoch 49/400
1125/1125 [==============================] - 87s 77ms/step - loss: 6.9189 - val_loss: 6.6878
Epoch 50/400
1125/1125 [==============================] - 86s 76ms/step - loss: 6.9223 - val_loss: 6.7753
Epoch 51/400
1125/1125 [==============================] - 85s 75ms/step - loss: 6.9467 - val_loss: 6.6063
Epoch 52/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.8585 - val_loss: 6.5611
Epoch 53/400
1125/1125 [==============================] - 86s 76ms/step - loss: 6.7276 - val_loss: 6.5575
Epoch 54/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.7755 - val_loss: 6.6025
Epoch 55/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.8066 - val_loss: 6.6395
Epoch 56/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.7391 - val_loss: 6.5602
Epoch 57/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.7054 - val_loss: 6.8991
Epoch 58/400
1125/1125 [==============================] - 84s 74ms/step - loss: 6.8656 - val_loss: 6.8013
Epoch 59/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.9445 - val_loss: 6.5150
Epoch 60/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.7712 - val_loss: 6.5681
Epoch 61/400
1125/1125 [==============================] - 84s 74ms/step - loss: 6.6712 - val_loss: 6.5647
Epoch 62/400
1125/1125 [==============================] - 83s 74ms/step - loss: 6.8137 - val_loss: 6.5368
Epoch 63/400
1125/1125 [==============================] - 85s 75ms/step - loss: 6.7327 - val_loss: 6.6847
Epoch 64/400
1125/1125 [==============================] - 85s 76ms/step - loss: 7.1577 - val_loss: 6.5169
Epoch 65/400
1125/1125 [==============================] - 83s 74ms/step - loss: 6.7037 - val_loss: 6.5050
Epoch 66/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.7560 - val_loss: 7.0524
Epoch 67/400
1125/1125 [==============================] - 85s 75ms/step - loss: 6.9598 - val_loss: 6.5437
Epoch 68/400
1125/1125 [==============================] - 85s 75ms/step - loss: 6.9838 - val_loss: 7.2453
Epoch 69/400
1125/1125 [==============================] - 85s 76ms/step - loss: 7.4513 - val_loss: 7.2874
Epoch 70/400
1125/1125 [==============================] - 84s 74ms/step - loss: 7.5807 - val_loss: 7.5899
Epoch 71/400
1125/1125 [==============================] - 86s 76ms/step - loss: 7.5248 - val_loss: 7.3639
Epoch 72/400
1125/1125 [==============================] - 83s 74ms/step - loss: 8.1322 - val_loss: 7.2352
Epoch 73/400
1125/1125 [==============================] - 83s 74ms/step - loss: 7.6353 - val_loss: 6.7875
Epoch 74/400
1125/1125 [==============================] - 85s 76ms/step - loss: 7.1127 - val_loss: 6.9016
Epoch 75/400
1125/1125 [==============================] - 85s 75ms/step - loss: 8.3543 - val_loss: 7.1245
Epoch 76/400
1125/1125 [==============================] - 84s 75ms/step - loss: 7.3673 - val_loss: 6.7835
Epoch 77/400
1125/1125 [==============================] - 82s 73ms/step - loss: 7.0334 - val_loss: 6.9173
Epoch 78/400
1125/1125 [==============================] - 82s 73ms/step - loss: 7.1928 - val_loss: 6.9093
Epoch 79/400
1125/1125 [==============================] - 85s 75ms/step - loss: 7.1595 - val_loss: 6.8475
Epoch 80/400
1125/1125 [==============================] - 83s 74ms/step - loss: 7.0391 - val_loss: 6.8728
Epoch 81/400
1125/1125 [==============================] - 83s 74ms/step - loss: 6.9991 - val_loss: 6.8571
Epoch 82/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.9960 - val_loss: 6.8143
Epoch 83/400
1125/1125 [==============================] - 85s 75ms/step - loss: 6.9563 - val_loss: 6.7544
Epoch 84/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.9012 - val_loss: 6.8077
Epoch 85/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.9911 - val_loss: 6.8287
Epoch 86/400
1125/1125 [==============================] - 85s 75ms/step - loss: 7.1026 - val_loss: 6.7481
Epoch 87/400
1125/1125 [==============================] - 83s 74ms/step - loss: 6.8594 - val_loss: 6.7476
Epoch 88/400
1125/1125 [==============================] - 85s 75ms/step - loss: 7.0192 - val_loss: 6.8026
Epoch 89/400
1125/1125 [==============================] - 85s 76ms/step - loss: 6.8400 - val_loss: 6.8680
Epoch 90/400
1125/1125 [==============================] - 84s 74ms/step - loss: 6.8117 - val_loss: 6.7519
Epoch 91/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.8644 - val_loss: 6.7582
Epoch 92/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.7561 - val_loss: 6.8289
Epoch 93/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.7425 - val_loss: 6.7381
Epoch 94/400
1125/1125 [==============================] - 84s 75ms/step - loss: 6.7423 - val_loss: 6.6709
Epoch 95/400
1125/1125 [==============================] - 85s 75ms/step - loss: 6.8552 - val_loss: 7.0135
CPU times: user 3h 13min 51s, sys: 21min 54s, total: 3h 35min 46s
Wall time: 2h 16min 53s

In [ ]:

# model.save('/content/drive/My Drive/Colab Notebooks/txtocr.h5')

Inference¶

In [ ]:

# Get the prediction model by extracting layers till the output layer.
# It takes only the "image" input and ends at the softmax layer "dense2", so
# neither the "label" input nor the CTC loss layer is part of it.
prediction_model = keras.models.Model(
    model.get_layer(name="image").input, model.get_layer(name="dense2").output
)
prediction_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
image (InputLayer)           [(None, 235, 25, 1)]      0         
_________________________________________________________________
Conv1 (Conv2D)               (None, 235, 25, 64)       640       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 117, 12, 64)       0         
_________________________________________________________________
reshape (Reshape)            (None, 117, 768)          0         
_________________________________________________________________
dense1 (Dense)               (None, 117, 128)          98432     
_________________________________________________________________
dropout (Dropout)            (None, 117, 128)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 117, 256)          263168    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 117, 128)          164352    
_________________________________________________________________
dense2 (Dense)               (None, 117, 29)           3741      
=================================================================
Total params: 530,333
Trainable params: 530,333
Non-trainable params: 0
_________________________________________________________________

In [ ]:

# A utility function to decode the output of the network
def decode_batch_predictions(pred):
    """Turn a batch of network outputs into a list of decoded strings.

    Args:
        pred: batch of predictions; axis 0 is the sample, axis 1 the time step.

    Returns:
        List with one decoded text per sample, cut to at most `max_length`
        decoded positions.
    """
    n_samples, n_steps = pred.shape[0], pred.shape[1]
    # Every sample uses the full sequence length
    seq_lengths = np.ones(n_samples) * n_steps

    # Use greedy search. For complex tasks, you can use beam search
    decoded, _ = keras.backend.ctc_decode(pred, input_length=seq_lengths, greedy=True)
    clipped = decoded[0][:, :max_length]

    # Map the integer ids back to characters and join them per sample
    return [
        tf.strings.reduce_join(num_to_char(row)).numpy().decode("utf-8")
        for row in clipped
    ]

In [ ]:

# Let's check results on some validation samples: for each of the first three
# validation batches, plot 16 images titled with the decoded prediction.
# (The ground-truth texts were previously decoded here as well but never
# displayed, so that unused work has been removed.)
for batch in validation_dataset.take(3):
    batch_images = batch["image"]

    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)

    _, ax = plt.subplots(4, 4, figsize=(15, 5))
    for i in range(16):
        cell = ax[i // 4, i % 4]
        img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8)
        # Samples are stored width-first; transpose back for display
        img = img.T
        title = f"Prediction: {pred_texts[i]}"
        cell.imshow(img, cmap="gray")
        cell.set_title(title)
        cell.axis("off")
plt.show()

Prediction¶

In [ ]:

# Predict a label for every file in the test directory and collect the
# results in `test_df` (columns: image_id, label).
path = "data/test"
test_imgs_paths = sorted(os.listdir(path))

# Set to True to display each test image with its predicted label
visualize = False

image_ids = []
labels = []

for file_name in tqdm(test_imgs_paths):
  # The id is the part of the file name before the first dot
  image_ids.append(file_name.split(".")[0])

  test_img_path = f"{path}/{file_name}"
  test_img = cv2.imread(test_img_path)

  # Images taller than 25 px have not been trimmed yet: trim the file in
  # place, then reload it
  if test_img.shape[0] > 25:
    trim_image(test_img_path, "")
    test_img = cv2.imread(test_img_path)

  # Only the "image" entry is used; the label argument is a placeholder
  enc = encode_single_sample(test_img_path, "label")

  single_batch = tf.expand_dims(enc['image'], axis=0)
  preds = prediction_model.predict(single_batch)
  pred_texts = decode_batch_predictions(preds)
  label = pred_texts[0].strip()
  labels.append(label)

  if visualize:
    print(test_img_path)
    fig, ax = plt.subplots()
    im = ax.imshow(test_img, cmap="gray")
    ax.set_title(label)
    ax.axis('off')
    plt.show()

test_df = pd.DataFrame.from_dict({'image_id': image_ids, 'label': labels})

In [ ]:

# image_id holds strings here (taken from the file names), so this sort is
# lexicographic: "10" comes before "2".
test_df.sort_values('image_id', inplace=True)
test_df.to_csv("submission.csv", index=False)
# `files` comes from google.colab, so this line only works when run in Colab
files.download('submission.csv')

Well done! 👍 We are all set to make a submission and see your name on the leaderboard. Let's navigate to the challenge page and make one.¶

Content

1159

Show Comments

Comments

You must login before you can post a comment.

TXTOCR

Third place solution for "Text OCR"

Setup¶

Preprocessing¶

Word Segmentation¶

Create Dataset objects¶

Visualize the data¶

Model¶

Training¶

Inference¶

Prediction¶

Well done! 👍 We are all set to make a submission and see your name on the leaderboard. Let's navigate to the challenge page and make one.¶

Content

Create `Dataset` objects¶