Skip to content

Commit

Permalink
[orientation] page orientation improvements (#1553)
Browse files Browse the repository at this point in the history
  • Loading branch information
felixdittrich92 committed Jun 12, 2024
1 parent 4d9552b commit a26bea5
Show file tree
Hide file tree
Showing 12 changed files with 217 additions and 72 deletions.
73 changes: 55 additions & 18 deletions doctr/models/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import numpy as np
from langdetect import LangDetectException, detect_langs

from doctr.utils.geometry import rotate_image

__all__ = ["estimate_orientation", "get_language", "invert_data_structure"]


Expand All @@ -29,42 +31,63 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
return max(w / h, h / w)


def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> int:
def estimate_orientation(
img: np.ndarray,
general_page_orientation: Optional[Tuple[int, float]] = None,
n_ct: int = 70,
ratio_threshold_for_lines: float = 3,
min_confidence: float = 0.2,
lower_area: int = 100,
) -> int:
"""Estimate the angle of the general document orientation based on the
lines of the document and the assumption that they should be horizontal.
Args:
----
img: the img or bitmap to analyze (H, W, C)
general_page_orientation: the general orientation of the page (angle [0, 90, 180, 270 (-90)], confidence)
estimated by a model
n_ct: the number of contours used for the orientation estimation
ratio_threshold_for_lines: this is the w/h ratio used to discriminate lines
min_confidence: the minimum confidence to consider the general_page_orientation
lower_area: the minimum area of a contour to be considered
Returns:
-------
the angle of the general document orientation
the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
"""
assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
max_value = np.max(img)
min_value = np.min(img)
if max_value <= 1 and min_value >= 0 or (max_value <= 255 and min_value >= 0 and img.shape[-1] == 1):
thresh = img.astype(np.uint8)
if max_value <= 255 and min_value >= 0 and img.shape[-1] == 3:
thresh = None
# Convert image to grayscale if necessary
if img.shape[-1] == 3:
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray_img = cv2.medianBlur(gray_img, 5)
thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# try to merge words in lines
(h, w) = img.shape[:2]
k_x = max(1, (floor(w / 100)))
k_y = max(1, (floor(h / 100)))
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
thresh = cv2.dilate(thresh, kernel, iterations=1)
else:
thresh = img.astype(np.uint8) # type: ignore[assignment]

page_orientation, orientation_confidence = general_page_orientation or (None, 0.0)
if page_orientation and orientation_confidence >= min_confidence:
# We rotate the image to the general orientation which improves the detection
# No expand needed: the bitmap is already padded
thresh = rotate_image(thresh, -page_orientation) # type: ignore
else: # That's only required if we do not work on the detection models bin map
# try to merge words in lines
(h, w) = img.shape[:2]
k_x = max(1, (floor(w / 100)))
k_y = max(1, (floor(h / 100)))
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
thresh = cv2.dilate(thresh, kernel, iterations=1)

# extract contours
contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

# Sort contours
contours = sorted(contours, key=get_max_width_length_ratio, reverse=True)
# Filter & Sort contours
contours = sorted(
[contour for contour in contours if cv2.contourArea(contour) > lower_area],
key=get_max_width_length_ratio,
reverse=True,
)

angles = []
for contour in contours[:n_ct]:
Expand All @@ -75,10 +98,24 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li
angles.append(angle - 90)

if len(angles) == 0:
return 0 # in case no angles is found
estimated_angle = 0 # in case no angles is found
else:
median = -median_low(angles)
return round(median) if abs(median) != 0 else 0
estimated_angle = -round(median) if abs(median) != 0 else 0

# combine with the general orientation and the estimated angle
if page_orientation and orientation_confidence >= min_confidence:
# special case where the estimated angle is mostly wrong:
# case 1: - and + swapped
# case 2: estimated angle is completely wrong
# so in this case we prefer the general page orientation
if abs(estimated_angle) == abs(page_orientation):
return page_orientation
estimated_angle = estimated_angle if page_orientation == 0 else page_orientation + estimated_angle
if estimated_angle > 180:
estimated_angle -= 360

return estimated_angle # return the clockwise angle (negative - left side rotation, positive - right side rotation)


def rectify_crops(
Expand Down
3 changes: 3 additions & 0 deletions doctr/models/classification/mobilenet/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,15 +252,18 @@ def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any)
"""MobileNetV3-Small architecture as described in
`"Searching for MobileNetV3",
<https://arxiv.org/pdf/1905.02244.pdf>`_.
>>> import torch
>>> from doctr.models import mobilenet_v3_small_page_orientation
>>> model = mobilenet_v3_small_page_orientation(pretrained=False)
>>> input_tensor = torch.rand((1, 3, 512, 512), dtype=torch.float32)
>>> out = model(input_tensor)
Args:
----
pretrained: boolean, True if model is pretrained
**kwargs: keyword arguments of the MobileNetV3 architecture
Returns:
-------
a torch.nn.Module
Expand Down
3 changes: 3 additions & 0 deletions doctr/models/classification/mobilenet/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,15 +421,18 @@ def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any)
"""MobileNetV3-Small architecture as described in
`"Searching for MobileNetV3",
<https://arxiv.org/pdf/1905.02244.pdf>`_.
>>> import tensorflow as tf
>>> from doctr.models import mobilenet_v3_small_page_orientation
>>> model = mobilenet_v3_small_page_orientation(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)
Args:
----
pretrained: boolean, True if model is pretrained
**kwargs: keyword arguments of the MobileNetV3 architecture
Returns:
-------
a keras.Model
Expand Down
8 changes: 7 additions & 1 deletion doctr/models/kie_predictor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,25 @@ class _KIEPredictor(_OCRPredictor):
accordingly. Doing so will improve performances for documents with page-uniform rotations.
preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding)
symmetric_pad: if True and preserve_aspect_ratio is True, pad the image symmetrically.
detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
page. Doing so will slightly deteriorate the overall latency.
kwargs: keyword args of `DocumentBuilder`
"""

crop_orientation_predictor: Optional[OrientationPredictor]
page_orientation_predictor: Optional[OrientationPredictor]

def __init__(
self,
assume_straight_pages: bool = True,
straighten_pages: bool = False,
preserve_aspect_ratio: bool = True,
symmetric_pad: bool = True,
detect_orientation: bool = False,
**kwargs: Any,
) -> None:
super().__init__(assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs)
super().__init__(
assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, detect_orientation, **kwargs
)

self.doc_builder: KIEDocumentBuilder = KIEDocumentBuilder(**kwargs)
25 changes: 14 additions & 11 deletions doctr/models/kie_predictor/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from torch import nn

from doctr.io.elements import Document
from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
from doctr.models._utils import get_language, invert_data_structure
from doctr.models.detection.predictor import DetectionPredictor
from doctr.models.recognition.predictor import RecognitionPredictor
from doctr.utils.geometry import detach_scores, rotate_image
from doctr.utils.geometry import detach_scores

from .base import _KIEPredictor

Expand Down Expand Up @@ -55,7 +55,13 @@ def __init__(
self.det_predictor = det_predictor.eval() # type: ignore[attr-defined]
self.reco_predictor = reco_predictor.eval() # type: ignore[attr-defined]
_KIEPredictor.__init__(
self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
self,
assume_straight_pages,
straighten_pages,
preserve_aspect_ratio,
symmetric_pad,
detect_orientation,
**kwargs,
)
self.detect_orientation = detect_orientation
self.detect_language = detect_language
Expand Down Expand Up @@ -83,19 +89,16 @@ def forward(
for out_map in out_maps
]
if self.detect_orientation:
origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps) # type: ignore[arg-type]
orientations = [
{"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
{"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
]
else:
orientations = None
general_pages_orientations = None
origin_pages_orientations = None
if self.straighten_pages:
origin_page_orientations = (
origin_page_orientations
if self.detect_orientation
else [estimate_orientation(seq_map) for seq_map in seg_maps]
)
pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)] # type: ignore[arg-type]
pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations) # type: ignore
# Forward again to get predictions on straight pages
loc_preds = self.det_predictor(pages, **kwargs)

Expand Down
25 changes: 14 additions & 11 deletions doctr/models/kie_predictor/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
import tensorflow as tf

from doctr.io.elements import Document
from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
from doctr.models._utils import get_language, invert_data_structure
from doctr.models.detection.predictor import DetectionPredictor
from doctr.models.recognition.predictor import RecognitionPredictor
from doctr.utils.geometry import detach_scores, rotate_image
from doctr.utils.geometry import detach_scores
from doctr.utils.repr import NestedObject

from .base import _KIEPredictor
Expand Down Expand Up @@ -56,7 +56,13 @@ def __init__(
self.det_predictor = det_predictor
self.reco_predictor = reco_predictor
_KIEPredictor.__init__(
self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
self,
assume_straight_pages,
straighten_pages,
preserve_aspect_ratio,
symmetric_pad,
detect_orientation,
**kwargs,
)
self.detect_orientation = detect_orientation
self.detect_language = detect_language
Expand All @@ -83,19 +89,16 @@ def __call__(
for out_map in out_maps
]
if self.detect_orientation:
origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
orientations = [
{"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
{"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
]
else:
orientations = None
general_pages_orientations = None
origin_pages_orientations = None
if self.straighten_pages:
origin_page_orientations = (
origin_page_orientations
if self.detect_orientation
else [estimate_orientation(seq_map) for seq_map in seg_maps]
)
pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
# Forward again to get predictions on straight pages
loc_preds = self.det_predictor(pages, **kwargs) # type: ignore[assignment]

Expand Down
62 changes: 59 additions & 3 deletions doctr/models/predictor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
import numpy as np

from doctr.models.builder import DocumentBuilder
from doctr.utils.geometry import extract_crops, extract_rcrops
from doctr.utils.geometry import extract_crops, extract_rcrops, rotate_image

from .._utils import rectify_crops, rectify_loc_preds
from ..classification import crop_orientation_predictor
from .._utils import estimate_orientation, rectify_crops, rectify_loc_preds
from ..classification import crop_orientation_predictor, page_orientation_predictor
from ..classification.predictor import OrientationPredictor

__all__ = ["_OCRPredictor"]
Expand All @@ -29,27 +29,83 @@ class _OCRPredictor:
accordingly. Doing so will improve performances for documents with page-uniform rotations.
preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding)
symmetric_pad: if True and preserve_aspect_ratio is True, pad the image symmetrically.
detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
page. Doing so will slightly deteriorate the overall latency.
**kwargs: keyword args of `DocumentBuilder`
"""

crop_orientation_predictor: Optional[OrientationPredictor]
page_orientation_predictor: Optional[OrientationPredictor]

def __init__(
    self,
    assume_straight_pages: bool = True,
    straighten_pages: bool = False,
    preserve_aspect_ratio: bool = True,
    symmetric_pad: bool = True,
    detect_orientation: bool = False,
    **kwargs: Any,
) -> None:
    """Store the predictor options and instantiate the orientation predictors that are needed.

    Args:
    ----
        assume_straight_pages: when False, a pretrained crop orientation predictor is created
        straighten_pages: stored on the instance; also triggers creation of the page orientation predictor
        preserve_aspect_ratio: stored on the instance
        symmetric_pad: stored on the instance
        detect_orientation: when True, a pretrained page orientation predictor is created
        **kwargs: forwarded to `DocumentBuilder`
    """
    self.assume_straight_pages = assume_straight_pages
    self.straighten_pages = straighten_pages

    # The crop-level predictor is only created when pages are not assumed to be straight
    if assume_straight_pages:
        self.crop_orientation_predictor = None
    else:
        self.crop_orientation_predictor = crop_orientation_predictor(pretrained=True)

    # The page-level predictor is created as soon as one of the orientation-related options asks for it
    needs_page_orientation = detect_orientation or straighten_pages or not assume_straight_pages
    if needs_page_orientation:
        self.page_orientation_predictor = page_orientation_predictor(pretrained=True)
    else:
        self.page_orientation_predictor = None

    self.doc_builder = DocumentBuilder(**kwargs)
    self.preserve_aspect_ratio = preserve_aspect_ratio
    self.symmetric_pad = symmetric_pad
    self.hooks: List[Callable] = []

def _general_page_orientations(
self,
pages: List[np.ndarray],
) -> List[Tuple[int, float]]:
_, classes, probs = zip(self.page_orientation_predictor(pages)) # type: ignore[misc]
# Flatten to list of tuples with (value, confidence)
page_orientations = [
(orientation, prob)
for page_classes, page_probs in zip(classes, probs)
for orientation, prob in zip(page_classes, page_probs)
]
return page_orientations

def _get_orientations(
self, pages: List[np.ndarray], seg_maps: List[np.ndarray]
) -> Tuple[List[Tuple[int, float]], List[int]]:
general_pages_orientations = self._general_page_orientations(pages)
origin_page_orientations = [
estimate_orientation(seq_map, general_orientation)
for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
]
return general_pages_orientations, origin_page_orientations

def _straighten_pages(
self,
pages: List[np.ndarray],
seg_maps: List[np.ndarray],
general_pages_orientations: Optional[List[Tuple[int, float]]] = None,
origin_pages_orientations: Optional[List[int]] = None,
) -> List[np.ndarray]:
general_pages_orientations = (
general_pages_orientations if general_pages_orientations else self._general_page_orientations(pages)
)
origin_pages_orientations = (
origin_pages_orientations
if origin_pages_orientations
else [
estimate_orientation(seq_map, general_orientation)
for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
]
)
return [
# We exapnd if the page is wider than tall and the angle is 90 or -90
rotate_image(page, angle, expand=page.shape[1] > page.shape[0] and abs(angle) == 90)
for page, angle in zip(pages, origin_pages_orientations)
]

@staticmethod
def _generate_crops(
pages: List[np.ndarray],
Expand Down
Loading

0 comments on commit a26bea5

Please sign in to comment.