[orientation] page orientation improvements (#1553)

mindee · Jun 12, 2024 · a26bea5 · a26bea5
1 parent 4d9552b
commit a26bea5
Show file tree

Hide file tree

Showing 12 changed files with 217 additions and 72 deletions.
diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py
@@ -11,6 +11,8 @@
 import numpy as np
 from langdetect import LangDetectException, detect_langs
 
+from doctr.utils.geometry import rotate_image
+
 __all__ = ["estimate_orientation", "get_language", "invert_data_structure"]
 
 
@@ -29,42 +31,63 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
     return max(w / h, h / w)
 
 
-def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_lines: float = 5) -> int:
+def estimate_orientation(
+    img: np.ndarray,
+    general_page_orientation: Optional[Tuple[int, float]] = None,
+    n_ct: int = 70,
+    ratio_threshold_for_lines: float = 3,
+    min_confidence: float = 0.2,
+    lower_area: int = 100,
+) -> int:
     """Estimate the angle of the general document orientation based on the
      lines of the document and the assumption that they should be horizontal.
 
     Args:
     ----
         img: the img or bitmap to analyze (H, W, C)
+        general_page_orientation: the general orientation of the page (angle [0, 90, 180, 270 (-90)], confidence)
+            estimated by a model
         n_ct: the number of contours used for the orientation estimation
         ratio_threshold_for_lines: this is the ratio w/h used to discriminates lines
+        min_confidence: the minimum confidence to consider the general_page_orientation
+        lower_area: the minimum area of a contour to be considered
 
     Returns:
     -------
-        the angle of the general document orientation
+        the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
     """
     assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
-    max_value = np.max(img)
-    min_value = np.min(img)
-    if max_value <= 1 and min_value >= 0 or (max_value <= 255 and min_value >= 0 and img.shape[-1] == 1):
-        thresh = img.astype(np.uint8)
-    if max_value <= 255 and min_value >= 0 and img.shape[-1] == 3:
+    thresh = None
+    # Convert image to grayscale if necessary
+    if img.shape[-1] == 3:
         gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         gray_img = cv2.medianBlur(gray_img, 5)
         thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
-
-    # try to merge words in lines
-    (h, w) = img.shape[:2]
-    k_x = max(1, (floor(w / 100)))
-    k_y = max(1, (floor(h / 100)))
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
-    thresh = cv2.dilate(thresh, kernel, iterations=1)
+    else:
+        thresh = img.astype(np.uint8)  # type: ignore[assignment]
+
+    page_orientation, orientation_confidence = general_page_orientation or (None, 0.0)
+    if page_orientation and orientation_confidence >= min_confidence:
+        # We rotate the image to the general orientation which improves the detection
+        # No expand needed: the bitmap is already padded
+        thresh = rotate_image(thresh, -page_orientation)  # type: ignore
+    else:  # That's only required if we do not work on the detection models bin map
+        # try to merge words in lines
+        (h, w) = img.shape[:2]
+        k_x = max(1, (floor(w / 100)))
+        k_y = max(1, (floor(h / 100)))
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
+        thresh = cv2.dilate(thresh, kernel, iterations=1)
 
     # extract contours
     contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
-    # Sort contours
-    contours = sorted(contours, key=get_max_width_length_ratio, reverse=True)
+    # Filter & Sort contours
+    contours = sorted(
+        [contour for contour in contours if cv2.contourArea(contour) > lower_area],
+        key=get_max_width_length_ratio,
+        reverse=True,
+    )
 
     angles = []
     for contour in contours[:n_ct]:
@@ -75,10 +98,24 @@ def estimate_orientation(img: np.ndarray, n_ct: int = 50, ratio_threshold_for_li
             angles.append(angle - 90)
 
     if len(angles) == 0:
-        return 0  # in case no angles is found
+        estimated_angle = 0  # in case no angles is found
     else:
         median = -median_low(angles)
-        return round(median) if abs(median) != 0 else 0
+        estimated_angle = -round(median) if abs(median) != 0 else 0
+
+    # combine with the general orientation and the estimated angle
+    if page_orientation and orientation_confidence >= min_confidence:
+        # special case where the estimated angle is mostly wrong:
+        # case 1: - and + swapped
+        # case 2: estimated angle is completely wrong
+        # so in this case we prefer the general page orientation
+        if abs(estimated_angle) == abs(page_orientation):
+            return page_orientation
+        estimated_angle = estimated_angle if page_orientation == 0 else page_orientation + estimated_angle
+        if estimated_angle > 180:
+            estimated_angle -= 360
+
+    return estimated_angle  # return the clockwise angle (negative - left side rotation, positive - right side rotation)
 
 
 def rectify_crops(

diff --git a/doctr/models/classification/mobilenet/pytorch.py b/doctr/models/classification/mobilenet/pytorch.py
@@ -252,15 +252,18 @@ def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any)
     """MobileNetV3-Small architecture as described in
     `"Searching for MobileNetV3",
     <https://arxiv.org/pdf/1905.02244.pdf>`_.
+
     >>> import torch
     >>> from doctr.models import mobilenet_v3_small_page_orientation
     >>> model = mobilenet_v3_small_page_orientation(pretrained=False)
     >>> input_tensor = torch.rand((1, 3, 512, 512), dtype=torch.float32)
     >>> out = model(input_tensor)
+
     Args:
     ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the MobileNetV3 architecture
+
     Returns:
     -------
         a torch.nn.Module

diff --git a/doctr/models/classification/mobilenet/tensorflow.py b/doctr/models/classification/mobilenet/tensorflow.py
@@ -421,15 +421,18 @@ def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any)
     """MobileNetV3-Small architecture as described in
     `"Searching for MobileNetV3",
     <https://arxiv.org/pdf/1905.02244.pdf>`_.
+
     >>> import tensorflow as tf
     >>> from doctr.models import mobilenet_v3_small_page_orientation
     >>> model = mobilenet_v3_small_page_orientation(pretrained=False)
     >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
+
     Args:
     ----
         pretrained: boolean, True if model is pretrained
         **kwargs: keyword arguments of the MobileNetV3 architecture
+
     Returns:
     -------
         a keras.Model

diff --git a/doctr/models/kie_predictor/base.py b/doctr/models/kie_predictor/base.py
@@ -25,19 +25,25 @@ class _KIEPredictor(_OCRPredictor):
             accordingly. Doing so will improve performances for documents with page-uniform rotations.
         preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding)
         symmetric_pad: if True and preserve_aspect_ratio is True, pas the image symmetrically.
+        detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
+            page. Doing so will slightly deteriorate the overall latency.
         kwargs: keyword args of `DocumentBuilder`
     """
 
     crop_orientation_predictor: Optional[OrientationPredictor]
+    page_orientation_predictor: Optional[OrientationPredictor]
 
     def __init__(
         self,
         assume_straight_pages: bool = True,
         straighten_pages: bool = False,
         preserve_aspect_ratio: bool = True,
         symmetric_pad: bool = True,
+        detect_orientation: bool = False,
         **kwargs: Any,
     ) -> None:
-        super().__init__(assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs)
+        super().__init__(
+            assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, detect_orientation, **kwargs
+        )
 
         self.doc_builder: KIEDocumentBuilder = KIEDocumentBuilder(**kwargs)
diff --git a/doctr/models/kie_predictor/pytorch.py b/doctr/models/kie_predictor/pytorch.py
@@ -10,10 +10,10 @@
 from torch import nn
 
 from doctr.io.elements import Document
-from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
+from doctr.models._utils import get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import detach_scores, rotate_image
+from doctr.utils.geometry import detach_scores
 
 from .base import _KIEPredictor
 
@@ -55,7 +55,13 @@ def __init__(
         self.det_predictor = det_predictor.eval()  # type: ignore[attr-defined]
         self.reco_predictor = reco_predictor.eval()  # type: ignore[attr-defined]
         _KIEPredictor.__init__(
-            self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -83,19 +89,16 @@ def forward(
             for out_map in out_maps
         ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)  # type: ignore[arg-type]
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-            origin_page_orientations = (
-                origin_page_orientations
-                if self.detect_orientation
-                else [estimate_orientation(seq_map) for seq_map in seg_maps]
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]  # type: ignore[arg-type]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)  # type: ignore
             # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)
 

diff --git a/doctr/models/kie_predictor/tensorflow.py b/doctr/models/kie_predictor/tensorflow.py
@@ -9,10 +9,10 @@
 import tensorflow as tf
 
 from doctr.io.elements import Document
-from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
+from doctr.models._utils import get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import detach_scores, rotate_image
+from doctr.utils.geometry import detach_scores
 from doctr.utils.repr import NestedObject
 
 from .base import _KIEPredictor
@@ -56,7 +56,13 @@ def __init__(
         self.det_predictor = det_predictor
         self.reco_predictor = reco_predictor
         _KIEPredictor.__init__(
-            self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -83,19 +89,16 @@ def __call__(
             for out_map in out_maps
         ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-            origin_page_orientations = (
-                origin_page_orientations
-                if self.detect_orientation
-                else [estimate_orientation(seq_map) for seq_map in seg_maps]
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
             # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]
 

diff --git a/doctr/models/predictor/base.py b/doctr/models/predictor/base.py
@@ -8,10 +8,10 @@
 import numpy as np
 
 from doctr.models.builder import DocumentBuilder
-from doctr.utils.geometry import extract_crops, extract_rcrops
+from doctr.utils.geometry import extract_crops, extract_rcrops, rotate_image
 
-from .._utils import rectify_crops, rectify_loc_preds
-from ..classification import crop_orientation_predictor
+from .._utils import estimate_orientation, rectify_crops, rectify_loc_preds
+from ..classification import crop_orientation_predictor, page_orientation_predictor
 from ..classification.predictor import OrientationPredictor
 
 __all__ = ["_OCRPredictor"]
@@ -29,27 +29,83 @@ class _OCRPredictor:
             accordingly. Doing so will improve performances for documents with page-uniform rotations.
         preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding)
         symmetric_pad: if True and preserve_aspect_ratio is True, pas the image symmetrically.
+        detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
+            page. Doing so will slightly deteriorate the overall latency.
         **kwargs: keyword args of `DocumentBuilder`
     """
 
     crop_orientation_predictor: Optional[OrientationPredictor]
+    page_orientation_predictor: Optional[OrientationPredictor]
 
     def __init__(
         self,
         assume_straight_pages: bool = True,
         straighten_pages: bool = False,
         preserve_aspect_ratio: bool = True,
         symmetric_pad: bool = True,
+        detect_orientation: bool = False,
         **kwargs: Any,
     ) -> None:
         self.assume_straight_pages = assume_straight_pages
         self.straighten_pages = straighten_pages
         self.crop_orientation_predictor = None if assume_straight_pages else crop_orientation_predictor(pretrained=True)
+        self.page_orientation_predictor = (
+            page_orientation_predictor(pretrained=True)
+            if detect_orientation or straighten_pages or not assume_straight_pages
+            else None
+        )
         self.doc_builder = DocumentBuilder(**kwargs)
         self.preserve_aspect_ratio = preserve_aspect_ratio
         self.symmetric_pad = symmetric_pad
         self.hooks: List[Callable] = []
 
+    def _general_page_orientations(
+        self,
+        pages: List[np.ndarray],
+    ) -> List[Tuple[int, float]]:
+        _, classes, probs = zip(self.page_orientation_predictor(pages))  # type: ignore[misc]
+        # Flatten to list of tuples with (value, confidence)
+        page_orientations = [
+            (orientation, prob)
+            for page_classes, page_probs in zip(classes, probs)
+            for orientation, prob in zip(page_classes, page_probs)
+        ]
+        return page_orientations
+
+    def _get_orientations(
+        self, pages: List[np.ndarray], seg_maps: List[np.ndarray]
+    ) -> Tuple[List[Tuple[int, float]], List[int]]:
+        general_pages_orientations = self._general_page_orientations(pages)
+        origin_page_orientations = [
+            estimate_orientation(seq_map, general_orientation)
+            for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
+        ]
+        return general_pages_orientations, origin_page_orientations
+
+    def _straighten_pages(
+        self,
+        pages: List[np.ndarray],
+        seg_maps: List[np.ndarray],
+        general_pages_orientations: Optional[List[Tuple[int, float]]] = None,
+        origin_pages_orientations: Optional[List[int]] = None,
+    ) -> List[np.ndarray]:
+        general_pages_orientations = (
+            general_pages_orientations if general_pages_orientations else self._general_page_orientations(pages)
+        )
+        origin_pages_orientations = (
+            origin_pages_orientations
+            if origin_pages_orientations
+            else [
+                estimate_orientation(seq_map, general_orientation)
+                for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
+            ]
+        )
+        return [
+            # We expand if the page is wider than tall and the angle is 90 or -90
+            rotate_image(page, angle, expand=page.shape[1] > page.shape[0] and abs(angle) == 90)
+            for page, angle in zip(pages, origin_pages_orientations)
+        ]
+
     @staticmethod
     def _generate_crops(
         pages: List[np.ndarray],