refactor: Unified np_dtype and fix comments (#782)

* start synth
* cleanup
* start synth
* add synthtext
* add docu and tests
* apply code factor suggestions
* apply changes
* clean
* unify and replace wrong comments
* fix synth and apply request
mindee · Jan 5, 2022 · e14e643 · e14e643
1 parent 68e2120
commit e14e643
Show file tree

Hide file tree

Showing 14 changed files with 29 additions and 14 deletions.
diff --git a/doctr/datasets/cord.py b/doctr/datasets/cord.py
@@ -50,6 +50,7 @@ def __init__(
         tmp_root = os.path.join(self.root, 'image')
         self.data: List[Tuple[str, Dict[str, Any]]] = []
         self.train = train
+        np_dtype = np.float32
         for img_path in os.listdir(tmp_root):
             # File existence check
             if not os.path.exists(os.path.join(tmp_root, img_path)):
@@ -65,14 +66,15 @@ def __init__(
                             x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
                             y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
                             if use_polygons:
+                                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                 box = np.array([
                                     [x[0], y[0]],
                                     [x[1], y[1]],
                                     [x[2], y[2]],
                                     [x[3], y[3]],
-                                ], dtype=np.float32)
+                                ], dtype=np_dtype)
                             else:
-                                # Reduce 8 coords to 4
+                                # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax
                                 box = [min(x), min(y), max(x), max(y)]
                             _targets.append((word['text'], box))
 

diff --git a/doctr/datasets/detection.py b/doctr/datasets/detection.py
@@ -51,12 +51,13 @@ def __init__(
             labels = json.load(f)
 
         self.data: List[Tuple[str, np.ndarray]] = []
+        np_dtype = np.float32
         for img_name, label in labels.items():
             # File existence check
             if not os.path.exists(os.path.join(self.root, img_name)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
 
-            polygons = np.asarray(label['polygons'], dtype=np.float32)
+            polygons = np.asarray(label['polygons'], dtype=np_dtype)
             geoms = polygons if use_polygons else np.concatenate((polygons.min(axis=1), polygons.max(axis=1)), axis=1)
 
-            self.data.append((img_name, np.asarray(geoms, dtype=np.float32)))
+            self.data.append((img_name, np.asarray(geoms, dtype=np_dtype)))
diff --git a/doctr/datasets/doc_artefacts.py b/doctr/datasets/doc_artefacts.py
@@ -59,9 +59,11 @@ def __init__(
             if not os.path.exists(os.path.join(tmp_root, img_name)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_name)}")
 
+            # xmin, ymin, xmax, ymax
             boxes = np.asarray([obj['geometry'] for obj in label], dtype=np_dtype)
             classes = np.asarray([self.CLASSES.index(obj['label']) for obj in label], dtype=np.int64)
             if use_polygons:
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                 boxes = np.stack(
                     [
                         np.stack([boxes[:, 0], boxes[:, 1]], axis=-1),

diff --git a/doctr/datasets/funsd.py b/doctr/datasets/funsd.py
@@ -51,6 +51,7 @@ def __init__(
             **kwargs
         )
         self.train = train
+        np_dtype = np.float32
 
         # Use the subset
         subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data')
@@ -71,7 +72,7 @@ def __init__(
                         for word in block['words'] if len(word['text']) > 0]
             text_targets, box_targets = zip(*_targets)
             if use_polygons:
-                # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0
+                # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners
                 box_targets = [
                     [
                         [box[0], box[1]],
@@ -83,7 +84,7 @@ def __init__(
 
             self.data.append((
                 img_path,
-                dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=list(text_targets)),
+                dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)),
             ))
 
         self.root = tmp_root

diff --git a/doctr/datasets/ic03.py b/doctr/datasets/ic03.py
@@ -63,6 +63,7 @@ def __init__(
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, name.text)}")
 
             if use_polygons:
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                 _boxes = [
                     [
                         [float(rect.attrib['x']), float(rect.attrib['y'])],

diff --git a/doctr/datasets/ic13.py b/doctr/datasets/ic13.py
@@ -66,7 +66,7 @@ def __init__(
             # xmin, ymin, xmax, ymax
             box_targets = np.array([list(map(int, line[:4])) for line in _lines], dtype=np_dtype)
             if use_polygons:
-                # x_center, y_center, width, height, 0
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                 box_targets = np.array(
                     [
                         [

diff --git a/doctr/datasets/iiit5k.py b/doctr/datasets/iiit5k.py
@@ -70,7 +70,7 @@ def __init__(
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")
 
             if use_polygons:
-                # x_center, y_center, w, h, alpha = 0
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                 box_targets = [
                     [
                         [box[0], box[1]],
@@ -80,7 +80,7 @@ def __init__(
                     ] for box in box_targets
                 ]
             else:
-                # x, y, width, height -> xmin, ymin, xmax, ymax
+                # xmin, ymin, xmax, ymax
                 box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
 
             # label are casted to list where each char corresponds to the character's bounding box

diff --git a/doctr/datasets/ocr.py b/doctr/datasets/ocr.py
@@ -50,9 +50,10 @@ def __init__(
             if len(annotations["typed_words"]) == 0:
                 self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[])))
                 continue
-            # Unpack the straight boxes
+            # Unpack the straight boxes (xmin, ymin, xmax, ymax)
             geoms = [list(map(float, obj['geometry'][:4])) for obj in annotations['typed_words']]
             if use_polygons:
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                 geoms = [
                     [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]]  # type: ignore[list-item]
                     for geom in geoms

diff --git a/doctr/datasets/sroie.py b/doctr/datasets/sroie.py
@@ -62,11 +62,13 @@ def __init__(
                 _rows = [row for row in list(csv.reader(f, delimiter=',')) if len(row) > 0]
 
             labels = [",".join(row[8:]) for row in _rows]
-            # reorder coordinates (8 -> (4,2)) and filter empty lines
+            # reorder coordinates (8 -> (4,2) ->
+            # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines
             coords = np.stack([np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2))
                               for row in _rows], axis=0)
 
             if not use_polygons:
+                # xmin, ymin, xmax, ymax
                 coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1)
 
             self.data.append((img_path, dict(boxes=coords, labels=labels)))

diff --git a/doctr/datasets/svhn.py b/doctr/datasets/svhn.py
@@ -89,6 +89,7 @@ def __init__(
                 label_targets = list(map(str, box_dict['label']))
 
                 if use_polygons:
+                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                     box_targets = np.stack(
                         [
                             np.stack([coords[:, 0], coords[:, 1]], axis=-1),

diff --git a/doctr/datasets/svt.py b/doctr/datasets/svt.py
@@ -58,6 +58,7 @@ def __init__(
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, name.text)}")
 
             if use_polygons:
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                 _boxes = [
                     [
                         [float(rect.attrib['x']), float(rect.attrib['y'])],

diff --git a/doctr/datasets/synthtext.py b/doctr/datasets/synthtext.py
@@ -72,9 +72,12 @@ def __init__(
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path[0])}")
 
             labels = [elt for word in txt.tolist() for elt in word.split()]
-            word_boxes = word_boxes.transpose(2, 1, 0) if word_boxes.ndim == 3 else np.expand_dims(word_boxes, axis=0)
+            # (x, y) coordinates of top left, top right, bottom right, bottom left corners
+            word_boxes = word_boxes.transpose(2, 1, 0) if word_boxes.ndim == 3 else np.expand_dims(
+                word_boxes.transpose(1, 0), axis=0)
 
             if not use_polygons:
+                # xmin, ymin, xmax, ymax
                 word_boxes = np.concatenate((word_boxes.min(axis=1), word_boxes.max(axis=1)), axis=1)
 
             self.data.append((img_path[0], dict(boxes=np.asarray(word_boxes, dtype=np_dtype), labels=labels)))

diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py
@@ -18,7 +18,7 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly
     assert img.shape == (3, *input_size)
     assert img.dtype == torch.float32
     assert isinstance(target, dict)
-    assert isinstance(target['boxes'], np.ndarray)
+    assert isinstance(target['boxes'], np.ndarray) and target['boxes'].dtype == np.float32
     if is_polygons:
         assert target['boxes'].ndim == 3 and target['boxes'].shape[1:] == (4, 2)
     else:

diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py
@@ -18,7 +18,7 @@ def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_poly
     assert img.shape == (*input_size, 3)
     assert img.dtype == tf.float32
     assert isinstance(target, dict)
-    assert isinstance(target['boxes'], np.ndarray)
+    assert isinstance(target['boxes'], np.ndarray) and target['boxes'].dtype == np.float32
     if is_polygons:
         assert target['boxes'].ndim == 3 and target['boxes'].shape[1:] == (4, 2)
     else: