Skip to content

Commit

Permalink
[Fix] MJSynth dataset: filter corrupted or missing images (#956)
Browse files: browse the repository at this point in the history
* filter corrupted or missing images

* rename black_list to blacklist

* fix constant naming
  • Loading branch information
felixdittrich92 committed Jun 24, 2022
1 parent a2626a1 commit fc90306
Showing 1 changed file with 19 additions and 3 deletions.
22 changes: 19 additions & 3 deletions doctr/datasets/mjsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,21 @@ class MJSynth(AbstractDataset):
**kwargs: keyword arguments from `AbstractDataset`.
"""

# Entries of the MJSynth image list that are skipped while unpacking because
# the referenced image is corrupted or missing.
# Each entry keeps its leading './' and trailing newline: the unpacking loop
# tests the raw `path` value for membership before the path is stripped.
BLACKLIST = [
    './1881/4/225_Marbling_46673.jpg\n',
    './2069/4/192_whittier_86389.jpg\n',
    './869/4/234_TRIASSIC_80582.jpg\n',
    './173/2/358_BURROWING_10395.jpg\n',
    './913/4/231_randoms_62372.jpg\n',
    './596/2/372_Ump_81662.jpg\n',
    './936/2/375_LOCALITIES_44992.jpg\n',
    './2540/4/246_SQUAMOUS_73902.jpg\n',
    './1332/4/224_TETHERED_78397.jpg\n',
    './627/6/83_PATRIARCHATE_55931.jpg\n',
    './2013/2/370_refract_63890.jpg\n',
    './2911/6/77_heretical_35885.jpg\n',
    './1730/2/361_HEREON_35880.jpg\n',
    './2194/2/334_EFFLORESCENT_24742.jpg\n',
    './2025/2/364_SNORTERS_72304.jpg\n',
    './368/4/232_friar_30876.jpg\n',
    './275/6/96_hackle_34465.jpg\n',
    './384/4/220_bolts_8596.jpg\n',
    './905/4/234_Postscripts_59142.jpg\n',
    './2749/6/101_Chided_13155.jpg\n',
    './495/6/81_MIDYEAR_48332.jpg\n',
    './2852/6/60_TOILSOME_79481.jpg\n',
    './554/2/366_Teleconferences_77948.jpg\n',
    './1696/4/211_Queened_61779.jpg\n',
    './2128/2/369_REDACTED_63458.jpg\n',
    './2557/2/351_DOWN_23492.jpg\n',
    './2489/4/221_snored_72290.jpg\n',
    './1650/2/355_stony_74902.jpg\n',
    './1863/4/223_Diligently_21672.jpg\n',
    './264/2/362_FORETASTE_30276.jpg\n',
    './429/4/208_Mainmasts_46140.jpg\n',
    './1817/2/363_actuating_904.jpg\n',
]

def __init__(
self,
img_folder: str,
Expand All @@ -60,10 +75,11 @@ def __init__(
set_slice = slice(train_samples) if self.train else slice(train_samples, None)

for path in tqdm(iterable=img_paths[set_slice], desc='Unpacking MJSynth', total=len(img_paths[set_slice])):
label = [path.split('_')[1]]
img_path = os.path.join(img_folder, path[2:]).strip()
if path not in self.BLACKLIST:
label = [path.split('_')[1]]
img_path = os.path.join(img_folder, path[2:]).strip()

self.data.append((img_path, dict(labels=label)))
self.data.append((img_path, dict(labels=label)))

def extra_repr(self) -> str:
    """Return the extra text shown in the dataset's representation.

    Returns:
        A string of the form ``train=<value>`` built from ``self.train``.
    """
    summary = f"train={self.train}"
    return summary

0 comments on commit fc90306

Please sign in to comment.