-
Notifications
You must be signed in to change notification settings - Fork 0
/
_removeDuplicateImages.py
119 lines (102 loc) · 3.59 KB
/
_removeDuplicateImages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# import the necessary packages
from imutils import paths
import numpy as np
import argparse
import cv2
import os
import dhash
from PIL import Image
"""
pip install dhash imutils
"""
from collections import OrderedDict
def my_dhash(image, hashSize=8):
    """Return the difference hash (dHash) of a BGR image as a Python int.

    The image is converted to grayscale and shrunk to ``hashSize`` rows by
    ``hashSize + 1`` columns. Each bit of the result records whether a pixel
    is brighter than the pixel immediately to its left.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # one extra column so every row yields exactly hashSize comparisons
    shrunk = cv2.resize(grayscale, (hashSize + 1, hashSize))
    # horizontal gradient: True where a pixel exceeds its left neighbour
    brighter_than_left = shrunk[:, 1:] > shrunk[:, :-1]
    # bit i of the hash is set when the i-th comparison (row-major order)
    # is True
    fingerprint = 0
    for position, is_brighter in enumerate(brighter_than_left.flatten()):
        if is_brighter:
            fingerprint += 2 ** position
    return fingerprint
def hash_images(image_paths):
    """Group image paths by their dhash value.

    Args:
        image_paths: iterable of image file paths.

    Returns:
        dict mapping each integer hash to the list of paths that produced
        it, in the order the paths were visited. Files that PIL cannot open
        or decode are reported and skipped instead of aborting the run.
    """
    hashes = {}
    for image_path in image_paths:
        try:
            # the context manager closes the underlying file handle even
            # when hashing fails
            with Image.open(image_path) as image:
                h = dhash.dhash_int(image, size=8)
        except OSError as err:
            print(f"[WARN] skipping unreadable image {image_path}: {err}")
            continue
        hashes.setdefault(h, []).append(image_path)
    return hashes


def merge_similar_hashes(hashes, threshold, distance=None):
    """Greedily merge hash buckets whose keys differ by few bits.

    Buckets are visited in insertion order. Each surviving key absorbs the
    paths of every other key whose distance to it is at most ``threshold``.
    Absorbed keys are dropped and never act as merge targets themselves, so
    grouping is relative to the surviving key and is not transitive.

    Args:
        hashes: dict mapping integer hash -> list of image paths.
        threshold: maximum distance for two hashes to be treated as the
            same image.
        distance: callable ``(hash_a, hash_b) -> int``. Defaults to
            ``dhash.get_num_bits_different``.

    Returns:
        A new dict mapping each surviving hash to its merged list of paths.
        The input mapping and its lists are not modified.
    """
    if distance is None:
        distance = dhash.get_num_bits_different
    groups = {key: list(bucket) for key, bucket in hashes.items()}
    for key in list(groups):
        # this key was already absorbed by an earlier one; it must not
        # become a merge target
        if key not in groups:
            continue
        for other in list(groups):
            if other != key and distance(other, key) <= threshold:
                groups[key].extend(groups.pop(other))
    return groups


def count_duplicates(groups):
    """Return the number of redundant images: bucket size minus one, summed
    over every bucket that holds more than one path."""
    return sum(len(bucket) - 1 for bucket in groups.values() if len(bucket) > 1)


def build_montage(image_paths):
    """Stack 150x150 thumbnails of the given images side by side.

    Returns the stacked array, or None when no image could be read.
    """
    tiles = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None rather than raising
        if image is None:
            print(f"[WARN] cannot load {image_path} for preview")
            continue
        tiles.append(cv2.resize(image, (150, 150)))
    return np.hstack(tiles) if tiles else None


def main(argv=None):
    """Find near-duplicate images in a directory and optionally delete them.

    Args:
        argv: argument list to parse; None means the process command line.

    With neither --remove nor --removeall set to 1 this is a dry run that
    only reports how many duplicates were found.
    """
    # construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset", required=True,
                    help="path to input dataset")
    ap.add_argument("-r", "--remove", type=int, default=0,
                    help="whether or not duplicates should be removed (i.e., dry run). Set 1 to activate this feature.")
    ap.add_argument("-ra", "--removeall", type=int, default=0,
                    help="whether or not all images that have duplicates, including original images, should be removed. Set 1 to activate this feature.")
    ap.add_argument("--threshold", type=int, default=10,
                    help="maximum number of differing hash bits for two images to count as duplicates.")
    ap.add_argument("--show", type=int, default=0,
                    help="during a dry run, display a montage of each duplicate group. Set 1 to activate this feature.")
    args = vars(ap.parse_args(argv))

    # grab the paths to all images in our input dataset directory
    imagePaths = list(paths.list_images(args["dataset"]))
    print(f"[INFO] computing image hashes... Found {len(imagePaths)} images")

    # bucket by exact hash, then fold together buckets that are close in
    # hamming distance
    groups = merge_similar_hashes(hash_images(imagePaths), args["threshold"])

    # loop over the image hashes
    for (h, hashedPaths) in groups.items():
        # only buckets with more than one image contain duplicates
        if len(hashedPaths) < 2:
            continue
        if args["remove"] == 1:
            # keep the first image of the group, delete the rest
            for p in hashedPaths[1:]:
                os.remove(p)
        elif args["removeall"] == 1:
            # delete every image of the group, the first one included
            for p in hashedPaths:
                os.remove(p)
        elif args["show"] == 1:
            # dry run with preview: needs a GUI-capable OpenCV build
            montage = build_montage(hashedPaths)
            if montage is not None:
                print("[INFO] hash: {}".format(h))
                cv2.imshow("Montage", montage)
                cv2.waitKey(0)

    print(f"Duplicates found {count_duplicates(groups)}")


if __name__ == "__main__":
    main()