CV系列之用图像相似度去重
本帖最后由 Andrew 于 2023-9-14 01:59 PM 编辑。当我们处理一批图片时,其中可能存在重复的图片。下面分享一个具备图片相似度去重功能的脚本,欢迎大家对该脚本进行改进。
import os
import cv2
from PIL import Image
import imagehash
import shutil
import numpy as np
from tqdm import tqdm
def calculate_phash(image):
    """Return the perceptual hash (pHash) of *image* as a string.

    The array is converted from BGR to RGB channel order with
    ``cv2.cvtColor``, wrapped in a PIL image, and hashed with
    ``imagehash.phash``.
    """
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return str(imagehash.phash(Image.fromarray(rgb_pixels)))
def calculate_mse(image1, image2):
    """Return the mean squared error between two images.

    Args:
        image1: Array-like of pixel values.
        image2: Array-like of pixel values with a shape that broadcasts
            against ``image1``.

    Returns:
        The mean of the squared element-wise differences.
    """
    # Widen to float64 before subtracting: on unsigned 8-bit arrays the
    # subtraction and the squaring both wrap around modulo 256, which makes
    # very different images look almost identical.
    difference = np.asarray(image1, dtype=np.float64) - np.asarray(image2, dtype=np.float64)
    return (difference ** 2).mean()
def _load_image(path):
    """Decode the image file at *path*; return ``None`` if it cannot be read."""
    try:
        # Read the raw bytes and decode them in memory instead of calling
        # cv2.imread (presumably so that non-ASCII paths work -- confirm).
        raw_bytes = np.fromfile(path, dtype=np.uint8)
    except OSError as e:
        print('erro:', e)
        return None
    if raw_bytes.size == 0:
        print('erro:', 'empty file', path)
        return None
    decoded = cv2.imdecode(raw_bytes, -1)
    if decoded is None:
        print('erro:', 'cannot decode', path)
    return decoded


def move_similar_images(source_folder, destination_folder, near_num=10, mse_thred = 100):
    """Move images that closely match one of their next neighbours.

    The image files in ``source_folder`` are sorted by path. Each image is
    compared with up to ``near_num`` images that follow it in that order;
    as soon as one of them has the same shape and a mean squared error
    below ``mse_thred``, the current image is moved into
    ``destination_folder`` and the scan continues with the next image.

    Args:
        source_folder: Folder that is scanned (not recursively) for images.
        destination_folder: Folder that receives the images judged to be
            duplicates; it is created when missing.
        near_num: How many following images each image is compared with.
        mse_thred: Two images count as similar when their mean squared
            error is strictly below this value.
    """
    # NOTE(review): the expression that built this list was lost from the
    # original post; filtering on common image extensions is a
    # reconstruction -- adjust the tuple to the data set at hand.
    image_exts = ('.jpg', '.jpeg', '.png', '.bmp')
    image_paths = sorted(
        os.path.join(source_folder, file_name)
        for file_name in os.listdir(source_folder)
        if file_name.lower().endswith(image_exts)
    )
    os.makedirs(destination_folder, exist_ok=True)
    # A negative window would otherwise turn into a slice counted from the
    # end of the list.
    window = max(near_num, 0)

    # The context manager closes the progress bar even when an error escapes.
    with tqdm(total=len(image_paths), unit='files', desc='processing files') as progress_bar:
        for i, current_path in enumerate(image_paths):
            progress_bar.set_postfix(file=i)
            progress_bar.update(1)
            current_image = _load_image(current_path)
            if current_image is None:
                continue
            # Cast once per image so unsigned 8-bit pixel differences
            # cannot wrap around inside the error computation.
            current_pixels = current_image.astype(np.float64)
            # Slicing clips at the end of the list, so the trailing images
            # are still compared with whatever neighbours remain.
            for next_path in image_paths[i + 1:i + window + 1]:
                next_image = _load_image(next_path)
                if next_image is None or current_image.shape != next_image.shape:
                    continue
                mse = calculate_mse(current_pixels, next_image.astype(np.float64))
                if mse < mse_thred:
                    destination_path = os.path.join(destination_folder, os.path.basename(current_path))
                    shutil.move(current_path, destination_path)
                    break
if __name__ == '__main__':
    # Folder to scan, and the sibling folder (same path plus a '_chongfu'
    # suffix) that receives the images judged to be duplicates.
    source_root = r'F:\data\xxx'
    duplicates_root = source_root + '_chongfu'
    folder_missing = not os.path.exists(duplicates_root)
    if folder_missing:
        os.makedirs(duplicates_root)
    # Move similar images out of the source folder.
    move_similar_images(
        source_root,
        duplicates_root,
        near_num=20,
        mse_thred=70,
    )
除了计算哈希值,也可以通过一些网络模型提取特征向量来进行比对;相似度还可以用余弦相似度来度量:先对特征向量做 L2 归一化,再计算向量内积即可。
页:
[1]