python 删除相似的图片
主要应用于制作数据集:将视频按帧截取为图片时,很难确定每隔多少帧取一张图片——取得太密,截取下来的图片过多,其中有很多是重复的;取得太稀疏,又会丢失部分有用信息。可以在采集时取得密集一些,之后再根据图片相似度进行判断,删除相似的帧。以下是代码。
# coding: utf-8
import os
import cv2
# from skimage.measure import compare_ssim
# from skimage.metrics import _structural_similarity
from skimage.metrics import structural_similarity as ssim
# import multiprocessing
import multiprocessing
import time
# import Queue
from tkinter import _flatten
def delete(filename1):
    """Remove the file at ``filename1`` from disk.

    Any error raised by ``os.remove`` (for example when the file does not
    exist) propagates to the caller unchanged.
    """
    target = filename1
    os.remove(target)
def _frame_sort_key(name):
    """Return a sort key that orders ``*_<a>_<b>.<ext>`` names numerically.

    The part of ``name`` before the first dot is split on underscores and the
    last two fields are read as integers ``(a, b)``. Names that do not follow
    that pattern sort after every numbered name, in alphabetical order.
    """
    fields = name.split('.')[0].split('_')
    try:
        return (0, int(fields[-2]), int(fields[-1]), name)
    except (IndexError, ValueError):
        return (1, 0, 0, name)


def list_all_files(root):
    """Recursively list the entries found under directory ``root``.

    Entries are visited in numeric frame order (see ``_frame_sort_key``), so
    ``x_1_2.jpg`` comes before ``x_1_10.jpg``; a plain alphabetical listing
    would put them the other way round.

    Args:
        root: Path of the directory to scan.

    Returns:
        A list with one item per directory entry: the joined path (``str``)
        for a regular file, or a nested list (the result of the recursive
        call) for a sub-directory. An empty directory yields ``[]``.

    Raises:
        OSError: If an entry is neither a regular file nor a directory, or if
            ``os.listdir`` itself fails.
    """
    files = []
    for name in sorted(os.listdir(root), key=_frame_sort_key):
        # os.listdir() returns bare names; join them with root before handing
        # them to os.path.isdir() / os.path.isfile().
        element = os.path.join(root, name)
        if os.path.isdir(element):
            files.append(list_all_files(element))
        elif os.path.isfile(element):
            files.append(element)
        else:
            print('err', element)
            # A str cannot be raised; raise a real exception carrying the path.
            raise OSError('not a regular file or directory: ' + element)
    print('2', files)
    return files
def simJr(count, left_index, right_index, img_files, q, interval):
    """Compare two frames and enqueue the next pair of indices to compare.

    The frame at ``left_index`` is the reference. When the frame at
    ``right_index`` is structurally similar to it (SSIM > 0.5) the right frame
    is deleted from disk and the next pair is ``(left_index, right_index + 1)``.
    Otherwise the right frame becomes the new reference and the next pair is
    ``(right_index, right_index + interval)``.

    Args:
        count: Not used by this function; kept for the existing call signature.
        left_index: Index into ``img_files`` of the reference frame.
        right_index: Index into ``img_files`` of the candidate frame.
        img_files: Sequence of image file paths.
        q: Queue-like object; the next ``[left_index, right_index]`` pair is
            passed to ``q.put``.
        interval: Step added to ``right_index`` after a dissimilar pair.

    Returns:
        None. When either index is past the end of ``img_files`` nothing is
        compared and nothing is enqueued.
    """
    total = len(img_files)
    if left_index >= total or right_index >= total:
        # Nothing left to compare on this chain; do not enqueue a follow-up.
        print('skip_out_of_range', left_index, right_index, total)
        return

    print('77img_files,left_index,', left_index, img_files[left_index])
    img = cv2.imread(img_files[left_index])
    img1 = cv2.imread(img_files[right_index])

    # cv2.imread yields None instead of raising when a file cannot be read
    # (e.g. it was already deleted). Step over such frames so that the chain
    # still enqueues its next pair.
    if img1 is None:
        print('skip_unreadable', right_index, img_files[right_index])
        right_index += 1
    elif img is None:
        print('skip_unreadable', left_index, img_files[left_index])
        left_index = right_index
        right_index += interval
    else:
        # Structural-similarity check over the colour channels.
        # NOTE(review): recent scikit-image releases document `channel_axis`
        # in place of `multichannel` - confirm against the installed version.
        ssim_value = ssim(img, img1, multichannel=True)
        print('67big_ssim:', left_index, right_index, img_files[left_index], img_files[right_index], ssim_value)
        if ssim_value > 0.5:
            delete(img_files[right_index])
            print('86delete', right_index, img_files[right_index])
            right_index += 1
        else:
            left_index = right_index
            right_index += interval
    q.put([left_index, right_index])
def q_clear(q):
    """Discard every item currently held in queue ``q``.

    Works with any object offering ``qsize()`` and ``get(timeout=...)`` that
    raises ``queue.Empty`` on timeout (``queue.Queue``,
    ``multiprocessing.Queue``).

    Args:
        q: The queue to drain.

    Returns:
        None.
    """
    from queue import Empty  # local import: the module does not import queue

    while True:
        try:
            pending = q.qsize()
        except NotImplementedError:
            # The multiprocessing docs note that qsize() may be unavailable on
            # some platforms (macOS); fall back to draining until Empty.
            pending = None
        if pending is not None and pending <= 0:
            return
        try:
            # Timed get: never block forever if the item counted by qsize()
            # was taken by someone else in the meantime.
            q.get(timeout=0.05)
        except Empty:
            if pending is None:
                return
if __name__ == '__main__':
    # Driver: repeatedly thin out near-duplicate frames under img_path, using
    # a growing comparison step (interval) on each pass.

    # Directory that holds the extracted frames; set this before running.
    img_path = r' '
    if not os.path.isdir(img_path):
        raise SystemExit('img_path is not an existing directory: %r' % img_path)

    q = multiprocessing.Queue()
    count = 0
    P_num = 10  # number of comparison chains seeded across the file list

    for interval in range(1, 50):
        try:
            # Flat list of full paths of every file below img_path.
            all_files = list(_flatten(list_all_files(img_path)))
            print('1', len(all_files), all_files)
            if len(all_files) < 2:
                # Fewer than two frames left: nothing can be compared.
                break
            img_files = all_files
            q_clear(q)

            # Seed P_num evenly spaced (left, right) pairs.
            for i in range(P_num):
                seed_left = int((len(all_files) / P_num) * i)
                seed = [seed_left, seed_left + interval]
                print('[int((len(all_files) / P_num) * i), int((len(all_files) / P_num) * i) + 1]', seed)
                q.put(seed)

            # Each worker compares one pair and enqueues the follow-up pair.
            # The pass ends as soon as a pair points past the end of the list;
            # that pair is not dispatched because it could only fail.
            while True:
                left_index, right_index = q.get()
                if right_index >= len(all_files):
                    break
                simJr_P = multiprocessing.Process(
                    target=simJr,
                    args=(count, left_index, right_index, img_files, q, interval),
                )
                simJr_P.start()
                print('q.qsize()', q.qsize(), 'count', count)
                print('122left_index, right_index', left_index, right_index)
                count += 1
        except Exception as exc:
            # Report the failure instead of hiding it; Ctrl-C is no longer
            # swallowed because only Exception subclasses are caught.
            print('err_overinterval', interval, repr(exc))
            # NOTE(review): the original indentation was lost; the long sleep
            # and the queue reset are assumed to belong to this error handler
            # - confirm the intended placement and duration.
            time.sleep(1000)
            q_clear(q)