42 lines
1.8 KiB
Python
42 lines
1.8 KiB
Python
from os import walk, remove, stat
|
|
from os.path import join as joinpath
|
|
import textdistance
|
|
|
|
# TODO: don't compare JPEG files; decide what to do with folders.
|
|
|
|
# Release/encoding tags (lower case) that mark the end of the meaningful part
# of a file name. better_name() cuts a name at the earliest tag that follows
# a space, a dot or an opening bracket. Entries starting with "." are file
# extensions. Each tag is listed once; order does not matter because the
# earliest match in the file name wins.
word_to_delete = [
    'french',
    'bluray',
    'hdlight',
    'x264',
    '.mkv',
    '.mp4',
    'subforced',
    '720p',
    '1080p',
    'h265',
    'brrip',
    'bdrip',
    'aac',
    'vfi',
    'vff',
    'h264',
    'am64',
    'webrip',
    'vost',
    '.mp3',
    'mhdgz',
    'ac3',
    'pophd',
    '.avi',
    'multi',
    'notag',
    'dvdrip',
    'pal',
    'vostfr',
    'truefrench',
    'tvrip',
]
|
|
|
|
def find_duplicates(rootdir, file_to_compare=None, path_file_to_compare=None):
    """Print pairs of files under ``rootdir`` whose cleaned names look alike.

    Names are normalised with ``better_name`` and compared with the
    normalised Levenshtein similarity from ``textdistance``. Every pair
    scoring above 0.7 is printed as ``reference;candidate;score``.

    Args:
        rootdir: Directory tree to scan (walked recursively).
        file_to_compare: Optional single file name to use as the reference.
            When omitted, every file found under ``rootdir`` is used as a
            reference in turn, so each similar pair is reported once in
            each direction.
        path_file_to_compare: Directory containing ``file_to_compare``.
            When omitted, ``file_to_compare`` is used as the reference path
            as given.
    """
    # Walk the tree and normalise each name once; the list is reused for
    # every reference instead of re-walking the tree for each file.
    candidates = [
        (joinpath(path, filename), better_name(filename))
        for path, _dirs, files in walk(rootdir)
        for filename in files
    ]

    if file_to_compare is None:
        references = candidates
    else:
        if path_file_to_compare is None:
            # No directory supplied: joining with None would raise TypeError.
            reference_path = file_to_compare
        else:
            reference_path = joinpath(path_file_to_compare, file_to_compare)
        references = [(reference_path, better_name(file_to_compare))]

    similarity = textdistance.levenshtein.normalized_similarity
    for reference_path, reference_name in references:
        for candidate_path, candidate_name in candidates:
            # A file is never reported as a duplicate of itself.
            if reference_path == candidate_path:
                continue
            distance = similarity(reference_name, candidate_name)
            if distance > 0.7:
                print(reference_path + ";" + candidate_path + ";" + str(distance))
|
|
|
|
def better_name(filename):
    """Return the lower-cased file name cut at its first release tag.

    The name is truncated at the earliest occurrence of any entry of
    ``word_to_delete`` that is preceded by a space, a dot or an opening
    square bracket. Entries that already start with a dot (the file
    extensions such as ``.mkv``) are also matched as written; otherwise a
    plain ``movie.mkv`` would only match the never-occurring ``..mkv`` and
    keep its extension.

    Args:
        filename: File name (without directory) to normalise.

    Returns:
        The lower-cased prefix of ``filename`` before the first tag, or the
        whole lower-cased name when no tag is found.
    """
    filename = filename.lower()
    position = len(filename)
    for word in word_to_delete:
        if word.startswith("."):
            # Extension entries carry their own separator.
            position = update_position(position, filename.find(word))
        for separator in (" ", ".", "["):
            position = update_position(position, filename.find(separator + word))
    return filename[0:position]
|
|
|
|
def update_position(position, newposition):
    """Return the earlier of two offsets, ignoring a failed search.

    Args:
        position: Current cut-off offset.
        newposition: Candidate offset; better_name passes ``str.find``
            results here, where ``-1`` means "not found".

    Returns:
        ``newposition`` when it is greater than -1 and strictly smaller
        than ``position``; otherwise ``position`` unchanged.
    """
    found_earlier = -1 < newposition < position
    return newposition if found_earlier else position
|
|
# Run the scan only when executed as a script, so that importing this module
# (for example to reuse better_name) does not trigger a walk of the tree.
if __name__ == "__main__":
    # Point this at another directory (e.g. './test') to scan a different tree.
    find_duplicates('/data/downloads/film/')
|
|
|