rtorrent-container/scripts/duplicate.py


from os import walk
from os.path import join as joinpath

import textdistance

# TODO: skip image files (jpeg) and handle duplicated folders as well
# Release tags stripped from filenames before comparing them
word_to_delete = ['french', 'bluray', 'hdlight', 'x264', '.mkv', '.mp4', 'subforced', '720p', '1080p',
                  'h265', 'brrip', 'bdrip', 'aac', 'vfi', 'vff', 'h264', 'am64', 'webrip', 'vost', '.mp3',
                  'mhdgz', 'ac3', 'pophd', '.avi', 'multi', 'notag', 'dvdrip', 'pal', 'vostfr', 'truefrench', 'tvrip']
def find_duplicates(rootdir, file_to_compare=None, path_file_to_compare=None):
    """Print candidate duplicate pairs found under rootdir.

    Called without file_to_compare, it walks rootdir and re-invokes itself once
    per file; each inner call then compares that file against every other file,
    so every matching pair is reported twice (A;B and B;A).
    """
    for path, dirs, files in walk(rootdir):
        for filename in files:
            if file_to_compare is not None:
                # Similarity of the cleaned-up names, between 0 and 1
                distance = textdistance.levenshtein.normalized_similarity(
                    better_name(file_to_compare),
                    better_name(filename))
                complete_path_to_compare = joinpath(path_file_to_compare, file_to_compare)
                complete_path = joinpath(path, filename)
                if distance > 0.7 and complete_path_to_compare != complete_path:
                    print(complete_path_to_compare + ";" + complete_path + ";" + str(distance))
            else:
                find_duplicates(rootdir, filename, path)
def better_name(filename):
    """Lowercase the filename and cut it off at the first release tag."""
    filename = filename.lower()
    position = len(filename)
    for word in word_to_delete:
        # A tag only counts when preceded by a space, a dot or an opening bracket
        position = update_position(position, filename.find(" " + word))
        position = update_position(position, filename.find("." + word))
        position = update_position(position, filename.find("[" + word))
    return filename[0:position]
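
# Worked example (illustrative filename, not from the original script): the loop
# above keeps the earliest tag position, so
#   better_name("Movie.2019.FRENCH.1080p.x264.mkv")  ->  "movie.2019"
# because ".french" (index 10) comes before ".1080p", ".x264" and ".mkv".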
def update_position(position, newposition):
    """Return the smaller of the two positions, ignoring -1 (tag not found)."""
    if newposition > -1 and position > newposition:
        return newposition
    else:
        return position
find_duplicates('/data/downloads/film/')
#find_duplicates('./test')
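
# Illustrative output (hypothetical paths, not from the original script): each
# candidate pair is printed as "path_a;path_b;similarity", e.g.
#   /data/downloads/film/A/Movie.2019.FRENCH.1080p.mkv;/data/downloads/film/B/movie.2019.multi.bluray.mkv;1.0
# and, because every file is compared against every other, the reversed pair
# (B;A) is printed as well.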