# rtorrent-container/scripts/duplicate.py

from os import walk, remove, stat
from os.path import join as joinpath
import textdistance

# TODO: don't compare JPEG files; decide how folders should be handled.

# Lower-case release/encoding tags and file extensions. better_name() cuts a
# file name at the earliest occurrence of any of these, so that only the
# title part is compared. Each entry is listed once.
word_to_delete = [
    'french', 'bluray', 'hdlight', 'x264', '.mkv', '.mp4', 'subforced',
    '720p', '1080p', 'h265', 'brrip', 'bdrip', 'aac', 'vfi', 'vff', 'h264',
    'am64', 'webrip', 'vost', '.mp3', 'mhdgz', 'ac3', 'pophd', '.avi',
    'multi', 'notag', 'dvdrip', 'pal', 'vostfr', 'truefrench', 'tvrip',
]

def find_duplicates(rootdir,file_to_compare = None,path_file_to_compare=None):
    """Print pairs of files under ``rootdir`` whose cleaned names look alike.

    Names are normalized with ``better_name`` and compared with the
    normalized Levenshtein similarity from ``textdistance``. Every pair
    scoring above 0.7 is printed as ``reference_path;other_path;score``.

    Args:
        rootdir: Directory tree to scan (walked recursively).
        file_to_compare: Optional file name to use as the only reference.
            When omitted, every file found under ``rootdir`` is used as a
            reference in turn, so each matching pair is reported in both
            orders.
        path_file_to_compare: Directory containing ``file_to_compare``;
            required whenever ``file_to_compare`` is given.
    """
    # Walk the tree and normalize each name exactly once. Re-walking the
    # whole tree for every reference file repeats the same filesystem
    # traversal and name clean-up over and over for identical results.
    candidates = [
        (joinpath(path, filename), better_name(filename))
        for path, _dirs, files in walk(rootdir)
        for filename in files
    ]

    if file_to_compare is not None:
        references = [(
            joinpath(path_file_to_compare, file_to_compare),
            better_name(file_to_compare),
        )]
    else:
        references = candidates

    for reference_path, reference_name in references:
        for candidate_path, candidate_name in candidates:
            # A file is never reported as a duplicate of itself.
            if reference_path == candidate_path:
                continue
            similarity = textdistance.levenshtein.normalized_similarity(
                reference_name, candidate_name)
            if similarity > 0.7:
                print(";".join((reference_path, candidate_path, str(similarity))))

def better_name(filename, words=None):
    """Return the lower-cased title part of ``filename``.

    The name is cut at the earliest position where one of ``words`` appears
    directly after a space, a dot or an opening square bracket. Entries that
    already start with a dot (file extensions such as ``'.mkv'``) are also
    matched as written.

    Args:
        filename: File name to clean up.
        words: Lower-case tags/extensions to cut at. Defaults to the
            module-level ``word_to_delete`` list.

    Returns:
        The lower-cased name truncated before the first recognised tag, or
        the whole lower-cased name when no tag is found.
    """
    if words is None:
        words = word_to_delete
    filename = filename.lower()
    cut = len(filename)
    for word in words:
        needles = [" " + word, "." + word, "[" + word]
        if word.startswith("."):
            # Extension entries carry their own separator. Prefixing them
            # again only yields ' .mkv', '..mkv' and '[.mkv', which never
            # match an ordinary 'title.mkv', so search for them as-is too.
            needles.append(word)
        for needle in needles:
            found = filename.find(needle)
            # Keep the left-most hit; -1 means the needle is absent.
            if -1 < found < cut:
                cut = found
    return filename[:cut]

def update_position(position,newposition):
    """Return the smaller of the two cut positions.

    ``newposition`` is expected to come from ``str.find``: a negative value
    means "not found" and leaves ``position`` unchanged.
    """
    was_found = newposition > -1
    return newposition if was_found and newposition < position else position
# Run the scan only when executed as a script, so that importing this module
# does not trigger a walk of the download tree.
if __name__ == "__main__":
    # For a local trial run, point this at a test tree instead (e.g. './test').
    find_duplicates('/data/downloads/film/')