# Reconstructed post-patch state of the section of the `dupekill` script
# touched by commit 69b7e2d9613283ee06dcd28a405bb1ba5cbf72c5
# (zlg, Fri, 4 Nov 2011 06:49:59 -0500, "Corrected dupekill 1.3 tag").
# The same commit also deleted the (empty) TODO file.

import hashlib
import os
import sys

# Directories that are never descended into, even when recursion is on.
_IGNORED_DIRS = ('.git', '.config')

# Number of leading bytes compared before paying for a full-content hash.
_PREVIEW_SIZE = 512

# Read size used while hashing, so large files are never held in memory whole.
_HASH_CHUNK_SIZE = 64 * 1024


# Entries in the data list are plain lists that grow lazily:
#   [path, size]  ->  [path, size, preview]  ->  [path, size, preview, sha256]


def need_to_add(filepath, datalist):
    """Return True when *filepath* is not yet recorded in *datalist*.

    *datalist* is a list of entries whose first element is a file path.
    An empty list always yields True.
    """
    return all(entry[0] != filepath for entry in datalist)


def _read_preview(filepath):
    """Return the first ``_PREVIEW_SIZE`` bytes of *filepath*."""
    # Binary mode avoids decoding errors on non-text files.
    with open(filepath, "rb") as handle:
        return handle.read(_PREVIEW_SIZE)


def _hash_file(filepath):
    """Return the hex SHA-256 digest of *filepath*, read in chunks."""
    digest = hashlib.sha256()
    with open(filepath, "rb") as handle:
        for chunk in iter(lambda: handle.read(_HASH_CHUNK_SIZE), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _preview_of(entry):
    """Return the entry's preview, reading and caching it on first use."""
    if len(entry) < 3:
        entry.append(_read_preview(entry[0]))
    return entry[2]


def _hash_of(entry):
    """Return the entry's SHA-256 digest, computing and caching it once."""
    # The preview must occupy slot 2 before the hash can land in slot 3.
    _preview_of(entry)
    if len(entry) < 4:
        entry.append(_hash_file(entry[0]))
    return entry[3]


def _same_content(candidate, entry):
    """Return True when two entries describe byte-identical files.

    Checks run cheapest first: size, then the leading preview, then the
    full hash. Each entry caches what it has already read, so an entry
    that was hashed during an earlier comparison is still compared
    correctly (the original skipped such entries and missed duplicates).
    """
    if candidate[1] != entry[1]:
        return False
    if _preview_of(candidate) != _preview_of(entry):
        return False
    if candidate[1] < _PREVIEW_SIZE:
        # The preview already covers the whole file; no hash needed.
        return True
    return _hash_of(candidate) == _hash_of(entry)


def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
    """Delete files under *path* whose content duplicates an earlier file.

    The first file seen with a given content is kept; later identical
    files are removed. A summary line is printed at the end.

    dry_run   -- when true, report and count duplicates without deleting.
    recursive -- when true, descend into subdirectories (except '.git'
                 and '.config'); otherwise only *path* itself is scanned.
    verbose   -- when true, print a "FILE:" or "DUPE:" line per file.
    path      -- directory to scan. The default is the working directory
                 captured when this function was defined, not when called.

    Exits the process with status 1 if *path* is not a directory.
    """
    if not os.path.isdir(path):
        print("Error: Unable to fetch directory to work with.")
        sys.exit(1)

    hashList = []
    processed_files = 0
    deleted_files = 0

    for root, dirs, files in os.walk(path):
        # os.walk honours in-place edits of `dirs` when walking top-down.
        if recursive:
            dirs[:] = [name for name in dirs if name not in _IGNORED_DIRS]
        else:
            del dirs[:]

        for item in files:
            filepath = os.path.join(root, item)
            candidate = [filepath, os.stat(filepath).st_size]
            processed_files += 1

            # Never compare a file against its own entry: that would be a
            # false positive.
            is_dupe = any(
                entry[0] != filepath and _same_content(candidate, entry)
                for entry in hashList
            )

            if is_dupe:
                # Counted even on a dry run, matching the reported total.
                deleted_files += 1

                if not dry_run:
                    os.remove(filepath)

                if verbose:
                    print("DUPE:", filepath)
            elif need_to_add(filepath, hashList):
                # The candidate keeps any preview/hash computed above.
                hashList.append(candidate)

                # NOTE(review): the collapsed patch text does not show
                # whether this print was nested under the branch above;
                # nesting is assumed so a file is never reported as both
                # DUPE and FILE -- confirm against the original script.
                if verbose:
                    print("FILE:", filepath)

    print()

    if dry_run:
        print("DRY RUN ON. NO FILES WILL BE DELETED.")

    print(processed_files, "files processed,", deleted_files, "deleted.\n")


# The script's `if __name__ == '__main__':` entry point follows here in the
# original file; its body lies outside the patch excerpt and is not
# reproduced.