diff options
Diffstat (limited to '')
-rw-r--r-- | TODO | 0 | ||||
-rwxr-xr-x | dupekill | 101 |
2 files changed, 85 insertions, 16 deletions
@@ -31,6 +31,21 @@ from optparse import OptionParser # # You have been warned. >:3 +# This function determines whether or not the file needs to be added +# to the list of files in the data list. +def need_to_add(filepath, datalist): + found = 0 + if len(datalist) > 0: + for entry in datalist: + if entry[0] == filepath: + found = 1 + if found == 1: + return False + else: + return True + else: + return True + def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()): if not os.path.isdir(path): @@ -52,39 +67,93 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()): dirs.remove(dir) else: - # While no recursion doesn't need _any_ dirs! + # We don't need _any_ dirs if recursion's off! while dirs: dirs.pop() for item in file: - checkedFile = open(os.path.join(root, item), "rb").read() - hash = hashlib.sha256(checkedFile).hexdigest() + # Set up a few variables we'll be needing. + filepath = os.path.join(root, item) + filesize = os.stat(filepath).st_size + deleted = False # We need this flag to determine state before adding to the list + + # Funny, processed_files will always equal the index of the file + # in our list. :D We might not need it, though processed_files += 1 - - if len(hashList) > 0 and hash in hashList: - # We want to count these, even if it's a dry run. - deleted_files += 1 - if not dry_run: - os.remove(os.path.join(root, item)) + if len(hashList) > 0: + for entry in hashList: - if verbose: - print("Dupe", os.path.join(root, item), "found.") + # ---- CHECK FILE NAME ---- + # We'll get a false positive if we process a file against itself. + if filepath == entry[0]: + continue - else: - hashList.append(hash) + # ---- CHECK FILESIZE ---- + if filesize == entry[1]: + # File sizes match, so let's check the first 512 bytes + filepreview = open(filepath, "rb").read(512) + + # Check the length of the entry, and only add information if it's not already there + if len(entry) == 2: + entry.append(open(entry[0], "rb").read(512)) + else: + # The filesizes don't match, so they must be different; move to the next file + continue + + # ---- CHECK FILE PREVIEW ---- + if len(entry) == 3 and entry[2] == filepreview: + + # If they match and the files are less than 512 bytes... we don't need to hash! + if entry[1] < 512 and filesize < 512: + # We KNOW there's a match now. + deleted_files += 1 + deleted = True + + if not dry_run: + os.remove(filepath) + + if verbose: + print("DUPE:", filepath) + break + else: + # Open the file in binary mode, to avoid UTF-8 errors. + filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest() + + if len(entry) == 3: + entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest()) + + else: + # The previews don't match, so they're different + continue + + # ---- CHECK FILE HASH ---- + if len(entry) == 4 and entry[3] == filehash: + # We KNOW there's a match now. + deleted_files += 1 + deleted = True + + if not dry_run: + os.remove(filepath) + + if verbose: + print("DUPE:", filepath) + break + + if need_to_add(filepath, hashList) and not deleted: + hashList.append([filepath, filesize]) if verbose: - print("New file", os.path.join(root, item)) + print("FILE:", filepath) - # Print a summary print() if dry_run: - print("THIS IS A DRY RUN! NO FILES WILL BE ALTERED!") + print("DRY RUN ON. NO FILES WILL BE DELETED.") print(processed_files, "files processed,", deleted_files, "deleted.\n") + if __name__ == '__main__': try: usage = "Usage: %prog [options] {path}" |