diff options
author | zlg <zlg@zlg.space> | 2011-11-04 06:31:55 -0500 |
---|---|---|
committer | zlg <zlg@zlg.space> | 2018-06-25 22:13:52 -0700 |
commit | c7ff0cfed2e8e74b65a5e0561389d5572b321a4d (patch) | |
tree | edf6adf00d8c1f1ef5cca80aef2ec35934ef1f9b | |
parent | dupekill 1.2! (diff) | |
download | dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.gz dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.bz2 dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.xz dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.zip |
dupekill 1.3 completed
dupekill now checks files in a hierarchy ordered by computing speed, cheapest
comparison first. It starts with file path and size, then checks the first 512
bytes, before finally hashing the file with SHA256. Speed increases are _huge_
compared to 1.2, and thus it deserves a new subversion number.
Diffstat (limited to '')
-rw-r--r-- | TODO | 0 | ||||
-rwxr-xr-x | dupekill | 101 |
2 files changed, 85 insertions, 16 deletions
@@ -31,6 +31,21 @@ from optparse import OptionParser # # You have been warned. >:3 +# This function determines whether or not the file needs to be added +# to the list of files in the data list. +def need_to_add(filepath, datalist): + found = 0 + if len(datalist) > 0: + for entry in datalist: + if entry[0] == filepath: + found = 1 + if found == 1: + return False + else: + return True + else: + return True + def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()): if not os.path.isdir(path): @@ -52,39 +67,93 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()): dirs.remove(dir) else: - # While no recursion doesn't need _any_ dirs! + # We don't need _any_ dirs if recursion's off! while dirs: dirs.pop() for item in file: - checkedFile = open(os.path.join(root, item), "rb").read() - hash = hashlib.sha256(checkedFile).hexdigest() + # Set up a few variables we'll be needing. + filepath = os.path.join(root, item) + filesize = os.stat(filepath).st_size + deleted = False # We need this flag to determine state before adding to the list + + # Funny, processed_files will always equal the index of the file + # in our list. :D We might not need it, though processed_files += 1 - - if len(hashList) > 0 and hash in hashList: - # We want to count these, even if it's a dry run. - deleted_files += 1 - if not dry_run: - os.remove(os.path.join(root, item)) + if len(hashList) > 0: + for entry in hashList: - if verbose: - print("Dupe", os.path.join(root, item), "found.") + # ---- CHECK FILE NAME ---- + # We'll get a false positive if we process a file against itself. 
+ if filepath == entry[0]: + continue - else: - hashList.append(hash) + # ---- CHECK FILESIZE ---- + if filesize == entry[1]: + # File sizes match, so let's check the first 512 bytes + filepreview = open(filepath, "rb").read(512) + + # Check the length of the entry, and only add information if it's not already there + if len(entry) == 2: + entry.append(open(entry[0], "rb").read(512)) + else: + # The filesizes don't match, so they must be different; move to the next file + continue + + # ---- CHECK FILE PREVIEW ---- + if len(entry) == 3 and entry[2] == filepreview: + + # If they match and the files are less than 512 bytes... we don't need to hash! + if entry[1] < 512 and filesize < 512: + # We KNOW there's a match now. + deleted_files += 1 + deleted = True + + if not dry_run: + os.remove(filepath) + + if verbose: + print("DUPE:", filepath) + break + else: + # Open the file in binary mode, to avoid UTF-8 errors. + filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest() + + if len(entry) == 3: + entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest()) + + else: + # The previews don't match, so they're different + continue + + # ---- CHECK FILE HASH ---- + if len(entry) == 4 and entry[3] == filehash: + # We KNOW there's a match now. + deleted_files += 1 + deleted = True + + if not dry_run: + os.remove(filepath) + + if verbose: + print("DUPE:", filepath) + break + + if need_to_add(filepath, hashList) and not deleted: + hashList.append([filepath, filesize]) if verbose: - print("New file", os.path.join(root, item)) + print("FILE:", filepath) - # Print a summary print() if dry_run: - print("THIS IS A DRY RUN! NO FILES WILL BE ALTERED!") + print("DRY RUN ON. NO FILES WILL BE DELETED.") print(processed_files, "files processed,", deleted_files, "deleted.\n") + if __name__ == '__main__': try: usage = "Usage: %prog [options] {path}" |