aboutsummaryrefslogtreecommitdiff
path: root/dupekill
diff options
context:
space:
mode:
authorzlg <zlg@zlg.space>2011-11-04 06:31:55 -0500
committerzlg <zlg@zlg.space>2018-06-25 22:13:52 -0700
commitc7ff0cfed2e8e74b65a5e0561389d5572b321a4d (patch)
treeedf6adf00d8c1f1ef5cca80aef2ec35934ef1f9b /dupekill
parentdupekill 1.2! (diff)
downloaddupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.gz
dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.bz2
dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.xz
dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.zip
dupekill 1.3 completed
dupekill now checks files based on a hierarchy of computing speed. It starts with file path and size, then checks the first 512 bytes, before finally hashing it with SHA256. Speed increases are _huge_ compared to 1.2, and thus it deserves a new subversion number.
Diffstat (limited to '')
-rwxr-xr-xdupekill101
1 files changed, 85 insertions, 16 deletions
diff --git a/dupekill b/dupekill
index ff9e3b2..d269b46 100755
--- a/dupekill
+++ b/dupekill
@@ -31,6 +31,21 @@ from optparse import OptionParser
#
# You have been warned. >:3
+# This function determines whether or not the file needs to be added
+# to the list of files in the data list.
+def need_to_add(filepath, datalist):
+ found = 0
+ if len(datalist) > 0:
+ for entry in datalist:
+ if entry[0] == filepath:
+ found = 1
+ if found == 1:
+ return False
+ else:
+ return True
+ else:
+ return True
+
def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
if not os.path.isdir(path):
@@ -52,39 +67,93 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
dirs.remove(dir)
else:
- # While no recursion doesn't need _any_ dirs!
+ # We don't need _any_ dirs if recursion's off!
while dirs:
dirs.pop()
for item in file:
- checkedFile = open(os.path.join(root, item), "rb").read()
- hash = hashlib.sha256(checkedFile).hexdigest()
+ # Set up a few variables we'll be needing.
+ filepath = os.path.join(root, item)
+ filesize = os.stat(filepath).st_size
+ deleted = False # We need this flag to determine state before adding to the list
+
+ # Funny, processed_files will always equal the index of the file
+ # in our list. :D We might not need it, though
processed_files += 1
-
- if len(hashList) > 0 and hash in hashList:
- # We want to count these, even if it's a dry run.
- deleted_files += 1
- if not dry_run:
- os.remove(os.path.join(root, item))
+ if len(hashList) > 0:
+ for entry in hashList:
- if verbose:
- print("Dupe", os.path.join(root, item), "found.")
+ # ---- CHECK FILE NAME ----
+ # We'll get a false positive if we process a file against itself.
+ if filepath == entry[0]:
+ continue
- else:
- hashList.append(hash)
+ # ---- CHECK FILESIZE ----
+ if filesize == entry[1]:
+ # File sizes match, so let's check the first 512 bytes
+ filepreview = open(filepath, "rb").read(512)
+
+ # Check the length of the entry, and only add information if it's not already there
+ if len(entry) == 2:
+ entry.append(open(entry[0], "rb").read(512))
+ else:
+ # The filesizes don't match, so they must be different; move to the next file
+ continue
+
+ # ---- CHECK FILE PREVIEW ----
+ if len(entry) == 3 and entry[2] == filepreview:
+
+ # If they match and the files are less than 512 bytes... we don't need to hash!
+ if entry[1] < 512 and filesize < 512:
+ # We KNOW there's a match now.
+ deleted_files += 1
+ deleted = True
+
+ if not dry_run:
+ os.remove(filepath)
+
+ if verbose:
+ print("DUPE:", filepath)
+ break
+ else:
+ # Open the file in binary mode, to avoid UTF-8 errors.
+ filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
+
+ if len(entry) == 3:
+ entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
+
+ else:
+ # The previews don't match, so they're different
+ continue
+
+ # ---- CHECK FILE HASH ----
+ if len(entry) == 4 and entry[3] == filehash:
+ # We KNOW there's a match now.
+ deleted_files += 1
+ deleted = True
+
+ if not dry_run:
+ os.remove(filepath)
+
+ if verbose:
+ print("DUPE:", filepath)
+ break
+
+ if need_to_add(filepath, hashList) and not deleted:
+ hashList.append([filepath, filesize])
if verbose:
- print("New file", os.path.join(root, item))
+ print("FILE:", filepath)
- # Print a summary
print()
if dry_run:
- print("THIS IS A DRY RUN! NO FILES WILL BE ALTERED!")
+ print("DRY RUN ON. NO FILES WILL BE DELETED.")
print(processed_files, "files processed,", deleted_files, "deleted.\n")
+
if __name__ == '__main__':
try:
usage = "Usage: %prog [options] {path}"