dupekill 1.3 completed

dupekill now checks files using a hierarchy of comparisons ordered by computing cost. It starts with file path and size, then compares the first 512 bytes, and only then hashes the file with SHA256. Speed increases are _huge_ compared to 1.2, and thus it deserves a new subversion number.
author: zlg <zlg@zlg.space> 2011-11-04 06:31:55 -0500
committer: zlg <zlg@zlg.space> 2018-06-25 22:13:52 -0700
commit: c7ff0cfed2e8e74b65a5e0561389d5572b321a4d (patch)
tree: edf6adf00d8c1f1ef5cca80aef2ec35934ef1f9b /dupekill
parent: dupekill 1.2! (diff)
download: dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.gz
dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.bz2
dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.xz
dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.zip
1 files changed, 85 insertions, 16 deletions
diff --git a/dupekill b/dupekill
index ff9e3b2..d269b46 100755
--- a/dupekill
+++ b/dupekill
@@ -31,6 +31,21 @@ from optparse import OptionParser
 #
 # You have been warned. >:3
 
+# This function determines whether or not the file needs to be added
+# to the list of files in the data list.
+def need_to_add(filepath, datalist):
+    found = 0
+    if len(datalist) > 0:
+        for entry in datalist:
+            if entry[0] == filepath:
+                found = 1
+        if found == 1:
+            return False
+        else:
+            return True
+    else:
+        return True
+
 def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
 
     if not os.path.isdir(path):
@@ -52,39 +67,93 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
                         dirs.remove(dir)
 
                 else:
-                    # While no recursion doesn't need _any_ dirs!
+                    # We don't need _any_ dirs if recursion's off!
                     while dirs:
                         dirs.pop()
 
             for item in file:
-                checkedFile = open(os.path.join(root, item), "rb").read()
-                hash = hashlib.sha256(checkedFile).hexdigest()
+                # Set up a few variables we'll be needing.
+                filepath = os.path.join(root, item)
+                filesize = os.stat(filepath).st_size
+                deleted = False # We need this flag to determine state before adding to the list
+
+                # Funny, processed_files will always equal the index of the file
+                # in our list. :D We might not need it, though
                 processed_files += 1
-                
-                if len(hashList) > 0 and hash in hashList:
-                    # We want to count these, even if it's a dry run.
-                    deleted_files += 1
 
-                    if not dry_run:
-                        os.remove(os.path.join(root, item))
+                if len(hashList) > 0:
+                    for entry in hashList:
 
-                    if verbose:
-                        print("Dupe", os.path.join(root, item), "found.")
+                        # ---- CHECK FILE NAME ----
+                        # We'll get a false positive if we process a file against itself.
+                        if filepath == entry[0]:
+                            continue
 
-                else:
-                    hashList.append(hash)
+                        # ---- CHECK FILESIZE ----
+                        if filesize == entry[1]:
+                            # File sizes match, so let's check the first 512 bytes
+                            filepreview = open(filepath, "rb").read(512)
+
+                            # Check the length of the entry, and only add information if it's not already there
+                            if len(entry) == 2:
+                                entry.append(open(entry[0], "rb").read(512))
+                        else:
+                            # The filesizes don't match, so they must be different; move to the next file
+                            continue
+
+                        # ---- CHECK FILE PREVIEW ----
+                        if len(entry) == 3 and entry[2] == filepreview:
+
+                            # If they match and the files are less than 512 bytes... we don't need to hash!
+                            if entry[1] < 512 and filesize < 512:
+                                # We KNOW there's a match now.
+                                deleted_files += 1
+                                deleted = True
+
+                                if not dry_run:
+                                    os.remove(filepath)
+
+                                if verbose:
+                                    print("DUPE:", filepath)
+                                    break
+                            else:
+                                # Open the file in binary mode, to avoid UTF-8 errors.
+                                filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
+
+                                if len(entry) == 3:
+                                    entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
+
+                        else:
+                            # The previews don't match, so they're different
+                            continue
+
+                        # ---- CHECK FILE HASH ----
+                        if len(entry) == 4 and entry[3] == filehash:
+                            # We KNOW there's a match now.
+                            deleted_files += 1
+                            deleted = True
+
+                            if not dry_run:
+                                os.remove(filepath)
+
+                            if verbose:
+                                print("DUPE:", filepath)
+                                break
+
+                if need_to_add(filepath, hashList) and not deleted:
+                    hashList.append([filepath, filesize])
 
                     if verbose:
-                        print("New file", os.path.join(root, item))
+                        print("FILE:", filepath)
 
-        # Print a summary
         print()
 
         if dry_run:
-            print("THIS IS A DRY RUN! NO FILES WILL BE ALTERED!")
+            print("DRY RUN ON. NO FILES WILL BE DELETED.")
 
         print(processed_files, "files processed,", deleted_files, "deleted.\n")
 
+
 if __name__ == '__main__':
     try:
         usage = "Usage: %prog [options] {path}"
author	zlg <zlg@zlg.space>	2011-11-04 06:31:55 -0500
committer	zlg <zlg@zlg.space>	2018-06-25 22:13:52 -0700
commit	c7ff0cfed2e8e74b65a5e0561389d5572b321a4d (patch)
tree	edf6adf00d8c1f1ef5cca80aef2ec35934ef1f9b /dupekill
parent	dupekill 1.2! (diff)
download	dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.gz dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.bz2 dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.tar.xz dupekill-c7ff0cfed2e8e74b65a5e0561389d5572b321a4d.zip