From 69b7e2d9613283ee06dcd28a405bb1ba5cbf72c5 Mon Sep 17 00:00:00 2001
From: zlg <zlg@zlg.space>
Date: Fri, 4 Nov 2011 06:49:59 -0500
Subject: Corrected dupekill 1.3 tag

---
 TODO     |   0
 dupekill | 185 +++++++++++++++++++++++++++++++--------------------------------
 2 files changed, 91 insertions(+), 94 deletions(-)
 delete mode 100644 TODO

diff --git a/TODO b/TODO
deleted file mode 100644
index e69de29..0000000
diff --git a/dupekill b/dupekill
index d269b46..79d5c04 100755
--- a/dupekill
+++ b/dupekill
@@ -35,100 +35,74 @@ from optparse import OptionParser
 # to the list of files in the data list.
 def need_to_add(filepath, datalist):
     found = 0
-    if len(datalist) > 0:
-        for entry in datalist:
-            if entry[0] == filepath:
-                found = 1
-        if found == 1:
-            return False
-        else:
-            return True
-    else:
+    if len(datalist) == 0:
         return True
 
+    for entry in datalist:
+        if entry[0] == filepath:
+            found = 1
+
+    return found != 1
+
 def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
 
     if not os.path.isdir(path):
         print("Error: Unable to fetch directory to work with.")
         sys.exit(1)
-    else:
-        # Create the generator, create the hash list and the counters.
-        file_list = os.walk(path)
-        hashList = []
-        processed_files = 0
-        deleted_files = 0
-
-        for root, dirs, file in file_list:
-            ignore_dirs = ['.git', '.config']
-            for dir in ignore_dirs:
-                if recursive == True:
-                    # Recursion still needs to ignore certain dirs
-                    if dir in dirs:
-                        dirs.remove(dir)
-
-                else:
-                    # We don't need _any_ dirs if recursion's off!
-                    while dirs:
-                        dirs.pop()
-
-            for item in file:
-                # Set up a few variables we'll be needing.
-                filepath = os.path.join(root, item)
-                filesize = os.stat(filepath).st_size
-                deleted = False # We need this flag to determine state before adding to the list
-
-                # Funny, processed_files will always equal the index of the file
-                # in our list. :D We might not need it, though
-                processed_files += 1
-
-                if len(hashList) > 0:
-                    for entry in hashList:
-
-                        # ---- CHECK FILE NAME ----
-                        # We'll get a false positive if we process a file against itself.
-                        if filepath == entry[0]:
-                            continue
-
-                        # ---- CHECK FILESIZE ----
-                        if filesize == entry[1]:
-                            # File sizes match, so let's check the first 512 bytes
-                            filepreview = open(filepath, "rb").read(512)
-
-                            # Check the length of the entry, and only add information if it's not already there
-                            if len(entry) == 2:
-                                entry.append(open(entry[0], "rb").read(512))
-                        else:
-                            # The filesizes don't match, so they must be different; move to the next file
-                            continue
-
-                        # ---- CHECK FILE PREVIEW ----
-                        if len(entry) == 3 and entry[2] == filepreview:
-
-                            # If they match and the files are less than 512 bytes... we don't need to hash!
-                            if entry[1] < 512 and filesize < 512:
-                                # We KNOW there's a match now.
-                                deleted_files += 1
-                                deleted = True
-
-                                if not dry_run:
-                                    os.remove(filepath)
-
-                                if verbose:
-                                    print("DUPE:", filepath)
-                                    break
-                            else:
-                                # Open the file in binary mode, to avoid UTF-8 errors.
-                                filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
-
-                                if len(entry) == 3:
-                                    entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
 
-                        else:
-                            # The previews don't match, so they're different
-                            continue
-
-                        # ---- CHECK FILE HASH ----
-                        if len(entry) == 4 and entry[3] == filehash:
+    # Create the generator, create the hash list and the counters.
+    file_list = os.walk(path)
+    hashList = []
+    processed_files = 0
+    deleted_files = 0
+
+    for root, dirs, file in file_list:
+        ignore_dirs = ['.git', '.config']
+        for dir in ignore_dirs:
+            if recursive == True:
+                # Recursion still needs to ignore certain dirs
+                if dir in dirs:
+                    dirs.remove(dir)
+
+            else:
+                # We don't need _any_ dirs if recursion's off!
+                while dirs:
+                    dirs.pop()
+
+        for item in file:
+            # Set up a few variables we'll be needing.
+            filepath = os.path.join(root, item)
+            filesize = os.stat(filepath).st_size
+            deleted = False # We need this flag to determine state before adding to the list
+
+            # Funny, processed_files will always equal the index of the file
+            # in our list. :D We might not need it, though
+            processed_files += 1
+
+            if len(hashList) > 0:
+                for entry in hashList:
+
+                    # ---- CHECK FILE NAME ----
+                    # We'll get a false positive if we process a file against itself.
+                    if filepath == entry[0]:
+                        continue
+
+                    # ---- CHECK FILESIZE ----
+                    if filesize != entry[1]:
+                        continue
+
+                    # File sizes match, so let's check the first 512 bytes
+                    filepreview = open(filepath, "rb").read(512)
+
+                    # Check the length of the entry, and only add information if it's not already there
+                    if len(entry) == 2:
+                        entry.append(open(entry[0], "rb").read(512))
+
+                    # ---- CHECK FILE PREVIEW ----
+                    if len(entry) == 3 and entry[2] == filepreview:
+
+                        # If they match and the files are less than 512 bytes... we don't need to hash!
+                        if entry[1] < 512 and filesize < 512:
                             # We KNOW there's a match now.
                             deleted_files += 1
                             deleted = True
@@ -139,19 +113,42 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
                             if verbose:
                                 print("DUPE:", filepath)
                                 break
+                        else:
+                            # Open the file in binary mode, to avoid UTF-8 errors.
+                            filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
+
+                            if len(entry) == 3:
+                                entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
+
+                    else:
+                        # The previews don't match, so they're different
+                        continue
+
+                    # ---- CHECK FILE HASH ----
+                    if len(entry) == 4 and entry[3] == filehash:
+                        # We KNOW there's a match now.
+                        deleted_files += 1
+                        deleted = True
+
+                        if not dry_run:
+                            os.remove(filepath)
+
+                        if verbose:
+                            print("DUPE:", filepath)
+                            break
 
-                if need_to_add(filepath, hashList) and not deleted:
-                    hashList.append([filepath, filesize])
+            if need_to_add(filepath, hashList) and not deleted:
+                hashList.append([filepath, filesize])
 
-                    if verbose:
-                        print("FILE:", filepath)
+                if verbose:
+                    print("FILE:", filepath)
 
-        print()
+    print()
 
-        if dry_run:
-            print("DRY RUN ON. NO FILES WILL BE DELETED.")
+    if dry_run:
+        print("DRY RUN ON. NO FILES WILL BE DELETED.")
 
-        print(processed_files, "files processed,", deleted_files, "deleted.\n")
+    print(processed_files, "files processed,", deleted_files, "deleted.\n")
 
 
 if __name__ == '__main__':
-- 
cgit v1.2.3-54-g00ecf