2 files changed, 85 insertions, 16 deletions
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/TODO
diff --git a/dupekill b/dupekill
index ff9e3b2..d269b46 100755
--- a/dupekill
+++ b/dupekill
@@ -31,6 +31,21 @@ from optparse import OptionParser
 #
 # You have been warned. >:3
 
+# Decide whether a file still has to be recorded in the data list.
+# Every entry of the list keeps its file path at index 0.
+def need_to_add(filepath, datalist):
+    """Return True when no entry in datalist is recorded for filepath.
+
+    An empty datalist always yields True.
+    """
+    for record in datalist:
+        known_path = record[0]
+        if known_path == filepath:
+            # Already tracked, so it must not be appended a second time.
+            return False
+
+    return True
+
 def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
 
     if not os.path.isdir(path):
@@ -52,39 +67,93 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
                         dirs.remove(dir)
 
                 else:
-                    # While no recursion doesn't need _any_ dirs!
+                    # We don't need _any_ dirs if recursion's off!
                     while dirs:
                         dirs.pop()
 
             for item in file:
-                checkedFile = open(os.path.join(root, item), "rb").read()
-                hash = hashlib.sha256(checkedFile).hexdigest()
+                # Gather what every comparison below needs for this file.
+                filepath = os.path.join(root, item)
+                filesize = os.stat(filepath).st_size
+                deleted = False # Set once the file is identified as a dupe, so it is not recorded below
+
+                # Count every file we examine, whether or not it turns out
+                # to be a dupe.
                 processed_files += 1
-                
-                if len(hashList) > 0 and hash in hashList:
-                    # We want to count these, even if it's a dry run.
-                    deleted_files += 1
 
-                    if not dry_run:
-                        os.remove(os.path.join(root, item))
+                if len(hashList) > 0:
+                    for entry in hashList:
 
-                    if verbose:
-                        print("Dupe", os.path.join(root, item), "found.")
+                        # ---- CHECK FILE NAME ----
+                        # We'll get a false positive if we process a file against itself.
+                        if filepath == entry[0]:
+                            continue
 
-                else:
-                    hashList.append(hash)
+                        # ---- CHECK FILESIZE ----
+                        if filesize == entry[1]:
+                            # File sizes match, so let's check the first 512 bytes
+                            filepreview = open(filepath, "rb").read(512)
+
+                            # Entries grow lazily: [path, size], then a 512-byte preview, then a sha256 hash
+                            if len(entry) == 2:
+                                entry.append(open(entry[0], "rb").read(512))
+                        else:
+                            # The filesizes don't match, so they must be different; move to the next file
+                            continue
+
+                        # ---- CHECK FILE PREVIEW ----
+                        # An entry that already carries a cached hash has length 4, so test ">= 3" here.
+                        if len(entry) >= 3 and entry[2] == filepreview:
+                            # If they match and the files are less than 512 bytes... we don't need to hash!
+                            if entry[1] < 512 and filesize < 512:
+                                # We KNOW there's a match now.
+                                deleted_files += 1
+                                deleted = True
+
+                                if not dry_run:
+                                    os.remove(filepath)
+                                if verbose:
+                                    print("DUPE:", filepath)
+                                # Always stop here: the file is dealt with and may no longer exist.
+                                break
+                            else:
+                                # Open the file in binary mode, to avoid UTF-8 errors.
+                                filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
+
+                                if len(entry) == 3:
+                                    entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
+
+                        else:
+                            # The previews don't match, so they're different
+                            continue
+
+                        # ---- CHECK FILE HASH ----
+                        if len(entry) == 4 and entry[3] == filehash:
+                            # We KNOW there's a match now.
+                            deleted_files += 1
+                            deleted = True
+
+                            if not dry_run:
+                                os.remove(filepath)
+                            if verbose:
+                                print("DUPE:", filepath)
+                            # Always stop here: the file is dealt with and may no longer exist.
+                            break
+
+                if need_to_add(filepath, hashList) and not deleted:
+                    hashList.append([filepath, filesize])
 
                     if verbose:
-                        print("New file", os.path.join(root, item))
+                        print("FILE:", filepath)
 
-        # Print a summary
         print()
 
         if dry_run:
-            print("THIS IS A DRY RUN! NO FILES WILL BE ALTERED!")
+            print("DRY RUN ON. NO FILES WILL BE DELETED.")
 
         print(processed_files, "files processed,", deleted_files, "deleted.\n")
 
+
 if __name__ == '__main__':
     try:
         usage = "Usage: %prog [options] {path}"