aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- TODO 0
-rwxr-xr-x dupekill 101
2 files changed, 85 insertions, 16 deletions
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/TODO
diff --git a/dupekill b/dupekill
index ff9e3b2..d269b46 100755
--- a/dupekill
+++ b/dupekill
@@ -31,6 +31,21 @@ from optparse import OptionParser
#
# You have been warned. >:3
+# Report whether filepath still has to be appended to datalist: returns False
+# when some entry's first element equals filepath, and True otherwise.
+def need_to_add(filepath, datalist):
+ found = 0  # becomes 1 once an entry whose first element is filepath is seen
+ if len(datalist) > 0:
+ for entry in datalist:  # scans every entry; it does not stop at the first hit
+ if entry[0] == filepath:  # element 0 of an entry is compared to the path
+ found = 1
+ if found == 1:
+ return False  # already recorded, so the caller should not add it again
+ else:
+ return True
+ else:
+ return True  # empty list: nothing recorded yet, so the file must be added
+
def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
if not os.path.isdir(path):
@@ -52,39 +67,93 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
dirs.remove(dir)
else:
- # While no recursion doesn't need _any_ dirs!
+ # We don't need _any_ dirs if recursion's off!
while dirs:
dirs.pop()
for item in file:
- checkedFile = open(os.path.join(root, item), "rb").read()
- hash = hashlib.sha256(checkedFile).hexdigest()
+ # Set up a few variables we'll be needing.
+ filepath = os.path.join(root, item)
+ filesize = os.stat(filepath).st_size
+ deleted = False # We need this flag to determine state before adding to the list
+
+ # processed_files counts every file visited, including dupes that are
+ # never added to hashList, so it is not a reliable index into that list.
processed_files += 1
-
- if len(hashList) > 0 and hash in hashList:
- # We want to count these, even if it's a dry run.
- deleted_files += 1
- if not dry_run:
- os.remove(os.path.join(root, item))
+ if len(hashList) > 0:
+ for entry in hashList:
- if verbose:
- print("Dupe", os.path.join(root, item), "found.")
+ # ---- CHECK FILE NAME ----
+ # We'll get a false positive if we process a file against itself.
+ if filepath == entry[0]:
+ continue
- else:
- hashList.append(hash)
+ # ---- CHECK FILESIZE ----
+ if filesize == entry[1]:
+ # File sizes match, so let's check the first 512 bytes
+ filepreview = open(filepath, "rb").read(512)
+
+ # Check the length of the entry, and only add information if it's not already there
+ if len(entry) == 2:
+ entry.append(open(entry[0], "rb").read(512))
+ else:
+ # The filesizes don't match, so they must be different; move to the next file
+ continue
+
+ # ---- CHECK FILE PREVIEW ----
+ if len(entry) == 3 and entry[2] == filepreview:
+
+ # If they match and the files are less than 512 bytes... we don't need to hash!
+ if entry[1] < 512 and filesize < 512:
+ # We KNOW there's a match now.
+ deleted_files += 1
+ deleted = True
+
+ if not dry_run:
+ os.remove(filepath)
+
+ if verbose:
+ print("DUPE:", filepath)
+ break
+ else:
+ # Open the file in binary mode, to avoid UTF-8 errors.
+ filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
+
+ if len(entry) == 3:
+ entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
+
+ else:
+ # The previews don't match, so they're different
+ continue
+
+ # ---- CHECK FILE HASH ----
+ if len(entry) == 4 and entry[3] == filehash:
+ # We KNOW there's a match now.
+ deleted_files += 1
+ deleted = True
+
+ if not dry_run:
+ os.remove(filepath)
+
+ if verbose:
+ print("DUPE:", filepath)
+ break
+
+ if need_to_add(filepath, hashList) and not deleted:
+ hashList.append([filepath, filesize])
if verbose:
- print("New file", os.path.join(root, item))
+ print("FILE:", filepath)
- # Print a summary
print()
if dry_run:
- print("THIS IS A DRY RUN! NO FILES WILL BE ALTERED!")
+ print("DRY RUN ON. NO FILES WILL BE DELETED.")
print(processed_files, "files processed,", deleted_files, "deleted.\n")
+
if __name__ == '__main__':
try:
usage = "Usage: %prog [options] {path}"