Diffstat
 -rw-r--r--  TODO      |   0
 -rwxr-xr-x  dupekill  | 185
 2 files changed, 91 insertions, 94 deletions
diff --git a/TODO b/TODO
deleted file mode 100644
index e69de29..0000000
--- a/TODO
+++ /dev/null
diff --git a/dupekill b/dupekill
index d269b46..79d5c04 100755
--- a/dupekill
+++ b/dupekill
@@ -35,100 +35,74 @@ from optparse import OptionParser
 # to the list of files in the data list.
 def need_to_add(filepath, datalist):
     found = 0
-    if len(datalist) > 0:
-        for entry in datalist:
-            if entry[0] == filepath:
-                found = 1
-        if found == 1:
-            return False
-        else:
-            return True
-    else:
+    if len(datalist) == 0:
         return True
+    for entry in datalist:
+        if entry[0] == filepath:
+            found = 1
+
+    return found != 1
+
 def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
     if not os.path.isdir(path):
         print("Error: Unable to fetch directory to work with.")
         sys.exit(1)
-    else:
-        # Create the generator, create the hash list and the counters.
-        file_list = os.walk(path)
-        hashList = []
-        processed_files = 0
-        deleted_files = 0
-
-        for root, dirs, file in file_list:
-            ignore_dirs = ['.git', '.config']
-            for dir in ignore_dirs:
-                if recursive == True:
-                    # Recursion still needs to ignore certain dirs
-                    if dir in dirs:
-                        dirs.remove(dir)
-
-                else:
-                    # We don't need _any_ dirs if recursion's off!
-                    while dirs:
-                        dirs.pop()
-
-            for item in file:
-                # Set up a few variables we'll be needing.
-                filepath = os.path.join(root, item)
-                filesize = os.stat(filepath).st_size
-                deleted = False # We need this flag to determine state before adding to the list
-
-                # Funny, processed_files will always equal the index of the file
-                # in our list. :D We might not need it, though
-                processed_files += 1
-
-                if len(hashList) > 0:
-                    for entry in hashList:
-
-                        # ---- CHECK FILE NAME ----
-                        # We'll get a false positive if we process a file against itself.
-                        if filepath == entry[0]:
-                            continue
-
-                        # ---- CHECK FILESIZE ----
-                        if filesize == entry[1]:
-                            # File sizes match, so let's check the first 512 bytes
-                            filepreview = open(filepath, "rb").read(512)
-
-                            # Check the length of the entry, and only add information if it's not already there
-                            if len(entry) == 2:
-                                entry.append(open(entry[0], "rb").read(512))
-                        else:
-                            # The filesizes don't match, so they must be different; move to the next file
-                            continue
-
-                        # ---- CHECK FILE PREVIEW ----
-                        if len(entry) == 3 and entry[2] == filepreview:
-
-                            # If they match and the files are less than 512 bytes... we don't need to hash!
-                            if entry[1] < 512 and filesize < 512:
-                                # We KNOW there's a match now.
-                                deleted_files += 1
-                                deleted = True
-
-                                if not dry_run:
-                                    os.remove(filepath)
-
-                                if verbose:
-                                    print("DUPE:", filepath)
-                                break
-                            else:
-                                # Open the file in binary mode, to avoid UTF-8 errors.
-                                filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
-
-                                if len(entry) == 3:
-                                    entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
-                        else:
-                            # The previews don't match, so they're different
-                            continue
-
-                        # ---- CHECK FILE HASH ----
-                        if len(entry) == 4 and entry[3] == filehash:
+    # Create the generator, create the hash list and the counters.
+    file_list = os.walk(path)
+    hashList = []
+    processed_files = 0
+    deleted_files = 0
+
+    for root, dirs, file in file_list:
+        ignore_dirs = ['.git', '.config']
+        for dir in ignore_dirs:
+            if recursive == True:
+                # Recursion still needs to ignore certain dirs
+                if dir in dirs:
+                    dirs.remove(dir)
+
+            else:
+                # We don't need _any_ dirs if recursion's off!
+                while dirs:
+                    dirs.pop()
+
+        for item in file:
+            # Set up a few variables we'll be needing.
+            filepath = os.path.join(root, item)
+            filesize = os.stat(filepath).st_size
+            deleted = False # We need this flag to determine state before adding to the list
+
+            # Funny, processed_files will always equal the index of the file
+            # in our list. :D We might not need it, though
+            processed_files += 1
+
+            if len(hashList) > 0:
+                for entry in hashList:
+
+                    # ---- CHECK FILE NAME ----
+                    # We'll get a false positive if we process a file against itself.
+                    if filepath == entry[0]:
+                        continue
+
+                    # ---- CHECK FILESIZE ----
+                    if filesize != entry[1]:
+                        continue
+
+                    # File sizes match, so let's check the first 512 bytes
+                    filepreview = open(filepath, "rb").read(512)
+
+                    # Check the length of the entry, and only add information if it's not already there
+                    if len(entry) == 2:
+                        entry.append(open(entry[0], "rb").read(512))
+
+                    # ---- CHECK FILE PREVIEW ----
+                    if len(entry) == 3 and entry[2] == filepreview:
+
+                        # If they match and the files are less than 512 bytes... we don't need to hash!
+                        if entry[1] < 512 and filesize < 512:
                             # We KNOW there's a match now.
                             deleted_files += 1
                             deleted = True
@@ -139,19 +113,42 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
                             if verbose:
                                 print("DUPE:", filepath)
                             break
+                        else:
+                            # Open the file in binary mode, to avoid UTF-8 errors.
+                            filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
+
+                            if len(entry) == 3:
+                                entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
+
+                    else:
+                        # The previews don't match, so they're different
+                        continue
+
+                    # ---- CHECK FILE HASH ----
+                    if len(entry) == 4 and entry[3] == filehash:
+                        # We KNOW there's a match now.
+                        deleted_files += 1
+                        deleted = True
+
+                        if not dry_run:
+                            os.remove(filepath)
+
+                        if verbose:
+                            print("DUPE:", filepath)
+                        break
 
-                if need_to_add(filepath, hashList) and not deleted:
-                    hashList.append([filepath, filesize])
+            if need_to_add(filepath, hashList) and not deleted:
+                hashList.append([filepath, filesize])
 
-                    if verbose:
-                        print("FILE:", filepath)
+                if verbose:
+                    print("FILE:", filepath)
 
-        print()
+    print()
 
-        if dry_run:
-            print("DRY RUN ON. NO FILES WILL BE DELETED.")
+    if dry_run:
+        print("DRY RUN ON. NO FILES WILL BE DELETED.")
 
-        print(processed_files, "files processed,", deleted_files, "deleted.\n")
+    print(processed_files, "files processed,", deleted_files, "deleted.\n")
 
if __name__ == '__main__':
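
The refactor flattens dupekill's duplicate test into a guard-clause cascade: a candidate pair is rejected as early as possible on file size, then on a 512-byte preview, and only compared by full SHA-256 digest as a last resort. A minimal standalone sketch of that cascade for a single pair of paths (is_duplicate is a hypothetical helper written for this note, not code from the commit):

import hashlib
import os

def is_duplicate(path_a, path_b, preview_size=512):
    # Cheapest check first: different sizes can never be duplicates.
    size_a = os.stat(path_a).st_size
    if size_a != os.stat(path_b).st_size:
        return False

    # Same size, so compare the first preview_size bytes in binary mode.
    with open(path_a, "rb") as f:
        preview_a = f.read(preview_size)
    with open(path_b, "rb") as f:
        preview_b = f.read(preview_size)
    if preview_a != preview_b:
        return False

    # Files shorter than the preview were already compared in full --
    # the same shortcut the commit takes with entry[1] < 512.
    if size_a < preview_size:
        return True

    # Last resort: full SHA-256 digests, as the commit computes them.
    with open(path_a, "rb") as f:
        digest_a = hashlib.sha256(f.read()).hexdigest()
    with open(path_b, "rb") as f:
        digest_b = hashlib.sha256(f.read()).hexdigest()
    return digest_a == digest_b

Ordering the checks from cheapest to most expensive is the point of the cascade: most non-duplicates are rejected by the stat() call alone, and the full-file hash is computed only for pairs that already agree on size and preview.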