about | summary | refs | log | tree | commit | diff
path: root/dupekill
diff options
context:
space:
mode:
author: zlg <zlg@zlg.space> 2011-11-04 06:49:59 -0500
committer: zlg <zlg@zlg.space> 2018-06-25 22:16:27 -0700
commit: 69b7e2d9613283ee06dcd28a405bb1ba5cbf72c5 (patch)
tree: 62f8d812bdacb255e9c5b2239bb4d30f7dcd3e6b /dupekill
parent: dupekill 1.3 completed (diff)
download: dupekill-69b7e2d9613283ee06dcd28a405bb1ba5cbf72c5.tar.gz
dupekill-69b7e2d9613283ee06dcd28a405bb1ba5cbf72c5.tar.bz2
dupekill-69b7e2d9613283ee06dcd28a405bb1ba5cbf72c5.tar.xz
dupekill-69b7e2d9613283ee06dcd28a405bb1ba5cbf72c5.zip
Corrected dupekill 1.3 tag
Diffstat (limited to '')
-rwxr-xr-x dupekill 185
1 files changed, 91 insertions, 94 deletions
diff --git a/dupekill b/dupekill
index d269b46..79d5c04 100755
--- a/dupekill
+++ b/dupekill
@@ -35,100 +35,74 @@ from optparse import OptionParser
# to the list of files in the data list.
def need_to_add(filepath, datalist):
found = 0
- if len(datalist) > 0:
- for entry in datalist:
- if entry[0] == filepath:
- found = 1
- if found == 1:
- return False
- else:
- return True
- else:
+ if len(datalist) == 0:
return True
+ for entry in datalist:
+ if entry[0] == filepath:
+ found = 1
+
+ return found != 1
+
def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
if not os.path.isdir(path):
print("Error: Unable to fetch directory to work with.")
sys.exit(1)
- else:
- # Create the generator, create the hash list and the counters.
- file_list = os.walk(path)
- hashList = []
- processed_files = 0
- deleted_files = 0
-
- for root, dirs, file in file_list:
- ignore_dirs = ['.git', '.config']
- for dir in ignore_dirs:
- if recursive == True:
- # Recursion still needs to ignore certain dirs
- if dir in dirs:
- dirs.remove(dir)
-
- else:
- # We don't need _any_ dirs if recursion's off!
- while dirs:
- dirs.pop()
-
- for item in file:
- # Set up a few variables we'll be needing.
- filepath = os.path.join(root, item)
- filesize = os.stat(filepath).st_size
- deleted = False # We need this flag to determine state before adding to the list
-
- # Funny, processed_files will always equal the index of the file
- # in our list. :D We might not need it, though
- processed_files += 1
-
- if len(hashList) > 0:
- for entry in hashList:
-
- # ---- CHECK FILE NAME ----
- # We'll get a false positive if we process a file against itself.
- if filepath == entry[0]:
- continue
-
- # ---- CHECK FILESIZE ----
- if filesize == entry[1]:
- # File sizes match, so let's check the first 512 bytes
- filepreview = open(filepath, "rb").read(512)
-
- # Check the length of the entry, and only add information if it's not already there
- if len(entry) == 2:
- entry.append(open(entry[0], "rb").read(512))
- else:
- # The filesizes don't match, so they must be different; move to the next file
- continue
-
- # ---- CHECK FILE PREVIEW ----
- if len(entry) == 3 and entry[2] == filepreview:
-
- # If they match and the files are less than 512 bytes... we don't need to hash!
- if entry[1] < 512 and filesize < 512:
- # We KNOW there's a match now.
- deleted_files += 1
- deleted = True
-
- if not dry_run:
- os.remove(filepath)
-
- if verbose:
- print("DUPE:", filepath)
- break
- else:
- # Open the file in binary mode, to avoid UTF-8 errors.
- filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
-
- if len(entry) == 3:
- entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
- else:
- # The previews don't match, so they're different
- continue
-
- # ---- CHECK FILE HASH ----
- if len(entry) == 4 and entry[3] == filehash:
+ # Create the generator, create the hash list and the counters.
+ file_list = os.walk(path)
+ hashList = []
+ processed_files = 0
+ deleted_files = 0
+
+ for root, dirs, file in file_list:
+ ignore_dirs = ['.git', '.config']
+ for dir in ignore_dirs:
+ if recursive == True:
+ # Recursion still needs to ignore certain dirs
+ if dir in dirs:
+ dirs.remove(dir)
+
+ else:
+ # We don't need _any_ dirs if recursion's off!
+ while dirs:
+ dirs.pop()
+
+ for item in file:
+ # Set up a few variables we'll be needing.
+ filepath = os.path.join(root, item)
+ filesize = os.stat(filepath).st_size
+ deleted = False # We need this flag to determine state before adding to the list
+
+ # Funny, processed_files will always equal the index of the file
+ # in our list. :D We might not need it, though
+ processed_files += 1
+
+ if len(hashList) > 0:
+ for entry in hashList:
+
+ # ---- CHECK FILE NAME ----
+ # We'll get a false positive if we process a file against itself.
+ if filepath == entry[0]:
+ continue
+
+ # ---- CHECK FILESIZE ----
+ if filesize != entry[1]:
+ continue
+
+ # File sizes match, so let's check the first 512 bytes
+ filepreview = open(filepath, "rb").read(512)
+
+ # Check the length of the entry, and only add information if it's not already there
+ if len(entry) == 2:
+ entry.append(open(entry[0], "rb").read(512))
+
+ # ---- CHECK FILE PREVIEW ----
+ if len(entry) == 3 and entry[2] == filepreview:
+
+ # If they match and the files are less than 512 bytes... we don't need to hash!
+ if entry[1] < 512 and filesize < 512:
# We KNOW there's a match now.
deleted_files += 1
deleted = True
@@ -139,19 +113,42 @@ def dupekill(dry_run=False, recursive=False, verbose=False, path=os.getcwd()):
if verbose:
print("DUPE:", filepath)
break
+ else:
+ # Open the file in binary mode, to avoid UTF-8 errors.
+ filehash = hashlib.sha256(open(filepath, "rb").read()).hexdigest()
+
+ if len(entry) == 3:
+ entry.append(hashlib.sha256(open(entry[0], "rb").read()).hexdigest())
+
+ else:
+ # The previews don't match, so they're different
+ continue
+
+ # ---- CHECK FILE HASH ----
+ if len(entry) == 4 and entry[3] == filehash:
+ # We KNOW there's a match now.
+ deleted_files += 1
+ deleted = True
+
+ if not dry_run:
+ os.remove(filepath)
+
+ if verbose:
+ print("DUPE:", filepath)
+ break
- if need_to_add(filepath, hashList) and not deleted:
- hashList.append([filepath, filesize])
+ if need_to_add(filepath, hashList) and not deleted:
+ hashList.append([filepath, filesize])
- if verbose:
- print("FILE:", filepath)
+ if verbose:
+ print("FILE:", filepath)
- print()
+ print()
- if dry_run:
- print("DRY RUN ON. NO FILES WILL BE DELETED.")
+ if dry_run:
+ print("DRY RUN ON. NO FILES WILL BE DELETED.")
- print(processed_files, "files processed,", deleted_files, "deleted.\n")
+ print(processed_files, "files processed,", deleted_files, "deleted.\n")
if __name__ == '__main__':
d>zlg1-2/+9 At present, user modification is needed to make these seamless. vgup() may need to be axed in favor of telling the user to make an alias. 2018-03-13Make VGSTASH_DB_LOCATION point to a filezlg2-21/+20 It used to point to a directory, which would then look for .vgstash.db. This behavior was kind of backwards and I don't remember why I did it that way. This change gives users more control over where they put their DB. Be sure to update your environment variable if you have it set! 2016-11-18Remove settings from helpers.shZe Libertine Gamer1-5/+0 Sourcing them in .bash_profile screws up login if they're set. 2016-11-15Correct phrasing in README.Ze Libertine Gamer1-4/+4 2016-11-13DerpZe Libertine Gamer1-0/+1 2016-11-03Improve error handling in shell scriptsZe Libertine Gamer4-3/+23 2016-10-24Correct run_again, add recursionZe Libertine Gamer1-0/+4 Loops and functions -- oh my, what a useful combination. :) 2016-10-21Add quotes to correct behavior for arglistZe Libertine Gamer1-1/+1 2016-10-14updater.sh: add recursion, error handlingZe Libertine Gamer1-43/+101 2016-10-14Correct pipe-handling behaviorZe Libertine Gamer1-1/+9 2016-10-12Clarify a method to move between platformsZe Libertine Gamer1-2/+5 Also correct a typo.