#!/usr/bin/env python3
# dupekill - deletes duplicates of existing data
# Version 1.6 (2012-04-02)
# Written by zlg
# Original idea and code by NF
# License: WTFPL

import os
import hashlib
import sys
import stat
from optparse import OptionParser


def need_to_add(filepath, datalist):
    """Return True when *filepath* is not yet recorded in *datalist*.

    Each datalist entry is a list whose first element is a file path.
    """
    return all(entry[0] != filepath for entry in datalist)


def dupekill(dry_run=False, recursive=False, verbose=False, all_files=False,
             ignore_links=False, path=None):
    """Scan *path* for files with duplicate contents and delete the copies.

    Candidates are compared in three increasingly expensive stages:
    file size, the first 512 bytes, and finally a SHA-256 hash of the
    whole file (only needed for files larger than 512 bytes).

    Parameters:
        dry_run      -- report what would be deleted, delete nothing
        recursive    -- descend into subdirectories (skipping .git/.config)
        verbose      -- print notices/errors as well as dupes
                        (mutually exclusive with all_files)
        all_files    -- print every file processed
        ignore_links -- skip symlinks entirely
        path         -- directory to scan; defaults to the current working
                        directory at call time

    Calls sys.exit(1) on conflicting flags or a missing directory.
    """
    if all_files and verbose:
        print("Error: All operations (-a) or only important ones (-v), not both.")
        sys.exit(1)

    # Resolve the default here instead of in the signature, so the cwd is
    # read at call time rather than frozen at import time.
    if path is None:
        path = os.getcwd()
    if not os.path.isdir(path):
        print("Error: Unable to fetch directory to work with.")
        sys.exit(1)

    # Records grow lazily as comparisons require:
    # [path, size] -> [path, size, first-512-bytes] -> [..., sha256 hexdigest]
    seen = []
    processed_files = 0
    deleted_files = 0

    def _kill_duplicate(entry, filepath):
        """Delete one member of a confirmed duplicate pair and report it.

        Normally the newcomer *filepath* is deleted (a DUPE). When the
        recorded entry is a symlink and links are being processed, the
        symlink is removed instead (a CLASH), since *filepath* holds the
        actual data; the stale record is then dropped from *seen*.

        Returns True when *filepath* was the one deleted, False when the
        recorded entry was deleted instead (so the caller knows *filepath*
        survives and should be recorded).
        """
        clash = (not ignore_links
                 and stat.S_ISLNK(os.lstat(entry[0]).st_mode))
        if not dry_run:
            os.remove(entry[0] if clash else filepath)
        if verbose or all_files:
            if clash:
                print("CLASH:", entry[0])
                print(" with", filepath)
            else:
                print("DUPE:", filepath)
                print(" of", entry[0])
        if clash and not dry_run:
            # Fix: the original `del entry` only unbound the loop variable;
            # the dead record must actually leave the list.
            seen.remove(entry)
        return not clash

    ignore_dirs = ('.git', '.config')
    for root, dirs, files in os.walk(path):
        if recursive:
            # Prune ignored directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if d not in ignore_dirs]
        else:
            # Recursion is off: don't descend anywhere.
            dirs[:] = []

        for item in files:
            filepath = os.path.join(root, item)
            # lstat (not stat) so symlinks are classified as links.
            filemode = os.lstat(filepath).st_mode

            # Skip file types we shouldn't deal with: sockets, device
            # nodes, FIFOs, etc.
            if (stat.S_ISSOCK(filemode) or stat.S_ISCHR(filemode)
                    or stat.S_ISBLK(filemode) or stat.S_ISFIFO(filemode)):
                continue

            # NOTE(review): the original collapsed source lost the exact
            # nesting here; this follows its comment — ignored links are
            # skipped outright, processed links are announced and then
            # handled like regular candidates (enabling CLASH detection).
            if stat.S_ISLNK(filemode):
                if ignore_links:
                    continue
                if all_files:
                    print("LINK:", filepath)

            try:
                filesize = os.path.getsize(filepath)
            # Fix: in Python 3 IOError *is* OSError, so the original
            # second handler ("DEAD LINK") was unreachable. Distinguish
            # the cases with FileNotFoundError instead.
            except FileNotFoundError:
                # The path vanished (or can't be resolved) between
                # listing and stat'ing.
                if verbose or all_files:
                    print("NOT FOUND:", filepath)
                continue
            except OSError:
                # This occurs mostly with dead symlinks.
                if verbose or all_files:
                    print("DEAD LINK:", filepath)
                continue

            # No sense in tracking empty files — they'd all "match".
            if filesize == 0:
                continue

            # Check access up front; unreadable files can't be compared.
            if not os.access(filepath, os.R_OK):
                if verbose or all_files:
                    print("NOTICE: Cannot read from", filepath)
                continue
            if not os.access(filepath, os.W_OK):
                if verbose or all_files:
                    print("NOTICE: Cannot write to", filepath)

            processed_files += 1
            deleted = False

            for entry in seen:
                # A file trivially matches itself; skip it.
                if filepath == entry[0]:
                    continue
                # Cheapest test first: sizes must agree.
                if filesize != entry[1]:
                    continue

                # Sizes match — compare the first 512 bytes.
                with open(filepath, "rb") as fh:
                    filepreview = fh.read(512)
                # Cache the entry's preview the first time it's needed.
                if len(entry) == 2:
                    with open(entry[0], "rb") as fh:
                        entry.append(fh.read(512))

                # Fix: the original required len(entry) == 3 here, so an
                # entry that already had its hash cached (len 4) was
                # skipped and its later duplicates were missed.
                if entry[2] != filepreview:
                    continue

                # Files of at most 512 bytes are fully covered by the
                # preview, so a preview match IS a content match — no
                # hashing needed.
                if entry[1] < 513 and filesize < 513:
                    deleted_files += 1
                    deleted = _kill_duplicate(entry, filepath)
                    break

                # Larger files: confirm with a hash of the full contents.
                with open(filepath, "rb") as fh:
                    filehash = hashlib.sha256(fh.read()).hexdigest()
                # Cache the entry's hash the first time it's needed.
                if len(entry) == 3:
                    with open(entry[0], "rb") as fh:
                        entry.append(hashlib.sha256(fh.read()).hexdigest())

                if entry[3] == filehash:
                    deleted_files += 1
                    deleted = _kill_duplicate(entry, filepath)
                    break

            # Record survivors so later files can be compared against them.
            if need_to_add(filepath, seen) and not deleted:
                seen.append([filepath, filesize])
                if all_files:
                    print("FILE:", filepath)

    print()
    if dry_run:
        print("DRY RUN ON. NO FILES WILL BE DELETED.")
    print(processed_files, "files processed,", deleted_files, "deleted.\n")


if __name__ == '__main__':
    try:
        usage = "Usage: %prog [options] {path}"
        description = "Deletes files that have duplicate data in them"
        epilog = ("dupekill likes to munch on files. A lot. By default, "
                  "symlinks and hardlinks that point to the same file will "
                  "be deleted. Be careful!")
        version = "%prog version 1.6 (2012-06-13)"
        parser = OptionParser(usage=usage, description=description,
                              epilog=epilog, version=version)
        parser.add_option("-d", "--dry", dest='dry_run',
                          action='store_true', default=False,
                          help="don't delete any files")
        parser.add_option("-r", "--recursive", dest='recursive',
                          action='store_true', default=False,
                          help="recurse into all directories below the current directory")
        parser.add_option("-v", "--verbose", dest='verbose',
                          action='store_true', default=False,
                          help="provide more detailed output")
        parser.add_option("-a", "--all-files", dest='all_files',
                          action='store_true', default=False,
                          help="show all processed files, not just dupes and errors")
        parser.add_option("-i", "--ignore-links", dest="ignore_links",
                          action='store_true', default=False,
                          help="don't process symlinks")
        (options, args) = parser.parse_args()

        if args and os.path.isdir(args[0]):
            path = os.path.abspath(args[0])
        else:
            path = os.getcwd()

        dupekill(options.dry_run, options.recursive, options.verbose,
                 options.all_files, options.ignore_links, path)
    except KeyboardInterrupt:
        print("Aborted")