git @ Cat's Eye Technologies Space-Madness / master bin / find-unique
master

Tree @master (Download .tar.gz)

find-unique @master · raw · history · blame

#!/usr/bin/env python

"""
Find unique files.

Usage: find-unique src-dir {dest-dir}

List all files in src-dir which are not to be found anywhere (under any name)
in any of the dest-dirs.

src-dir may be somewhere inside one of the dest-dirs.  src-dir will not be
traversed during dest-dir traversal.  (really?  doesn't always seem to work)

"""

import hashlib
import os
import re
import sys
from optparse import OptionParser


def md5(filename):
    """Compute and return the MD5 hash of the named file.

    The file is read in binary mode, one fixed-size chunk at a time, so the
    digest reflects the exact bytes on disk and a large file is never held
    in memory whole.

    filename -- path of the file to hash.

    Returns the digest as a hexadecimal string.  Any error raised while
    opening or reading the file propagates to the caller.

    """
    digest = hashlib.md5()
    # Binary mode matters: a text-mode read may translate line endings or
    # decode the data, either of which changes (or breaks) the hash.
    with open(filename, "rb") as stream:
        while True:
            chunk = stream.read(64 * 1024)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def build_sizemap(directory, sizemap, exclude=None):
    """Index every file under a directory by its size in bytes.

    directory -- root of the tree to walk.
    sizemap -- dict, updated in place, mapping a size to the list of
               normalized paths of the files having that size.
    exclude -- optional directory; when the walk reaches it, a notice is
               printed and neither it nor anything beneath it is indexed.

    Symbolic links are skipped.  A file whose size cannot be read is
    reported on standard output and skipped.  Returns None.

    """
    # Resolve the excluded directory once.  Comparing resolved paths makes
    # the match independent of how the two directories were spelled on the
    # command line (relative vs. absolute, or via a symlinked prefix).
    excluded = None
    if exclude is not None:
        excluded = os.path.realpath(exclude)

    for root, dirs, files in os.walk(directory):
        if excluded is not None and os.path.realpath(root) == excluded:
            print("(skipping %s)" % exclude)
            # Emptying the list in place stops os.walk from descending.
            dirs[:] = []
            continue
        for filename in files:
            full = os.path.normpath(os.path.join(root, filename))
            if os.path.islink(full):
                continue
            try:
                size = os.path.getsize(full)
            except (IOError, OSError) as e:
                # os.path.getsize reports failures as OSError; IOError is
                # kept for callers/platforms where the two are distinct.
                print(str(e))
                continue
            sizemap.setdefault(size, []).append(full)


### MAIN ###

def main(argv):
    """Command-line entry point: list files of src-dir absent from dest-dirs.

    argv -- the full argument vector, program name first; everything after
            the program name is parsed as options and positional arguments.

    The first positional argument is the source directory; any further
    ones are destination directories (those that are not directories are
    ignored).  Each source file whose content matches no destination file
    is printed and, when --move-to is given, renamed into that directory.
    Files are first grouped by size so that only same-sized candidates are
    hashed.

    Exits with a usage error when no source directory is supplied.

    """
    parser = OptionParser(usage="%prog [options] src-dir {dest-dir}")
    parser.add_option("--move-to",
                      dest="move_to",
                      default=None,
                      help="move unique files to this directory "
                           "(DOES NOT CHECK THAT THE FILE BASENAMES "
                           "ARE UNIQUE)")

    # Parse the vector we were handed rather than implicitly re-reading
    # sys.argv, so the argv parameter is actually honoured.
    (options, args) = parser.parse_args(argv[1:])
    if not args:
        parser.error("src-dir is required")

    dir_a = os.path.normpath(args[0])
    print("traversing %s..." % dir_a)
    src_by_size = {}
    build_sizemap(dir_a, src_by_size)

    dest_by_size = {}
    for arg in args[1:]:
        dest = os.path.normpath(arg)
        if os.path.isdir(dest):
            print("traversing %s..." % dest)
            build_sizemap(dest, dest_by_size, exclude=dir_a)

    def process(filename):
        # Report a unique file and, if requested, move it out of the way.
        print(filename)
        if options.move_to is not None:
            basename = os.path.basename(filename)
            destname = os.path.join(options.move_to, basename)
            # Never overwrite: keep appending '1' until the name is free.
            while os.path.exists(destname):
                destname += '1'
            print("renaming to %s..." % destname)
            os.rename(filename, destname)

    for size in src_by_size:
        if size not in dest_by_size:
            # No destination file has this size, so no content can match.
            for filename in src_by_size[size]:
                process(filename)
            continue

        src_by_digest = {}
        for filename in src_by_size[size]:
            src_by_digest.setdefault(md5(filename), []).append(filename)
        # Only membership is needed on the destination side.
        dest_digests = set()
        for filename in dest_by_size[size]:
            dest_digests.add(md5(filename))

        for digest in src_by_digest:
            if digest not in dest_digests:
                for filename in src_by_digest[digest]:
                    process(filename)


# Run only when executed as a script: pass the full argument vector
# (program name included) to main().
if __name__ == '__main__':
    main(sys.argv)