git @ Cat's Eye Technologies Space-Madness / master bin / find-dups
master

Tree @master (Download .tar.gz)

find-dups @master raw · history · blame

#!/usr/bin/env python

r"""
Find duplicate files.

Usage: find-dups [--canonical=<prefix>] [--format=<fmt>] {dir}

Any number of directories may be given.

If --canonical is not given, list all duplicate files found in those
directories, indexed by their md5 hashes.

If --canonical is given, it is a prefix; list all duplicate files
that do not have that prefix.

If --format is given, apply that format string to output filenames.
For example,

    find-dups foo --canonical=foo/bar --format="rm \"%s\""

...will produce a script to remove the duplicate files found somewhere
in foo other than in foo/bar.

"""

import hashlib
import os
import sys
from optparse import OptionParser


def md5(filename):
    """Compute and return the MD5 hash of the named file.

    The file is opened in binary mode and consumed in fixed-size chunks,
    so the digest is computed over the exact bytes on disk and the whole
    file is never held in memory at once.

    :param filename: path of the file to hash.
    :returns: the MD5 digest as a hexadecimal string.
    :raises IOError: (``OSError`` on Python 3) if the file cannot be
        opened or read.

    """
    digest = hashlib.md5()
    # Binary mode is required: text mode may translate line endings on some
    # platforms, and on Python 3 it yields str, which hashlib refuses.
    with open(filename, "rb") as stream:
        while True:
            chunk = stream.read(65536)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def build_sizemap(directory, sizemap, exclude=None):
    """Group the files found under a directory by their size in bytes.

    Walks ``directory`` recursively and, for every file that is not a
    symbolic link, appends its normalized path to ``sizemap[size]``.
    ``sizemap`` is updated in place; nothing is returned.

    :param directory: root of the tree to traverse.
    :param sizemap: dict mapping a file size to the list of paths having
        that size; entries are added to it.
    :param exclude: optional directory path.  When the walk reaches it, a
        "(skipping ...)" line is printed and neither its files nor its
        subdirectories are visited.

    """
    if exclude:
        # Roots are compared in normalized form, so normalize the excluded
        # path as well; otherwise e.g. "foo/bar/" could never match.
        exclude = os.path.normpath(exclude)

    for root, dirs, files in os.walk(directory):
        if os.path.normpath(root) == exclude:
            print("(skipping %s)" % exclude)
            # Emptying the list in place stops os.walk from descending.
            dirs[:] = []
            continue

        for filename in files:
            full = os.path.normpath(os.path.join(root, filename))
            if os.path.islink(full):
                continue
            try:
                size = os.path.getsize(full)
            except (IOError, OSError) as e:
                # getsize raises OSError, which is distinct from IOError on
                # Python 2; report the problem and keep walking.
                print(str(e))
            else:
                sizemap.setdefault(size, []).append(full)


### MAIN ###


def main(argv):
    """Find duplicate files in the directories named on the command line.

    Files are first grouped by size; only files that share a size with at
    least one other file are hashed.  Without ``--canonical``, every group
    of files with the same MD5 hash is printed as ``hash<TAB>filename``
    lines.  With ``--canonical``, for each group containing exactly one
    file whose path starts with the canonical prefix, the remaining files
    are printed through the ``--format`` string.

    :param argv: the full argument vector, with the program name at
        index 0 (as in ``sys.argv``).

    """
    parser = OptionParser()
    parser.add_option("--canonical",
                      dest="canonical",
                      default=None,
                      help="list all duplicate files found "
                           "that do not have this prefix")
    parser.add_option("--format",
                      dest="format",
                      default="%s",
                      help="format string for output filenames")
    # Parse the arguments we were given rather than silently falling back
    # to sys.argv; the program name in argv[0] is not an argument.
    (options, args) = parser.parse_args(argv[1:])

    sizemap = {}
    for directory in args:
        print("traversing %s..." % directory)
        build_sizemap(directory, sizemap)

    # Files of differing sizes cannot be identical, so a size that occurs
    # only once never needs to be hashed.
    hashmap = {}
    for filenames in sizemap.values():
        if len(filenames) < 2:
            continue
        for filename in filenames:
            hashmap.setdefault(md5(filename), []).append(filename)

    if options.canonical is None:
        for digest in hashmap:
            filenames = sorted(hashmap[digest])
            if len(filenames) > 1:
                for filename in filenames:
                    print("%s\t%s" % (digest, filename))
        return

    # Blank line between the progress messages and the formatted output.
    print("")
    for digest in hashmap:
        filenames = sorted(hashmap[digest])
        if len(filenames) < 2:
            continue
        canonicals = [name for name in filenames
                      if name.startswith(options.canonical)]
        # Only act when exactly one copy lives under the canonical prefix.
        if len(canonicals) != 1:
            continue
        for filename in filenames:
            if not filename.startswith(options.canonical):
                print(options.format % filename)

# Script entry point: run the tool only when this file is executed
# directly, passing along the process's full argument vector.
if __name__ == "__main__":
    main(sys.argv)