git @ Cat's Eye Technologies Dipple / master python / mk-dups-index
master

Tree @master (Download .tar.gz)

mk-dups-index @master · raw · history · blame

#!/usr/bin/env python

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense

"""
Create an index to speed up finding duplicate files.

Usage: mk-dups-index dir >index.txt

NOTE 1: if the contents of 'dir' change, you need to re-make the index.
NOTE 2: the contents of 'dir' need not be unique.

"""

import hashlib
import os
import sys


def md5(filename, chunk_size=65536):
    """Compute and return the MD5 hash of the named file.

    The file is opened in binary mode and read in chunks, so the digest
    is computed over the file's exact bytes without loading the whole
    file into memory.

    filename -- path of the file to hash.
    chunk_size -- maximum number of bytes to read per iteration; must
    be a positive integer.

    Returns the digest as a hexadecimal string.

    Raises ValueError if chunk_size is not positive.  Errors raised by
    opening or reading the file are propagated to the caller.

    """
    if chunk_size <= 0:
        # read(0) always returns an empty result, which would end the
        # loop at once and silently yield the hash of an empty file.
        raise ValueError("chunk_size must be a positive integer")
    digest = hashlib.md5()
    # Binary mode: the hash must cover the raw bytes, with no newline
    # translation or text decoding applied.
    with open(filename, "rb") as stream:
        # An empty read marks end of file.
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


### MAIN ###


def main(argv):
    """Print an index line for every file found under a directory.

    argv -- the command-line argument list; argv[1] is the directory
    to walk.

    For each file found while walking the directory, one line is
    written to standard output: the file's MD5 hash, a space, and the
    file's normalized path.  If no directory argument is given, a usage
    message is reported and the program exits with a non-zero status.

    """
    if len(argv) < 2:
        # sys.exit() with a string reports it on stderr, which keeps
        # the message out of the index when stdout is redirected.
        sys.exit("Usage: mk-dups-index dir >index.txt")
    for root, _dirs, files in os.walk(argv[1]):
        for filename in files:
            full = os.path.normpath(os.path.join(root, filename))
            # A single parenthesized argument prints the same text
            # whether print is a statement or a function.
            print("%s %s" % (md5(full), full))

if __name__ == "__main__":
    # Executed as a script: hand the full argument list to main().
    main(sys.argv)