git @ Cat's Eye Technologies The-Dipple / master python / mk-dups-index

Tree @master (Download .tar.gz)

mk-dups-index @masterraw · history · blame

#!/usr/bin/env python

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense

"""Create an index to speed up finding duplicate files.

Usage: mk-dups-index dir >index.txt

NOTE 1: if the contents of 'dir' change, you need to re-make the index.
NOTE 2: the contents of 'dir' need not be unique.
"""


import hashlib
import os
import sys

def md5(filename):
    """Compute and return the MD5 hash of the named file.

    The file is opened in binary mode so that its bytes are hashed exactly
    as stored on disk (no newline translation or text decoding), and it is
    read in fixed-size chunks so that large files are never loaded into
    memory all at once.

    filename -- path of the file to hash.

    Returns the digest as a string of hexadecimal digits.  Any error raised
    by open() or read() (e.g. a missing or unreadable file) propagates to
    the caller.
    """
    digest = hashlib.md5()
    # The context manager guarantees the handle is closed even if a read
    # fails part-way through.
    with open(filename, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns the
        # empty bytes object, which signals end-of-file.
        for chunk in iter(lambda: stream.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()

### MAIN ###

def main(argv):
    """Print one "<md5-hex> <path>" line for every file under argv[1].

    argv -- the command-line argument list; argv[1] is the directory to
            index.  The directory is walked recursively with os.walk().

    Each path is joined with its containing directory and normalised
    before being hashed and printed.  Output goes to standard output.
    """
    for root, dirs, files in os.walk(argv[1]):
        for filename in files:
            full = os.path.normpath(os.path.join(root, filename))
            digest = md5(full)
            # A single pre-formatted argument prints the same text under
            # both the Python 2 print statement and the Python 3 function.
            print("%s %s" % (digest, full))

if __name__ == '__main__':