git @ Cat's Eye Technologies NaNoGenLab / master guten-gutter / guten-gutter.py
master

Tree @master (Download .tar.gz)

guten-gutter.py @master · raw · history · blame

#!/usr/bin/env python

"""Usage: guten-gutter.py [--output-dir DIR] FILES

Cleans the Project Gutenberg boilerplate off of the given input files.
"""

from optparse import OptionParser
import os
import re
import sys


class AbstractBaseCleaner(object):
    """Spells out the interface that every "cleaner" object implements."""

    def __str__(self):
        # A cleaner is identified simply by the name of its concrete class.
        return type(self).__name__

    def clean(self, lines, name=''):
        """Yield the cleaned form of ``lines``, an iterable of lines.

        ``name`` optionally names the thing being cleaned; it exists only
        so that errors can be reported meaningfully.

        Any iterable of lines will do, which includes file-like objects.

        This base version always raises NotImplementedError; concrete
        cleaners are expected to override it.
        """
        raise NotImplementedError()

    def has_failed(self, original_lines, result_lines, name=''):
        """Return a boolean saying whether this cleaner appears to have
        failed, given the lines it was fed (``original_lines``) and the
        lines it produced from them (``result_lines``).

        The base version never reports a failure.
        """
        return False


class TrailingWhitespaceCleaner(AbstractBaseCleaner):
    """Cleaner that removes trailing whitespace from every line."""

    def clean(self, lines, name=''):
        """Yield each of ``lines`` with its trailing whitespace (which
        includes any line terminator) stripped.  ``name`` is unused.
        """
        for raw_line in lines:
            trimmed = raw_line.rstrip()
            yield trimmed


class IllustrationCleaner(AbstractBaseCleaner):
    """Cleaner that drops lines consisting solely of a single
    ``[Illustration...]`` tag (optionally surrounded by whitespace).
    """

    def clean(self, lines, name=''):
        """Yield every line of ``lines`` except those that are nothing but
        an ``[Illustration...]`` tag.  Kept lines are passed through
        unmodified.  ``name`` is unused.
        """
        tag_only = re.compile(r'^\s*\[Illustration.*?\]\s*$')
        for candidate in lines:
            if tag_only.match(candidate) is None:
                yield candidate


class SentinelCleaner(AbstractBaseCleaner):
    """Cleans the input lines, returning only the lines between the start
    sentinel (exclusive) and the end sentinel (exclusive.)

    The start sentinel is actually "super-exclusive" in that neither it,
    nor any non-blank lines immediately following it, are included in
    the output.

    Both sentinels are regular expressions which are matched, using
    ``re.match``, against the upper-cased version of each line.

    When a sentinel is not passed to the constructor, it is taken from
    the ``START_RE`` / ``END_RE`` attribute of the class; this class
    does not define those itself, so subclasses (or callers) must.

    Note that cleaned lines are stripped of trailing whitespace.
    """

    def __init__(self, start_re=None, end_re=None):
        """Remember the sentinels to use.

        ``start_re`` and ``end_re`` are regular expressions; either may be
        omitted, in which case ``self.START_RE`` / ``self.END_RE`` is used.
        """
        if start_re is None:
            start_re = self.START_RE
        self.start_re = start_re
        if end_re is None:
            end_re = self.END_RE
        self.end_re = end_re
        # Kept for backward compatibility: mirrors the state reached by
        # the most recent call to clean().
        self.state = 'pre'

    def clean(self, lines, name=''):
        """Yield the lines of ``lines`` lying between the sentinels, each
        stripped of trailing whitespace.  ``name`` is unused.

        Every call starts afresh in the 'pre' state, so a single instance
        can safely be used to clean more than one input.
        """
        # Compile once per call rather than looking the patterns up for
        # every line.  re.compile() accepts already-compiled patterns too.
        start_re = re.compile(self.start_re)
        end_re = re.compile(self.end_re)

        # The state lives in a local variable so that a previous run
        # (which normally ends in 'post') cannot leak into this one.
        state = self.state = 'pre'
        for line in lines:
            line = line.rstrip()
            if state == 'pre':
                # Skip everything up to and including the start sentinel.
                if start_re.match(line.upper()):
                    state = self.state = 'consuming-start'
            elif state == 'consuming-start':
                # Skip the non-blank lines that directly follow the start
                # sentinel; the first blank line ends that run (and is
                # itself not emitted.)
                if not line:
                    state = self.state = 'mid'
            else:
                # state == 'mid': emit lines until the end sentinel.
                if end_re.match(line.upper()):
                    self.state = 'post'
                    # Nothing after the end sentinel is ever emitted, so
                    # there is no point in reading any further.
                    return
                yield line


class GutenbergCleaner(SentinelCleaner):
    """Sentinel cleaner whose sentinels are the "START OF ... PROJECT
    GUTENBERG" and "END OF ... PROJECT GUTENBERG" marker lines.
    """

    START_RE = r'^\**\s*START\s+OF\s+(TH(IS|E)\s+)?PROJECT\s+GUTENBERG.*?$'
    END_RE = r'^\**\s*END\s+OF\s+(TH(IS|E)\s+)?PROJECT\s+GUTENBERG.*?$'

    def has_failed(self, original_lines, result_lines, name=''):
        """Report failure when nothing at all survived the cleaning, or
        when more than 450 lines were removed.  ``name`` is unused.
        """
        lines_before = len(list(original_lines))
        lines_after = len(list(result_lines))
        if lines_after == 0:
            return True
        # usually under 400, but sometimes as high as 418...
        return (lines_before - lines_after) > 450


class ProducedByCleaner(SentinelCleaner):
    """Sentinel cleaner whose start sentinel is a "produced by"-style
    credit line ("Produced by...", "This etext was prepared by...", and
    so on) and whose end sentinel is the "END OF ... PROJECT GUTENBERG"
    marker line.
    """

    START_RE = (r'^((THIS\s+)?E\-?(TEXT|BOOKS?)\s+(WAS\s+)?)?'
                '(PRODUCED|PREPARED|TRANSCRIBED|UPDATED|SCANNED).*?$')
    END_RE = r'^\**\s*END\s+OF\s+(TH(IS|E)\s+)?PROJECT\s+GUTENBERG.*?$'

    def has_failed(self, original_lines, result_lines, name=''):
        """Report failure when nothing at all survived the cleaning, or
        when more than 20 lines were removed.  ``name`` is unused.
        """
        lines_before = len(list(original_lines))
        lines_after = len(list(result_lines))
        if lines_after == 0:
            return True
        # Note: this is not sufficient by itself; assumes GutenbergCleaner
        # got the trailing legal text, which is large.
        return (lines_before - lines_after) > 20


class MultiCleaner(AbstractBaseCleaner):
    """Runs a sequence of cleaners over an input, one after the other.

    As soon as one of the cleaners reports failure, its output is thrown
    away, the remaining cleaners are skipped, and the lines as they stood
    just before the failing cleaner ran are what gets returned.
    """

    def __init__(self, cleaners=()):
        """``cleaners`` is the iterable of cleaner objects to apply, in
        order.
        """
        self.cleaners = cleaners

    def clean(self, lines, name=''):
        """Yield ``lines`` as cleaned by each cleaner in turn.

        A failing cleaner is reported on standard error (using ``name``
        to identify the input) and ends the chain early.
        """
        current = list(lines)
        for stage in self.cleaners:
            candidate = list(stage.clean(current, name=name))
            if not stage.has_failed(current, candidate, name=name):
                # Accept this stage's output as the next stage's input.
                current = candidate
                continue
            sys.stderr.write("%s failed to clean '%s'\n" % (stage, name))
            break

        for cleaned_line in current:
            yield cleaned_line


def main(argv):
    """Command-line entry point.

    ``argv`` is the full argument vector, program name included (i.e. in
    the shape of ``sys.argv``.)  Every file named on the command line is
    cleaned and the result is written either to standard output or, when
    ``--output-dir`` is given, to a file of the same base name in that
    directory.
    """
    optparser = OptionParser(__doc__.strip())
    optparser.add_option("--strip-illustrations", default=False,
                         action='store_true',
                         help="also try to remove [Illustration: foo]'s")
    optparser.add_option("--output-dir", default=None, metavar='DIR',
                         help="if given, save the resulting files to this "
                              "directory (under their original names) "
                              "instead of dumping them to standard output")
    (options, args) = optparser.parse_args(argv[1:])

    for filename in args:
        # Build a fresh set of cleaners for each file so that no
        # per-instance state can carry over from one file to the next.
        cleaners = [
            TrailingWhitespaceCleaner(),
            GutenbergCleaner(),
        ]
        if options.strip_illustrations:
            cleaners.append(IllustrationCleaner())
        cleaners.append(ProducedByCleaner())
        cleaner = MultiCleaner(cleaners)

        # Read and clean the whole input BEFORE the output file is opened:
        # if the output directory is the one the input lives in, opening
        # the output for writing first would truncate the input.
        with open(filename, 'r') as f:
            cleaned = [
                line + '\n' for line in cleaner.clean(f, name=filename)
            ]

        if options.output_dir is None:
            sys.stdout.writelines(cleaned)
        else:
            out_filename = os.path.join(
                options.output_dir, os.path.basename(filename)
            )
            # The with-statement closes the file even if writing fails.
            with open(out_filename, 'w') as out:
                out.writelines(cleaned)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main(sys.argv)