git @ Cat's Eye Technologies Chainscape / master src / extract-tokenstream.py
master

Tree @master (Download .tar.gz)

extract-tokenstream.py @master · raw · history · blame

#!/usr/bin/env python3

# Copyright (c) 2024 Chris Pressey, Cat's Eye Technologies
# This file is distributed under an MIT license.  See LICENSES directory.
# SPDX-License-Identifier: LicenseRef-MIT-X-Chainscape

"""

This script was lifted from the build tools for "Anne of Green Garbles", NaNoGenMo 2019:
https://git.catseye.tc/NaNoGenMo-Entries-2019/blob/master/Anne%20of%20Green%20Garbles/README.md
and modified slightly to fit this project.

*   input format: HTML
*   output format: tokenstream

Script that takes an HTML file and outputs a stream of tokens,
one per line (tokenstream).

Tries to ignore any element of the HTML which is not obviously
a paragraph of text (for example, ToC entries, chapter headings,
and Gutenberg license blocks.)

It retains punctuation symbols such as “ and ” and ( and ) and
. and , and ! and treats these as individual tokens.

It also produces ¶ symbols to indicate the end of each paragraph.

"""


import sys
import re
from bs4 import BeautifulSoup, NavigableString


def scan_token(s, tokens):
    """Scan a single token off the front of `s` and append it to `tokens`.

    Leading whitespace is skipped first.  A token is either a maximal run
    of word characters or, failing that, exactly one character (in
    practice a punctuation symbol, since whitespace was already skipped).

    Returns the unscanned remainder of the string.  If `s` holds nothing
    but whitespace, no token is appended and the empty string is returned.
    """
    s = s.lstrip()
    if not s:
        return ""

    # Match only the token and slice off the rest.  Capturing the remainder
    # with `(.*?)$` fails as soon as the text contains an embedded newline
    # (`.` does not match "\n"), which would drop the rest of the input.
    match = re.match(r'\w+', s)
    token = match.group(0) if match else s[0]
    tokens.append(token)
    return s[len(token):]


def tokenize(s):
    """Split the string `s` into a list of tokens.

    Repeatedly hands the not-yet-consumed text to scan_token(), which
    appends at most one token per call, until the remainder is falsy.
    """
    collected = []
    remaining = s
    while remaining:
        remaining = scan_token(remaining, collected)
    return collected


def render(node):
    """Return a short label for `node`: its lower-cased name, followed by
    its `class` attribute value in angle brackets when the node has attrs.

    A node with a falsy name yields an empty name part; a missing or empty
    `class` attribute yields empty angle brackets.
    """
    label = node.name.lower() if node.name else ""
    if not hasattr(node, 'attrs'):
        return label
    css_classes = node.attrs.get('class') or ""
    return label + "<{}>".format(css_classes)


def process_children(container, indent=""):
    """Print the tokenstream for the paragraphs found under `container`.

    Walks the direct children of `container`:

    *   bare strings (NavigableString instances) are skipped;
    *   elements whose class includes `toc` or `licenseContainer` are skipped;
    *   elements whose text mentions "PROJECT GUTENBERG" (in any case) are skipped;
    *   `p` elements are tokenized and printed one token per line,
        followed by a ¶ line;
    *   `div` and `section` elements are descended into recursively.

    `indent` is only lengthened and passed on to the recursive calls;
    nothing is printed with it.
    """
    skipped_classes = ('toc', 'licenseContainer')

    for element in container.children:
        if isinstance(element, NavigableString):
            continue

        css_classes = element.attrs.get('class')
        if css_classes and any(name in css_classes for name in skipped_classes):
            continue

        # Flatten the element's text onto one line before inspecting it.
        flattened = element.get_text().lstrip().replace('\n', ' ')
        if 'PROJECT GUTENBERG' in flattened.upper():
            continue

        tag = element.name.lower()
        if tag == 'p':
            for token in tokenize(flattened):
                print(token)
            print("¶")
        elif tag in ('div', 'section'):
            process_children(element, indent=indent + "  ")


def main(args):
    """Parse the HTML file named by `args[0]` and print its tokenstream.

    The file is read as bytes, parsed with BeautifulSoup using the
    html5lib parser, and the resulting document body is handed to
    process_children().
    """
    path = args[0]
    with open(path, 'rb') as handle:
        raw_html = handle.read()
    document = BeautifulSoup(raw_html, 'html5lib')
    process_children(document.body)


# When run as a script, pass the command-line arguments (without the
# program name) to main(); the first one is the HTML file to process.
if __name__ == '__main__':
    main(sys.argv[1:])