#!/usr/bin/env python3
# Copyright (c) 2024 Chris Pressey, Cat's Eye Technologies
# This file is distributed under an MIT license. See LICENSES directory.
# SPDX-License-Identifier: LicenseRef-MIT-X-Chainscape
"""
This script was lifted from the build tools for "Anne of Green Garbles", NaNoGenMo 2019:
https://git.catseye.tc/NaNoGenMo-Entries-2019/blob/master/Anne%20of%20Green%20Garbles/README.md
and modified slightly to fit this project.
* input format: HTML
* output format: tokenstream
Script that takes an HTML file and outputs a stream of tokens,
one per line (tokenstream).
Tries to ignore any element of the HTML which is not obviously
a paragraph of text (for example, ToC entries, chapter headings,
and Gutenberg license blocks.)
It retains punctuation symbols such as “ and ” and ( and ) and
. and , and ! and treats these as individual tokens.
It also produces ¶ symbols to indicate the end of each paragraph.
"""
import sys
import re
from bs4 import BeautifulSoup, NavigableString
def scan_token(s, tokens):
    """Scan one token off the front of ``s`` and append it to ``tokens``.

    Leading whitespace is skipped first.  A token is either a maximal run
    of regex word characters or, failing that, one single character, so
    every punctuation mark becomes a token of its own.

    The remainder is obtained by slicing just past the token rather than
    by capturing "the rest of the line" with the regex, so text that
    contains embedded newlines is handled instead of being silently
    dropped.

    :param s: the text that remains to be tokenized
    :param tokens: list to which the scanned token is appended (mutated)
    :returns: the text following the token, or ``None`` if ``s`` was empty
        or all whitespace and therefore no token could be scanned
    """
    s = s.lstrip()
    # After lstrip() the first character (if any) is not whitespace, so
    # this pattern fails only when nothing is left to scan.
    match = re.match(r'\w+|.', s)
    if match is None:
        return None
    tokens.append(match.group())
    return s[match.end():]
def tokenize(s):
    """Split ``s`` into a list of tokens.

    Each token is either a maximal run of regex word characters or a
    single non-whitespace character (so punctuation marks such as quotes,
    parentheses, commas and full stops are individual tokens).  Whitespace
    of any kind, including newlines, only separates tokens and is never
    emitted.

    :param s: the text to tokenize; a falsy value yields an empty list
    :returns: list of token strings, in order of appearance
    """
    if not s:
        return []
    # One linear pass; "\w+" is tried first so words are kept whole, and
    # "\S" picks up any other visible character one at a time.
    return re.findall(r'\w+|\S', s)
def render(node):
    """Return a short label describing ``node``.

    The label is the node's name in lower case (empty if the name is
    missing or falsy).  If the node has an ``attrs`` attribute, the value
    of its ``class`` entry is appended between angle brackets, with an
    empty string standing in when there is no such entry.
    """
    tag = node.name or ""
    label = tag.lower()
    if not hasattr(node, 'attrs'):
        return label
    css_class = node.attrs.get('class') or ""
    return label + "<{}>".format(css_class)
def process_children(container, indent=""):
    """Print the tokens of each paragraph found beneath ``container``.

    Walks the direct children of ``container`` and, for each one:

    * skips it if it is a ``NavigableString``;
    * skips it if its ``class`` attribute contains ``toc`` or
      ``licenseContainer``;
    * skips it if its text contains "PROJECT GUTENBERG" (compared in
      upper case);
    * if it is a ``p`` element, prints its tokens one per line followed
      by a line holding the paragraph mark;
    * if it is a ``div`` or ``section`` element, recurses into it.

    :param container: element whose ``children`` are examined
    :param indent: grows by one space per level of recursion; it is only
        passed along and does not affect the output
    """
    excluded_classes = ('toc', 'licenseContainer')
    for element in container.children:
        if isinstance(element, NavigableString):
            continue
        css_classes = element.attrs.get('class')
        if css_classes and any(marker in css_classes for marker in excluded_classes):
            continue
        # Newlines inside the element are flattened to spaces before the
        # text is inspected or tokenized.
        flattened = element.get_text().lstrip().replace('\n', ' ')
        if 'PROJECT GUTENBERG' in flattened.upper():
            continue
        tag = element.name.lower()
        if tag == 'p':
            for line in [*tokenize(flattened), "¶"]:
                print(line)
        elif tag in ('div', 'section'):
            process_children(element, indent=indent + " ")
def main(args):
    """Read the HTML file named by ``args[0]`` and print its token stream.

    The file is read as bytes, parsed with BeautifulSoup using the
    ``html5lib`` parser, and the children of the document's ``body`` are
    handed to ``process_children``.

    :param args: command-line arguments; the first one is the file name
    """
    source_path = args[0]
    with open(source_path, 'rb') as html_file:
        raw_html = html_file.read()
    document = BeautifulSoup(raw_html, 'html5lib')
    process_children(document.body)
# When run as a script, pass the command-line arguments (without the
# program name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])