git @ Cat's Eye Technologies Dissociated-Parse / master

Tree @master (Download .tar.gz) @masterraw · history · blame

#!/usr/bin/env python3

# Copyright (c) 2021-2024 Chris Pressey, Cat's Eye Technologies
# This file is distributed under the MIT license.  For more information, see
# the file LicenseRef-MIT-X-Dissociated-Parse.txt in the LICENSES directory.
# SPDX-License-Identifier: LicenseRef-MIT-X-Dissociated-Parse

import json
import os
import re
import sys

from bs4 import BeautifulSoup

def scannerate(text):
    """Split *text* into a list of word and punctuation tokens.

    The text is scanned left to right.  At each position the patterns
    below are tried in order and the first one that matches wins:

    * a run of whitespace (consumed, but not emitted as a token),
    * a run of word characters, apostrophes and hyphens (a "word"),
    * any single character (punctuation, symbols, and so on).

    Returns the list of non-whitespace tokens in order of appearance;
    an empty string yields an empty list.
    """
    # Compiled once, up front, rather than on every pass of the scan loop.
    # DOTALL lets the final catch-all pattern match a newline as well, so
    # some pattern always matches and the scan always advances.
    patterns = [
        (re.compile(r'(\s+)', flags=re.DOTALL), 'whitespace'),
        (re.compile('([\\w\'-]+)', flags=re.DOTALL), 'word'),
        (re.compile('(.)', flags=re.DOTALL), 'any character'),
    ]

    pos = 0
    tokens = []
    while pos < len(text):
        for regexp, pattype in patterns:
            match = regexp.match(text, pos=pos)
            if match:
                token = match.group(1)
                if pattype not in ('whitespace',):
                    tokens.append(token)
                pos += len(token)
                # Restart from the first pattern at the new position;
                # otherwise the later patterns would be tried against the
                # text that follows this token (e.g. whitespace after a
                # word would be picked up by the catch-all pattern).
                break
    return tokens

def main(args):
    """Extract the paragraphs of the given HTML files into a JSON file.

    args is a list of filenames.  Each file is read and parsed as HTML,
    and the text of every <p> element has '--' replaced with an em dash.
    The collected paragraphs are written, under the key 'paragraphs', to
    data/all-paragraphs.json.
    """
    paras = []

    for filename in args:
        with open(filename, 'r') as f:
            text = f.read()
        soup = BeautifulSoup(text, features="html.parser")
        for para in soup.find_all('p'):
            para_text = para.text.replace('--', '—')
            # NOTE(review): the statement that stores each paragraph was
            # missing from the reviewed copy of this file.  Storing the
            # paragraph's token list is the presumed intent -- confirm
            # against the upstream repository.
            paras.append(scannerate(para_text))

    data = {
        'paragraphs': paras,
    }
    with open('data/all-paragraphs.json', 'w') as f:
        f.write(json.dumps(data, indent=4))

if __name__ == '__main__':