git @ Cat's Eye Technologies Feedmark / master eg / script / output_links.py
master

Tree @master (Download .tar.gz)

output_links.py @master · raw · history · blame

#!/usr/bin/env python3

# Copyright (c) 2019-2024 Chris Pressey, Cat's Eye Technologies
# This file is distributed under an MIT license.  See LICENSES/ directory.
# SPDX-License-Identifier: LicenseRef-MIT-X-Feedmark

# This script demonstrates how the modules from the feedmark package
# can be used in scripts.  It also requires the beautifulsoup4 package
# to be installed.

import json
import sys

from bs4 import BeautifulSoup
from feedmark.formats.markdown import markdown_to_html5
from feedmark.main import read_document_from


def extract_links(html_text):
    """Return the ``href`` value of each ``<a>`` element in `html_text`.

    The text is parsed with BeautifulSoup's ``"html.parser"`` backend and
    the anchors are visited in the order ``find_all`` yields them.
    Duplicate URLs are kept.

    Anchors that have no ``href`` attribute at all (for example
    ``<a name="top">``) are skipped, so the returned list never contains
    ``None``.  An anchor whose ``href`` is present but empty still
    contributes an empty string.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    # Tag.get() yields None when the attribute is missing; such anchors are
    # not links, so leave them out rather than reporting a null URL.
    return [href for href in hrefs if href is not None]


def extract_links_from_documents(documents):
    """Collect one link record (a dict) per URL referenced by `documents`.

    For each document, in order, records are produced for:

    * every entry of ``document.link_ref_defs`` -- tagged with the
      document title and, as ``name``, the first of the sorted
      unnormalized labels for that entry;
    * for each section: every ``(name, url)`` pair in ``section.images``,
      then every URL found in the section's property values (a list
      value is scanned item by item), then every URL found in the
      section body.  These are tagged with the section title and the
      title of the section's document.

    Property values and bodies are converted with ``markdown_to_html5``
    and scanned with ``extract_links``.

    Returns the list of records in the order they were discovered.
    """
    collected = []

    def make_link(url, document=None, section=None, **kwargs):
        # A record always has "url"; `document` wins over `section` when
        # both are given, and any extra keyword arguments are merged last.
        record = {"url": url}
        if document:
            record["document"] = document.title
        elif section:
            record["section"] = section.title
            record["document"] = section.document.title
        record.update(kwargs)
        return record

    def extend_links(section, md):
        # Render the Markdown, then record each URL against the section.
        html = markdown_to_html5(md)
        collected.extend(
            [make_link(found, section=section) for found in extract_links(html)]
        )

    for document in documents:
        ref_defs = document.link_ref_defs
        for label, (url, _title) in ref_defs.items():
            # Report the first (in sort order) label as written in the
            # source rather than the key used by the mapping.
            shown_name = sorted(ref_defs.unnormalized_labels_for(label))[0]
            collected.append(make_link(url, document=document, name=shown_name))

        for section in document.sections:
            for image_name, image_url in section.images:
                collected.append(
                    make_link(image_url, section=section, name=image_name)
                )

            for _key, value in section.properties.items():
                # Treat a scalar property as a one-item list so both
                # shapes go through the same loop.
                entries = value if isinstance(value, list) else [value]
                for entry in entries:
                    extend_links(section, entry)

            extend_links(section, section.body)

    return collected


def main(args):
    """Read each file named in `args` and print its links as JSON.

    Every filename is loaded with ``read_document_from``; the link records
    from all the documents are then written to standard output as a single
    JSON array, indented by four spaces with keys sorted.  No trailing
    newline is written.
    """
    documents = [read_document_from(filename) for filename in args]
    report = json.dumps(
        extract_links_from_documents(documents),
        indent=4,
        sort_keys=True,
    )
    sys.stdout.write(report)


# When run as a script, pass the command-line arguments that follow the
# program name to main().
if __name__ == "__main__":
    main(sys.argv[1:])