#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense

from argparse import ArgumentParser
import json
import re
import os
import subprocess

from feedmark.loader import read_document_from


# The SPDX tags below are assembled with .format() so that the literal
# tag strings never appear verbatim in this file; presumably this keeps
# SPDX license scanners from reading the generated README header's CC0
# tags as a second license declaration for this script.
UNLICENSE_HEADER = """<!--
{}-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.

{}-License-Identifier: CC0-1.0
-->
""".format('SPDX', 'SPDX')


def make_anchor(title):
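    """Derive a GitHub-style Markdown anchor from a heading title:
    punctuation is dropped, whitespace and hyphens become hyphens,
    and the result is lowercased.

    Hand-checked example: make_anchor("Type Theory") == "type-theory"
    """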
    return re.sub(
        r'[-\s]',
        '-',
        re.sub(r'[^\w\s-]', '', title).strip().lower()
    )


def format_see_also_link(topic, parent=".."):
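    """Format a Markdown link to another topic's README, relative to
    `parent`.

    Illustrative example (the topic name is assumed, not taken from
    this repository):
    format_see_also_link("Type Theory")
        == "[Type Theory](../Type%20Theory/README.md#type-theory)"
    """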
    title = topic  # TODO no, get this from caller
    return (
        '[{}]({}/{}/README.md#{})'.format(
            title,
            parent,
            topic.replace(' ', '%20'),
            make_anchor(title)
        )
    )


def format_see_also_links(see_also, **kwargs):
    return ', '.join([format_see_also_link(topic, **kwargs) for topic in see_also])


def format_webpage(webpage):
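    """Render a webpage entry as a Markdown link, or as an `###` heading
    when the entry is a section heading rather than a bookmark."""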
    url = webpage["properties"].get("url")
    is_heading = webpage["properties"].get("is-heading")
    if url:
        return "[{}]({})".format(
            webpage["title"],
            url,
        )
    elif is_heading:
        return "### {}".format(webpage["title"])
    else:
        raise NotImplementedError(
            "webpage entry {!r} has neither a url nor is-heading".format(webpage["title"])
        )


def format_paper(paper):
    url = paper["properties"].get("url")
    if url:
        return "[{}]({})".format(
            paper["title"],
            url,
        )
    else:
        onlines = paper["properties"].get("online", [])
        online_at = " (online @ {})".format(', '.join(onlines)) if onlines else ""

        return "{}{}".format(
            paper["title"], online_at,
        )


def format_repo(repo):
    url = repo["properties"].get("url")
    # Fail loudly rather than silently emitting a "[title](None)" link.
    assert url, "repo entry {!r} has no url".format(repo["title"])
    return "[{}]({})".format(
        repo["title"],
        url,
    )


def format_book(book):
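    """Render a book entry as plain text, appending any locations where
    it can be read online, borrowed, or borrowed with print
    disabilities."""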
    onlines = book["properties"].get("online", [])
    online_at = " (online @ {})".format(', '.join(onlines)) if onlines else ""

    borrows = book["properties"].get("borrow", [])
    borrow_at = " (borrow @ {})".format(', '.join(borrows)) if borrows else ""

    borrowwpds = book["properties"].get("borrow-with-print-disabilities", [])
    borrowwpd_at = " (borrow with print disabilities @ {})".format(', '.join(borrowwpds)) if borrowwpds else ""

    return "{}{}{}{}".format(
        book["title"], online_at, borrow_at, borrowwpd_at
    )


def write_readme_file(base_dir, topic, topic_section, entries):
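    """Write by-topic/<topic>/README.md: a title, navigation links, and
    the topic's web resources, repositories, papers, and books under
    separate headings."""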

    webpages = entries["webpages"]
    repos = entries["repos"]
    books = entries["books"]
    papers = entries["papers"]

    title = topic_section["properties"].get("title", topic)
    see_also = topic_section["properties"].get("see-also", "")
    if see_also:
        see_also = [s.strip() for s in see_also.split(",")]
    else:
        see_also = []

    see_also = " | _See also: {}_".format(format_see_also_links(see_also)) if see_also else ""

    with open(os.path.join(base_dir, "by-topic", topic, "README.md"), 'w') as f:
        f.write(title + "\n")
        f.write("-" * len(title) + "\n")
        f.write("\n{}\n".format(UNLICENSE_HEADER))
        f.write("[(Up)](../../README.md#topics){}\n".format(see_also))
        f.write("""
- - - -

""")
        if webpages:
            f.write("\n### Web resources\n\n")
            for i, webpage in enumerate(webpages):
                f.write("{}\n".format(format_webpage(webpage)))
                f.write("\n" if i < len(webpages) - 1 else "")

        if repos:
            f.write("\n### Repositories\n\n")
            for i, repo in enumerate(repos):
                f.write("{}\n".format(format_repo(repo)))
                f.write("\n" if i < len(repos) - 1 else "")

        if papers:
            f.write("\n### Papers\n\n")
            for i, paper in enumerate(papers):
                f.write("{}\n".format(format_paper(paper)))
                f.write("\n" if i < len(papers) - 1 else "")

        if books:
            f.write("\n### Books\n\n")
            for i, book in enumerate(books):
                f.write("{}\n".format(format_book(book)))
                f.write("\n" if i < len(books) - 1 else "")


class Collector:
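    """Loads TOPICS.md and the bookmark files under by-topic/, tracking
    which directories have been seen, per-kind entry counts, and any
    commentary indexed by entry title."""
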
    def __init__(self, base_dir, webpages, repos, books, papers):
        self.base_dir = base_dir
        self.webpages = webpages
        self.repos = repos
        self.books = books
        self.papers = papers
        self.commentary = {}
        self.topics = read_document_from(os.path.join(base_dir, "TOPICS.md")).to_json_data()
        self.topic_dirs = set([f for f in os.listdir(os.path.join(self.base_dir, "by-topic")) if self.is_bookmark_dir(f)])
        self.seen_dirs = set()
        self.counts = {"webpages": 0, "repos": 0, "books": 0, "papers": 0}

    def is_bookmark_dir(self, dir_name):
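        """True if the directory contains at least one of the four
        bookmark files."""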
        return any([
            os.path.isfile(os.path.join(self.base_dir, "by-topic", dir_name, filename))
            for filename in ["Webpages.md", "Books.md", "Papers.md", "Repositories.md"]
        ])

    def load_feedmark_sections(self, topic, filename):
        path = os.path.join(self.base_dir, "by-topic", topic, filename)
        if os.path.isfile(path):
            return read_document_from(path).to_json_data()["sections"]
        else:
            return []

    def load_topic(self, topic):
        assert topic in self.topic_dirs, "topics that are not dirs: {}".format(topic)
        self.seen_dirs.add(topic)
        self.webpages[topic] = self.load_feedmark_sections(topic, "Webpages.md")
        self.counts["webpages"] += len([w for w in self.webpages[topic] if not w["properties"].get("is-heading")])
        self.repos[topic] = self.load_feedmark_sections(topic, "Repositories.md")
        self.counts["repos"] += len(self.repos[topic])
        self.books[topic] = self.load_feedmark_sections(topic, "Books.md")
        self.counts["books"] += len(self.books[topic])
        self.papers[topic] = self.load_feedmark_sections(topic, "Papers.md")
        self.counts["papers"] += len(self.papers[topic])

    def check_entry_topics(self):
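        """Check that every entry's `topics` property is a string naming
        known topic directories and including the entry's own topic;
        also copy the rating onto the entry from matching commentary."""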

        def check_entry(topic, entry):
            assert isinstance(topic, str), "{}: {}".format(entry["title"], topic)
            topics = entry["properties"].get("topics", topic)
            assert isinstance(topics, str), "{}: {}".format(entry["title"], topics)
            topics = [t.strip() for t in topics.split(',')]
            assert topic in topics, "{}: {} not in {}".format(entry["title"], topic, topics)
            for x in topics:
                assert x in self.topic_dirs, "{}: {} not in {}".format(entry["title"], x, self.topic_dirs)
            # entry["properties"]["topics"] = topics

            if entry["title"] in self.commentary:
                commentary = self.commentary[entry["title"]]
                entry["properties"]["rating"] = commentary["properties"]["rating"]
                # print(topic, entry["title"])  # , commentary)
                # TODO generate a LINK to the commentary entry!

        for topic in self.topic_dirs:
            for entry in self.webpages.get(topic, []):
                check_entry(topic, entry)
            for entry in self.repos.get(topic, []):
                check_entry(topic, entry)
            for entry in self.books.get(topic, []):
                check_entry(topic, entry)
            for entry in self.papers.get(topic, []):
                check_entry(topic, entry)

    def load_commentary(self, topic):
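        """Read the topic's commentary file, if present, and index its
        sections by title; duplicate titles across topics are rejected."""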
        path = os.path.join(self.base_dir, "by-topic", topic, "commentary", "Chris Pressey.md")
        sections = []
        if os.path.isfile(path):
            sections = read_document_from(path).to_json_data()["sections"]
        for item in sections:
            assert item["title"] not in self.commentary
            self.commentary[item["title"]] = item


def dump_at_rating(f, c, entries, target_rating, formatter):
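    """Write all entries with the given rating to f, grouped by topic;
    headings and Wikipedia links are skipped. Returns the number of
    entries written."""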
    count = 0
    for topic in sorted(c.topic_dirs):
        selecteds = []
        for entry in entries[topic]:
            if entry["properties"].get("is-heading"):
                continue

            url = entry["properties"].get("url", "")
            if url.startswith("https://en.wikipedia"):
                continue

            link = "*   {}".format(formatter(entry))

            rating = entry["properties"].get("rating", "TODO")
            if rating == target_rating:
                selecteds.append(link)
        if selecteds:
            f.write("\n### {}\n\n".format(topic))
            for selected in selecteds:
                f.write(selected)
                f.write("\n")
            count += len(selecteds)
    return count


def dump_rating_page(c, rating, rating_name, webpages, repos, books, papers):
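    """Write by-rating/<rating_name>.md covering webpages, books, papers,
    and repositories at the given rating; returns the entry count."""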
    count = 0
    with open(os.path.join(c.base_dir, "by-rating", rating_name + ".md"), "w") as f:
        title = rating_name + " Resources"
        f.write(title + "\n")
        f.write("=" * len(title) + "\n")
        f.write("\n{}".format(UNLICENSE_HEADER))
        f.write("\n{} Webpages\n--------------\n".format(rating_name))
        count += dump_at_rating(f, c, webpages, rating, formatter=format_webpage)
        f.write("\n{} Books\n--------------\n".format(rating_name))
        count += dump_at_rating(f, c, books, rating, formatter=format_book)
        f.write("\n{} Papers\n--------------\n".format(rating_name))
        count += dump_at_rating(f, c, papers, rating, formatter=format_paper)
        f.write("\n{} Repositories\n--------------\n".format(rating_name))
        count += dump_at_rating(f, c, repos, rating, formatter=format_repo)
    return count


def run(cmd):
    """Echo a shell command, then run it."""
    print(cmd)
    subprocess.run(cmd, shell=True)


def main(args):
    argparser = ArgumentParser()

    argparser.add_argument('base_dir', metavar='DIRNAME', type=str,
        help='Directory in which the topic subdirectories reside'
    )

    options = argparser.parse_args(args)

    webpages = {}
    repos = {}
    books = {}
    papers = {}

    c = Collector(options.base_dir, webpages, repos, books, papers)

    topic_sections = {}
    for section in sorted(c.topics["sections"], key=lambda s: s["title"]):
        # print(json.dumps(section, indent=4))
        topic = section["title"]
        used_in = section["properties"]["used-in"]
        if "bookmarks" not in used_in:
            continue
        c.load_commentary(topic)
        c.load_topic(topic)
        topic_sections[topic] = section

    c.check_entry_topics()

    assert c.topic_dirs == c.seen_dirs, "dirs that are not topics: {}".format(c.topic_dirs - c.seen_dirs)

    for topic in sorted(webpages.keys()):
        print("Writing out '{}'/README.md...".format(topic))
        write_readme_file(c.base_dir, topic, topic_sections[topic], {
            "webpages": webpages[topic],
            "repos": repos[topic],
            "books": books[topic],
            "papers": papers[topic],
        })

    rating_counts = {}
    for (rating, rating_name) in [
        ("TODO", "Unrated"),
        ("2", "Very Interesting"),
        ("3", "Top-rated"),
        ("classic", "Classic"),
    ]:
        rating_counts[rating] = dump_rating_page(c, rating, rating_name, webpages, repos, books, papers)

    totals1 = "Currently it consists of **{}** web pages, **{}** repositories, **{}** papers, and **{}** books in **{}** topics.".format(
        c.counts["webpages"], c.counts["repos"], c.counts["papers"], c.counts["books"], len(c.topic_dirs)
    )
    totals2 = """
Of these, [**{}** have the highest rating](by-rating/Top-rated.md),
[**{}** are considered classics](by-rating/Classic.md),
[**{}** are considered very interesting](by-rating/Very%20Interesting.md),
while [**{}** are yet to be rated](by-rating/Unrated.md).""".format(
        rating_counts["3"],
        rating_counts["classic"],
        rating_counts["2"],
        rating_counts["TODO"],
    )

    totals = totals1 + totals2
    print(totals1)

    topics = []
    for topic in sorted(webpages.keys()):
        topics.append("*   {}".format(format_see_also_link(topic, parent="by-topic")))
    topics = '\n'.join(topics)

    with open(os.path.join(c.base_dir, "README.md"), "r") as f:
        readme = f.read()

    pattern = r"\<\!-- TOTALS --\>.*?\<\!-- \/TOTALS --\>"
    repl = "<!-- TOTALS -->\n\n{}\n\n<!-- /TOTALS -->".format(totals)
    readme = re.sub(pattern, repl, readme, count=1, flags=re.DOTALL)

    pattern = r"\<\!-- TOPICS --\>.*?\<\!-- \/TOPICS --\>"
    repl = "<!-- TOPICS -->\n\n{}\n\n<!-- /TOPICS -->".format(topics)
    readme = re.sub(pattern, repl, readme, count=1, flags=re.DOTALL)

    with open(os.path.join(c.base_dir, "README.md"), "w") as f:
        f.write(readme)


if __name__ == "__main__":
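    # Typical invocation (path assumed from this repository's layout,
    # with the script in script/ and the topic tree at the root):
    #     python3 script/build_readmes.py .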
    import sys
    main(sys.argv[1:])