# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense

"""
Handles collection and organization of entries from the filesystem
"""
from collections import defaultdict
import os

from feedmark.loader import read_document_from

from formatters import make_anchor


class Collector:
    def __init__(self, base_dir, webpages, repos, books, papers):
        self.base_dir = base_dir
        self.webpages = webpages
        self.repos = repos
        self.books = books
        self.papers = papers
        self.commentary = {}
        self.topics = read_document_from(os.path.join(base_dir, "TOPICS.md")).to_json_data()
        self.topic_dirs = set([f for f in os.listdir(os.path.join(self.base_dir, "by-topic")) if self.is_bookmark_dir(f)])
        self.seen_dirs = set()
        self.counts = {"webpages": 0, "repos": 0, "books": 0, "papers": 0}
        self.secondary_webpages = defaultdict(lambda: defaultdict(list))
        self.secondary_repos = defaultdict(lambda: defaultdict(list))
        self.secondary_books = defaultdict(lambda: defaultdict(list))
        self.secondary_papers = defaultdict(lambda: defaultdict(list))

    def is_bookmark_dir(self, dir_name):
        """Return True if this by-topic subdirectory contains at least one bookmark file."""
        return any([
            os.path.isfile(os.path.join(self.base_dir, "by-topic", dir_name, filename))
            for filename in ["Webpages.md", "Books.md", "Papers.md", "Repositories.md"]
        ])

    def load_feedmark_sections(self, topic, filename):
        """Read a Feedmark document for the given topic and return its sections, or [] if absent."""
        path = os.path.join(self.base_dir, "by-topic", topic, filename)
        if os.path.isfile(path):
            return read_document_from(path).to_json_data()["sections"]
        else:
            return []
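
    # A sketch of the section shape this module relies on (inferred only from
    # the keys accessed below; the full Feedmark JSON data may carry more fields):
    #
    #   {
    #       "title": "Some Entry",
    #       "properties": {"topics": "Topic A, Topic B", "is-heading": "..."},
    #       "body": "free-form notes or commentary"
    #   }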

    def load_topic(self, topic):
        """Load all bookmark files for a topic and update the running counts."""
        assert topic in self.topic_dirs, "topic is not a directory: {}".format(topic)
        self.seen_dirs.add(topic)
        self.webpages[topic] = self.load_feedmark_sections(topic, "Webpages.md")
        self.counts["webpages"] += len([w for w in self.webpages[topic] if not w["properties"].get("is-heading")])
        self.repos[topic] = self.load_feedmark_sections(topic, "Repositories.md")
        self.counts["repos"] += len(self.repos[topic])
        self.books[topic] = self.load_feedmark_sections(topic, "Books.md")
        self.counts["books"] += len(self.books[topic])
        self.papers[topic] = self.load_feedmark_sections(topic, "Papers.md")
        self.counts["papers"] += len(self.papers[topic])

    def process_secondary_topics(self):
        """Process entries that belong to multiple topics."""
        for main_topic in self.topic_dirs:
            # Process each type of entry
            for entry in self.webpages.get(main_topic, []):
                self._process_secondary_entry(entry, main_topic, self.secondary_webpages)
            for entry in self.repos.get(main_topic, []):
                self._process_secondary_entry(entry, main_topic, self.secondary_repos)
            for entry in self.books.get(main_topic, []):
                self._process_secondary_entry(entry, main_topic, self.secondary_books)
            for entry in self.papers.get(main_topic, []):
                self._process_secondary_entry(entry, main_topic, self.secondary_papers)

    def _process_secondary_entry(self, entry, main_topic, secondary_dict):
        """Helper method to process an entry for secondary topics."""
        if "topics" in entry["properties"]:
            topics = [t.strip() for t in entry["properties"]["topics"].split(',')]
            for topic in topics:
                if topic != main_topic:  # Don't add to secondary if it's the main topic
                    secondary_dict[topic][main_topic].append(entry)
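
    # Illustrative example (topic names are placeholders): an entry filed under
    # main topic "Topic A" whose properties include topics: "Topic A, Topic B"
    # is appended to secondary_dict["Topic B"]["Topic A"], but never to
    # secondary_dict["Topic A"][...], since that is its primary location.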

    def check_entry_topics(self):
        for topic in self.topic_dirs:
            for entry in self.webpages.get(topic, []):
                self._check_entry(topic, entry)
            for entry in self.repos.get(topic, []):
                self._check_entry(topic, entry)
            for entry in self.books.get(topic, []):
                self._check_entry(topic, entry)
            for entry in self.papers.get(topic, []):
                self._check_entry(topic, entry)

    def _check_entry(self, topic, entry):
        """Validate an entry's topics property and attach rating and commentary data when available."""
        title = entry["title"]
        assert isinstance(topic, str), "{}: {}".format(title, topic)
        topics = entry["properties"].get("topics", topic)
        assert isinstance(topics, str), "{}: {}".format(title, topics)
        topics = [t.strip() for t in topics.split(',')]
        assert topic in topics, "{}: {} not in {}".format(title, topic, topics)
        for x in topics:
            assert x in self.topic_dirs, "{}: {} not in {}".format(title, x, self.topic_dirs)

        if title in self.commentary:
            commentary = self.commentary[title]
            entry["properties"]["rating"] = commentary["properties"]["rating"]
            commentary_text = commentary["body"].strip()
            # Only link to the commentary file if there is substantive commentary text.
            if len(commentary_text) > 4:
                entry["properties"]["commentary_link"] = (
                    "commentary/Chris%20Pressey.md#{}".format(make_anchor(title))
                )

    def load_commentary(self, topic):
        """Load the per-topic commentary document into self.commentary, keyed by entry title."""
        path = os.path.join(self.base_dir, "by-topic", topic, "commentary", "Chris Pressey.md")
        sections = []
        if os.path.isfile(path):
            sections = read_document_from(path).to_json_data()["sections"]
        for item in sections:
            assert item["title"] not in self.commentary, "duplicate commentary title: {}".format(item["title"])
            self.commentary[item["title"]] = item

    def get_entries_for_topic(self, topic):
        """Get both primary and secondary entries for a topic."""
        primary = {
            "webpages": self.webpages.get(topic, []),
            "repos": self.repos.get(topic, []),
            "books": self.books.get(topic, []),
            "papers": self.papers.get(topic, [])
        }

        secondary = {
            "webpages": self.secondary_webpages.get(topic, {}),
            "repos": self.secondary_repos.get(topic, {}),
            "books": self.secondary_books.get(topic, {}),
            "papers": self.secondary_papers.get(topic, {})
        }

        return primary, secondary
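

# A minimal usage sketch, not part of the original module: one plausible way a
# build_readmes driver could exercise Collector.  The base_dir "." and the
# topic name "Some Topic" are placeholders, and the call order (commentary,
# then topics, then checks, then secondary processing) is an assumption based
# on which attributes each method reads.
if __name__ == "__main__":
    webpages, repos, books, papers = {}, {}, {}, {}
    collector = Collector(".", webpages, repos, books, papers)
    for topic in sorted(collector.topic_dirs):
        collector.load_commentary(topic)
    for topic in sorted(collector.topic_dirs):
        collector.load_topic(topic)
    collector.check_entry_topics()
    collector.process_secondary_topics()
    primary, secondary = collector.get_entries_for_topic("Some Topic")
    print(collector.counts)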