# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense
"""
Handles collection and organization of entries from the filesystem
"""
from collections import defaultdict
import os
from feedmark.loader import read_document_from
from formatters import make_anchor
class Collector:
    """Collect bookmark entries from ``<base_dir>/by-topic`` and index them by topic.

    Entries are Feedmark sections (dicts with at least ``"title"`` and
    ``"properties"`` keys).  They are stored per topic directory in the
    caller-supplied ``webpages``, ``repos``, ``books`` and ``papers`` mappings,
    and entries that list additional topics are cross-referenced in the
    ``secondary_*`` mappings.
    """

    # Presence of any one of these files marks a by-topic subdirectory as a
    # bookmark directory.
    _BOOKMARK_FILENAMES = ("Webpages.md", "Books.md", "Papers.md", "Repositories.md")

    def __init__(self, base_dir, webpages, repos, books, papers):
        """Set up the collector and discover the topic directories.

        ``webpages``, ``repos``, ``books`` and ``papers`` are mappings that
        ``load_topic`` fills in place, keyed by topic directory name.
        Reads ``TOPICS.md`` and lists ``by-topic`` under ``base_dir``.
        """
        self.base_dir = base_dir
        self.webpages = webpages
        self.repos = repos
        self.books = books
        self.papers = papers
        # Commentary sections keyed by entry title; filled by load_commentary().
        self.commentary = {}
        self.topics = read_document_from(os.path.join(base_dir, "TOPICS.md")).to_json_data()
        self.topic_dirs = {
            name
            for name in os.listdir(os.path.join(self.base_dir, "by-topic"))
            if self.is_bookmark_dir(name)
        }
        # Topics for which load_topic() has been called.
        self.seen_dirs = set()
        self.counts = {"webpages": 0, "repos": 0, "books": 0, "papers": 0}
        # secondary_<kind>[secondary_topic][main_topic] -> list of entries.
        self.secondary_webpages = defaultdict(lambda: defaultdict(list))
        self.secondary_repos = defaultdict(lambda: defaultdict(list))
        self.secondary_books = defaultdict(lambda: defaultdict(list))
        self.secondary_papers = defaultdict(lambda: defaultdict(list))

    def is_bookmark_dir(self, dir_name):
        """Return True if ``by-topic/<dir_name>`` contains at least one bookmark file."""
        topic_path = os.path.join(self.base_dir, "by-topic", dir_name)
        return any(
            os.path.isfile(os.path.join(topic_path, filename))
            for filename in self._BOOKMARK_FILENAMES
        )

    def load_feedmark_sections(self, topic, filename):
        """Return the sections of ``by-topic/<topic>/<filename>``, or ``[]`` if it is not a file."""
        path = os.path.join(self.base_dir, "by-topic", topic, filename)
        if not os.path.isfile(path):
            return []
        return read_document_from(path).to_json_data()["sections"]

    def load_topic(self, topic):
        """Load all four kinds of entries for ``topic`` and update ``self.counts``.

        Raises AssertionError if ``topic`` is not one of the discovered
        topic directories.
        """
        assert topic in self.topic_dirs, "topics that are not dirs: {}".format(topic)
        self.seen_dirs.add(topic)

        self.webpages[topic] = self.load_feedmark_sections(topic, "Webpages.md")
        # Sections flagged "is-heading" are not counted as webpages.
        self.counts["webpages"] += len(
            [w for w in self.webpages[topic] if not w["properties"].get("is-heading")]
        )

        self.repos[topic] = self.load_feedmark_sections(topic, "Repositories.md")
        self.counts["repos"] += len(self.repos[topic])

        self.books[topic] = self.load_feedmark_sections(topic, "Books.md")
        self.counts["books"] += len(self.books[topic])

        self.papers[topic] = self.load_feedmark_sections(topic, "Papers.md")
        self.counts["papers"] += len(self.papers[topic])

    def process_secondary_topics(self):
        """Process entries that belong to multiple topics."""
        kinds = (
            (self.webpages, self.secondary_webpages),
            (self.repos, self.secondary_repos),
            (self.books, self.secondary_books),
            (self.papers, self.secondary_papers),
        )
        # Sorted so that the insertion order of the nested secondary dicts is
        # the same on every run; iterating the set directly depends on string
        # hash randomization.
        for main_topic in sorted(self.topic_dirs):
            for primary, secondary in kinds:
                for entry in primary.get(main_topic, []):
                    self._process_secondary_entry(entry, main_topic, secondary)

    def _process_secondary_entry(self, entry, main_topic, secondary_dict):
        """Helper method to process an entry for secondary topics.

        Every topic named in the entry's comma-separated ``"topics"`` property,
        other than ``main_topic``, gets the entry appended under
        ``secondary_dict[topic][main_topic]``.
        """
        properties = entry["properties"]
        if "topics" not in properties:
            return
        for topic in (t.strip() for t in properties["topics"].split(',')):
            if topic != main_topic:  # Don't add to secondary if it's the main topic
                secondary_dict[topic][main_topic].append(entry)

    def check_entry_topics(self):
        """Validate (and annotate with commentary) every loaded entry of every topic."""
        # Sorted so a failing assertion is reported for the same entry on every run.
        for topic in sorted(self.topic_dirs):
            for collection in (self.webpages, self.repos, self.books, self.papers):
                for entry in collection.get(topic, []):
                    self._check_entry(topic, entry)

    def _check_entry(self, topic, entry):
        """Check the topics of ``entry`` (found under ``topic``) and attach commentary.

        Raises AssertionError if the entry's topics do not include ``topic`` or
        name something that is not a topic directory.  If commentary was
        loaded for the entry's title, its rating is copied onto the entry
        and, for a non-trivial commentary body, a link to it is added.
        """
        title = entry["title"]
        assert isinstance(topic, str), "{}: {}".format(title, topic)
        # An entry with no "topics" property belongs only to the topic it was found in.
        topics = entry["properties"].get("topics", topic)
        assert isinstance(topics, str), "{}: {}".format(title, topics)
        topics = [t.strip() for t in topics.split(',')]
        assert topic in topics, "{}: {} not in {}".format(title, topic, topics)
        for x in topics:
            assert x in self.topic_dirs, "{}: {} not in {}".format(title, x, self.topic_dirs)

        if title not in self.commentary:
            return
        commentary = self.commentary[title]
        entry["properties"]["rating"] = commentary["properties"]["rating"]
        commentary_text = commentary["body"].strip()
        # NOTE(review): the > 4 threshold presumably skips placeholder-only
        # bodies -- confirm against the commentary files.
        if len(commentary_text) > 4:
            entry["properties"]["commentary_link"] = (
                "commentary/Chris%20Pressey.md#{}".format(make_anchor(title))
            )

    def load_commentary(self, topic):
        """Load ``commentary/Chris Pressey.md`` for ``topic`` into ``self.commentary``.

        Does nothing if the file does not exist.  Raises AssertionError if a
        section title has already been loaded, since commentary is keyed by
        title across all topics.
        """
        path = os.path.join(self.base_dir, "by-topic", topic, "commentary", "Chris Pressey.md")
        sections = []
        if os.path.isfile(path):
            sections = read_document_from(path).to_json_data()["sections"]
        for item in sections:
            assert item["title"] not in self.commentary, (
                "duplicate commentary title: {} (in {})".format(item["title"], path)
            )
            self.commentary[item["title"]] = item

    def get_entries_for_topic(self, topic):
        """Get both primary and secondary entries for a topic.

        Returns a ``(primary, secondary)`` pair of dicts keyed by
        ``"webpages"``, ``"repos"``, ``"books"`` and ``"papers"``.  Primary
        values are lists of entries; secondary values map each main topic to
        a list of entries.  Missing topics yield empty containers.
        """
        primary = {
            "webpages": self.webpages.get(topic, []),
            "repos": self.repos.get(topic, []),
            "books": self.books.get(topic, []),
            "papers": self.papers.get(topic, []),
        }
        # .get() is used so that querying a topic does not create an empty
        # entry in the defaultdicts.
        secondary = {
            "webpages": self.secondary_webpages.get(topic, {}),
            "repos": self.secondary_repos.get(topic, {}),
            "books": self.secondary_books.get(topic, {}),
            "papers": self.secondary_papers.get(topic, {}),
        }
        return primary, secondary