
#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense
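
"""Generate Markdown entries for books hosted on archive.org.

For each archive.org "details" URL given, fetch the page, extract the
JSON embedded in its hidden js-ia-metadata element, and print a
Markdown section with the book's title, authors, date, and a link
annotated with how the book can be read (borrow/online).

Usage:
    feedmarkize_ia.py url URL [URL ...]
    feedmarkize_ia.py mdlinks    (reads Markdown links from stdin)
    feedmarkize_ia.py txtlinks   (reads one URL per line from stdin)
"""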


import json
import re
import sys
from time import sleep

import requests

# These feedmark imports are unused so far; presumably reserved for the
# planned feedmark integration (see the WIP note below).
from feedmark.checkers import Schema
from feedmark.formats.markdown import feedmark_markdownize
from feedmark.loader import read_document_from


# Note: WIP!


def clean_authors(data):
    """Normalize an IA creator field (a list, or a ';'-separated string)
    into a single comma-separated string of "First Last" names."""
    if not isinstance(data, list):
        data = data.split(';')

    return ', '.join([clean_author(a) for a in data])
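
# Illustrative examples, assuming the "Last, First[, dates]" style that
# archive.org creator fields typically use:
#   clean_authors("Knuth, Donald Ervin, 1938-")  -> "Donald Ervin Knuth"
#   clean_authors("Pressey, Chris; Smith, Jane") -> "Chris Pressey, Jane Smith"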


def clean_author(author):
    """Rewrite a single "Last, First[, dates]" name as "First Last"."""
    author = author.strip()
    # "Last, First, dates" (e.g. birth/death years): drop the dates.
    match = re.match(r'^(.+?),\s*(.+?),\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    # "Last, First": swap the two parts.
    match = re.match(r'^(.+?),\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    return author


SHORTWORDS = (
    'for', 'in', 'the', 'of', 'a', 'an', 'with', 'to', 'from', 'and',
    'on', 'can', 'do', 'over', 'has', 'is', 'at', 'this', 'your', 'my', 'it',
)


def titlecase(s):
    """Title-case a string, lower-casing short connecting words except
    when they begin the string.

    Note: capitalize() also lower-cases the rest of each word, so
    acronyms are not preserved."""
    ws = []
    words = [w for w in s.split(' ') if w]
    for i, w in enumerate(words):
        if i == 0 or w.lower() not in SHORTWORDS:
            ws.append(w.capitalize())
        else:
            ws.append(w.lower())
    return ' '.join(ws)
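
# For example:
#   titlecase("the art of computer programming") -> "The Art of Computer Programming"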


def clean_title(title):
    """Split a title on its last ':' into (title, subtitle), both
    title-cased; subtitle is None if there is no colon."""
    segments = title.split(':')
    subtitle = None
    if len(segments) > 1:
        subtitle = titlecase(segments[-1])
        title = ': '.join(segments[:-1])

    title = titlecase(title)
    return (title, subtitle)
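
# For example:
#   clean_title("programming languages: history and fundamentals")
#       -> ("Programming Languages", "History and Fundamentals")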


def clean_url(url):
    """Reduce an archive.org URL to a canonical https://archive.org/details/... form."""
    match = re.match(r'^.+?://(archive\.org/details/[0-9a-zA-Z\-_]+)', url)
    if not match:
        raise ValueError("not an archive.org details URL: {}".format(url))
    return 'https://' + match.group(1)
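
# For example (hypothetical identifier):
#   clean_url("http://archive.org/details/artofcomputerpro0000knut?view=theater")
#       -> "https://archive.org/details/artofcomputerpro0000knut"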


def fetch_details(url):
    """Fetch an archive.org details page and print a Markdown section
    describing the book."""
    response = requests.get(url)
    response.raise_for_status()
    archive_page = response.text
    sleep(10)  # be nice to the free online service.

    # How the book can be read determines the link text.
    inlibrary = "/details/inlibrary" in archive_page
    printdisabled = "/details/printdisabled" in archive_page
    if inlibrary:
        verb = "borrow"
    elif printdisabled:
        verb = "borrow-with-print-disabilities"
    else:
        verb = "online"

    # The page embeds its metadata as JSON in a hidden <input> element.
    match = re.search(
        r"<input\s+class=\"js-ia-metadata\"\s+type=\"hidden\"\s+value='(\{.*?\})'/>",
        archive_page,
        re.DOTALL,
    )
    if not match:
        print(archive_page, file=sys.stderr)
        raise ValueError("couldn't find js-ia-metadata!")

    data = json.loads(match.group(1))

    try:
        metadata = data["metadata"]
        authors = clean_authors(metadata.get("creator", metadata.get("associated-names", "Unknown")))
        date = metadata.get("date", "Unknown")
        (title, subtitle) = clean_title(metadata["title"])
        url = clean_url(url)
    except Exception:
        print(json.dumps(data, indent=4, sort_keys=True), file=sys.stderr)
        raise

    print("### {}".format(title))
    print("")
    if subtitle:
        print("*   subtitle: {}".format(subtitle))
    print("*   authors: {}".format(authors))
    print("*   date: {}".format(date))
    print("*   {} @ [archive.org]({})".format(verb, url))
    print("")


def fetch_mdlinks(f):
    """Read Markdown from f, collect every archive.org URL that appears
    in a [text](url) link, and fetch details for each."""
    urls = []
    for line in f:
        line = line.strip()
        match = re.match(r'^.*?\[(.+?)\]\((.+?)\)\s*(.*?)\s*$', line)
        if not match:
            continue
        url = match.group(2)  # group 1 is the link text, group 3 any trailing comment
        if 'archive.org' not in url:
            continue
        urls.append(url)

    for url in urls:
        fetch_details(url)
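
# For example, a (hypothetical) Markdown line like
#   *   [Some Book](https://archive.org/details/somebook0000auth) (recommended)
# yields the URL https://archive.org/details/somebook0000auth.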


def fetch_txtlinks(f):
    """Read URLs from f, one per line, and fetch details for each
    archive.org URL found."""
    urls = []
    for line in f:
        url = line.strip()
        if not url:
            continue
        if 'archive.org' not in url:
            continue
        urls.append(url)

    for url in urls:
        fetch_details(url)


def main(args):
    if not args:
        raise SystemExit("usage: feedmarkize_ia.py (url URL... | mdlinks | txtlinks)")
    if args[0] == 'url':
        for url in args[1:]:
            fetch_details(url)
    elif args[0] == 'mdlinks':
        fetch_mdlinks(sys.stdin)
    elif args[0] == 'txtlinks':
        fetch_txtlinks(sys.stdin)
    else:
        raise SystemExit("unknown mode: {}".format(args[0]))


if __name__ == "__main__":
    main(sys.argv[1:])