#!/usr/bin/env python3
# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense
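"""Fetch bibliographic details for items on archive.org and print them as
Markdown sections (feedmark-style), one section per item.

Usage (where SCRIPT is whatever name this file is saved under):

    python3 SCRIPT url https://archive.org/details/ITEM [...]
    python3 SCRIPT mdlinks < links.md
    python3 SCRIPT txtlinks < urls.txt
"""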
import json
import re
import os
import sys
from time import sleep
from feedmark.checkers import Schema
from feedmark.formats.markdown import feedmark_markdownize
from feedmark.loader import read_document_from
import requests
# Note: WIP!
def clean_authors(data):
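    """Normalize the creator field to a comma-separated string of 'First Last' names.

    The metadata may give creators either as a list or as a single
    semicolon-separated string.
    """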
if not isinstance(data, list):
data = data.split(';')
return ', '.join([clean_author(a) for a in data])
def clean_author(author):
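    """Convert a 'Last, First' or 'Last, First, dates' style name to 'First Last'."""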
match = re.match(r'^(.+?)\,\s*(.+?)\,\s*(.+?)\s*$', author)
if match:
return '{} {}'.format(match.group(2), match.group(1))
match = re.match(r'^(.+?)\,\s*(.+?)\s*$', author)
if match:
return '{} {}'.format(match.group(2), match.group(1))
return author.strip()
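# Short connecting words that titlecase() leaves in lowercase (unless they
# start the title).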
SHORTWORDS = (
'for', 'in', 'the', 'of', 'a', 'an', 'with', 'to', 'from', 'and',
'on', 'can', 'do', 'over', 'has', 'is', 'at', 'this', 'your', 'my', 'it',
)
def titlecase(s):
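    """Title-case a string, leaving short connecting words in lowercase.

    Note: str.capitalize() lowercases the rest of each word, so acronyms
    are not preserved.
    """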
ws = []
words = [w for w in s.split(' ') if w]
    for i, w in enumerate(words):
        if i == 0 or w not in SHORTWORDS:
            ws.append(w.capitalize())
        else:
            ws.append(w)
return (' '.join(ws)).strip()
def clean_title(title):
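    """Split a title on its last colon into (title, subtitle) and title-case both.

    Returns (title, None) when there is no colon.
    """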
segments = title.split(':')
    subtitle = None
    if len(segments) > 1:
        subtitle = titlecase(segments[-1])
        title = ': '.join(segments[:-1])
title = titlecase(title)
return (title, subtitle)
def clean_url(url):
    """Reduce an archive.org URL to its canonical https://archive.org/details/<id> form."""
    match = re.match(r'^.+?://(archive\.org/details/[0-9a-zA-Z_-]+)', url)
    if not match:
        raise ValueError("not an archive.org details URL: {}".format(url))
    return 'https://' + match.group(1)
def fetch_details(url):
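    """Fetch an archive.org details page, extract its embedded metadata, and
    print a Markdown section describing the item."""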
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    archive_page = response.text
    sleep(10)  # be nice to the free online service.
    inlibrary = "/details/inlibrary" in archive_page
    printdisabled = "/details/printdisabled" in archive_page
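    # Pick a verb describing how the item can be accessed.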
if inlibrary:
verb = "borrow"
elif printdisabled:
verb = "borrow-with-print-disabilities"
else:
verb = "online"
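    # The details page embeds the item's full metadata as JSON in a hidden
    # <input class="js-ia-metadata"> element; extract and parse it.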
match = re.match(
r'^.*?\<input\s+class\=\"js-ia-metadata\"\s+type\=\"hidden\"\s+value\=\'(\{.*?\})\'\/\>',
archive_page,
re.DOTALL | re.MULTILINE
)
    if not match:
        print(archive_page, file=sys.stderr)
        raise ValueError("couldn't find js-ia-metadata!")
data = json.loads(match.group(1))
try:
metadata = data["metadata"]
authors = clean_authors(metadata.get("creator", metadata.get("associated-names", "Unknown")))
date = metadata.get("date", "Unknown")
(title, subtitle) = clean_title(metadata["title"])
url = clean_url(url)
    except Exception:
        # Dump the metadata that failed to parse, then re-raise.
        print(json.dumps(data, indent=4, sort_keys=True), file=sys.stderr)
        raise
print("### {}".format(title))
print("")
if subtitle:
print("* subtitle: {}".format(subtitle))
print("* authors: {}".format(authors))
print("* date: {}".format(date))
print("* {} @ [archive.org]({})".format(verb, url))
print("")
def fetch_mdlinks(f):
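    """Read Markdown from a stream, collect every archive.org link written as
    [text](url), and fetch details for each."""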
urls = []
for line in f:
line = line.strip()
match = re.match(r'^.*?\[(.+?)\]\((.+?)\)\s*(.*?)\s*$', line)
if not match:
continue
site = match.group(1)
url = match.group(2)
comments = match.group(3)
if 'archive.org' not in url:
continue
urls.append(url)
# print(urls)
for url in urls:
fetch_details(url)
def fetch_txtlinks(f):
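    """Read one URL per line from a stream and fetch details for each
    archive.org URL found."""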
urls = []
for line in f:
line = line.strip()
        if not line:
            continue
        url = line
if 'archive.org' not in url:
continue
urls.append(url)
# print(urls)
for url in urls:
fetch_details(url)
def main(args):
    """Dispatch on the first command-line argument: 'url', 'mdlinks', or 'txtlinks'."""
    if not args:
        raise SystemExit("usage: {} (url URL ... | mdlinks | txtlinks)".format(sys.argv[0]))
    if args[0] == 'url':
        for url in args[1:]:
            fetch_details(url)
    elif args[0] == 'mdlinks':
        fetch_mdlinks(sys.stdin)
    elif args[0] == 'txtlinks':
        fetch_txtlinks(sys.stdin)
    else:
        raise SystemExit("unknown command: {}".format(args[0]))
if __name__ == "__main__":
    main(sys.argv[1:])