#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense
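
"""Fetch the details of items held on archive.org and print them as
Feedmark-style Markdown entries (a "### Title" heading followed by a
bullet list of properties).

Usage (a sketch; the script's installed name may differ):

    ./script.py url https://archive.org/details/<item> ...
    ./script.py mdlinks < links.md
"""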

import json
import re
import os
import sys
from time import sleep

from feedmark.checkers import Schema
from feedmark.formats.markdown import feedmark_markdownize
from feedmark.loader import read_document_from
import requests


# Note: WIP!


def clean_authors(data):
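    """Normalize an author value from archive.org metadata.

    `data` may be a list of author strings or a single
    semicolon-separated string; the result is one comma-separated
    string of cleaned-up names.
    """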
    if not isinstance(data, list):
        data = data.split(';')

    return ', '.join([clean_author(a) for a in data])


def clean_author(author):
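    """Convert a catalogue-style name to natural order.

    "Last, First" becomes "First Last"; a third comma-separated part
    (typically birth/death dates, as in "Knuth, Donald E., 1938-") is
    dropped.
    """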
    match = re.match(r'^(.+?),\s*(.+?),\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    match = re.match(r'^(.+?),\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    return author.strip()


SHORTWORDS = ('for', 'in', 'the', 'of', 'a', 'with', 'to', 'from', 'and', 'on', 'can', 'do', 'over')


def titlecase(s):
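    """Capitalize each word of s, leaving the short words listed in
    SHORTWORDS in lowercase (except as the first word).

    Note that str.capitalize() lowercases the rest of each word, so
    acronyms are flattened: "the ART of x" becomes "The Art of X".
    """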
    ws = []
    words = [w for w in s.split(' ') if w]
    for i, w in enumerate(words):
        if i == 0:
            ws.append(w.capitalize())
        elif w not in SHORTWORDS:
            ws.append(w.capitalize())
        else:
            ws.append(w)
    return (' '.join(ws)).strip()


def clean_title(title):
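    """Split a title on its last colon into (title, subtitle), running
    both parts through titlecase().  subtitle is None when the title
    contains no colon.
    """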
    segments = title.split(':')
    subtitle = None
    if len(segments) > 1:
        subtitle = titlecase(segments[-1])
        title = ': '.join(segments[:-1])

    title = titlecase(title)
    return (title, subtitle)


def clean_url(url):
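    """Normalize an archive.org details link to a bare https URL,
    dropping any query string or extra path components.  URLs that
    don't look like details links are returned unchanged.
    """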
    match = re.match(r'^.+?://(archive\.org/details/\w+)', url)
    if not match:
        # Not a recognizable details URL; leave it unchanged.
        return url
    return 'https://' + match.group(1)


def fetch_details(url):
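    """Fetch an archive.org item page, work out how the item can be
    accessed (read online, borrow, or borrow with print disabilities),
    extract its embedded js-ia-metadata JSON, and print a
    Feedmark-style Markdown entry describing it.
    """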
    archive_page = requests.get(url).text
    sleep(10)  # be nice to the free online service.

    inlibrary = "/details/inlibrary" in archive_page
    printdisabled = "/details/printdisabled" in archive_page
    if inlibrary:
        verb = "borrow"
    elif printdisabled:
        verb = "borrow-with-print-disabilities"
    else:
        verb = "online"

    match = re.search(
        r'<input\s+class="js-ia-metadata"\s+type="hidden"\s+value=\'(\{.*?\})\'/>',
        archive_page,
        re.DOTALL,
    )
    if not match:
        print(archive_page)  # dump the page to aid debugging
        raise ValueError("couldn't find js-ia-metadata!")

    data = json.loads(match.group(1))

    try:
        metadata = data["metadata"]
        authors = clean_authors(metadata.get("creator", metadata.get("associated-names", "Unknown")))
        date = metadata["date"]
        (title, subtitle) = clean_title(metadata["title"])
        url = clean_url(url)
    except Exception:
        print(json.dumps(data, indent=4, sort_keys=True))
        raise

    print("### {}".format(title))
    print("")
    if subtitle:
        print("* subtitle: {}".format(subtitle))
    print("* authors: {}".format(authors))
    print("* date: {}".format(date))
    print("* {} @ [archive.org]({})".format(verb, url))
    print("")


def fetch_mdlinks(f):
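    """Read lines of Markdown from the file-like object f, collect the
    targets of any links that point at archive.org, and fetch details
    for each one.
    """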
    urls = []
    for line in f:
        line = line.strip()
        match = re.match(r'^\s*\[(.+?)\]\((.+?)\)\s*(.*?)\s*$', line)
        if not match:
            continue
        # site and comments are parsed but not used yet (WIP).
        site = match.group(1)
        url = match.group(2)
        comments = match.group(3)
        if 'archive.org' not in url:
            continue
        urls.append(url)

    # print(urls)

    for url in urls:
        fetch_details(url)


def main(args):
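    """Dispatch on the first command-line argument:

    url URL...    fetch details for each URL given
    mdlinks       read Markdown links from standard input
    """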
    if not args:
        raise SystemExit("usage: (url <URL>... | mdlinks)")
    if args[0] == 'url':
        for url in args[1:]:
            fetch_details(url)
    elif args[0] == 'mdlinks':
        fetch_mdlinks(sys.stdin)
    else:
        raise SystemExit("unknown command: {}".format(args[0]))


if __name__ == "__main__":
    main(sys.argv[1:])