git @ Cat's Eye Technologies: The-Glosscubator / 089fbdd
Commit "Checkpoint improving this script." by Chris Pressey, a month ago.
2 changed files with 152 additions and 86 deletions.
script/feedmarkize_archive_entry.py (deleted: +0, -86)
#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense


import json
import re
import os
from time import sleep

from feedmark.checkers import Schema
from feedmark.formats.markdown import feedmark_markdownize
from feedmark.loader import read_document_from
import requests


# Note: WIP!


def get_archive_url(link_markdown):
    match = re.match(r'^\s*\[(.+?)\]\((.+?)\)\s*(.*?)\s*$', link_markdown)
    site = match.group(1)
    url = match.group(2)
    comments = match.group(3)

    if site != 'archive.org':
        return None

    return url


def fetch_details(url):
    inlibrary = False
    printdisabled = False
    # print("fetching", url, "...")
    sleep(5)
    # print("fetched.\n")

    archive_page = requests.get(url).text

    inlibrary = "/details/inlibrary" in archive_page
    printdisabled = "/details/printdisabled" in archive_page
    if inlibrary:
        verb = "borrow"
    else:
        verb = "borrow-with-print-disabilities"

    match = re.match(
        r'^.*?\<input\s+class\=\"js-ia-metadata\"\s+type\=\"hidden\"\s+value\=\'(\{.*?\})\'\/\>',
        archive_page,
        re.DOTALL | re.MULTILINE
    )
    if not match:
        print(archive_page)
        raise ValueError("couldn't find js-ia-metadata!")

    data = json.loads(match.group(1))

    try:
        metadata = data["metadata"]
        creator = metadata["creator"]
        date = metadata["date"]
        title = metadata["title"]
    except:
        print(json.dumps(data, indent=4, sort_keys=True))
        raise

    print("### {}".format(title))
    print("")
    print("* authors: {}".format(creator))
    print("* date: {}".format(date))
    print("* {} @ [archive.org]({})".format(verb, url))
    print("")


def main(args):
    for url in args:
        fetch_details(url)


if __name__ == "__main__":
    import sys
    main(sys.argv[1:])
(new file: +152, -0)

#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense


import json
import re
import sys
from time import sleep

# These feedmark imports are not used yet; see the WIP note below.
from feedmark.checkers import Schema
from feedmark.formats.markdown import feedmark_markdownize
from feedmark.loader import read_document_from
import requests


# Note: WIP!

def clean_authors(data):
    # The metadata field may be a single string of semicolon-separated
    # names, or already a list of name strings.
    if not isinstance(data, list):
        data = data.split(';')

    return ', '.join([clean_author(a) for a in data])

def clean_author(author):
    # Convert "Last, First" (optionally with a trailing segment, such as
    # birth/death dates) into "First Last".
    author = author.strip()
    match = re.match(r'^(.+?)\,\s*(.+?)\,\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    match = re.match(r'^(.+?)\,\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    return author

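A quick sketch of what these helpers yield; the names and dates below are hypothetical examples, not taken from the script:

clean_author("Pressey, Chris")                    # -> "Chris Pressey"
clean_author("Pressey, Chris, 1970-")             # -> "Chris Pressey"
clean_authors("Knuth, Donald; Dijkstra, Edsger")  # -> "Donald Knuth, Edsger Dijkstra"
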
# Short connective words that stay lowercase when they appear mid-title.
SHORTWORDS = ('for', 'in', 'the', 'of', 'a', 'with', 'to', 'from', 'and', 'on', 'can', 'do', 'over')

def titlecase(s):
    # Capitalize every word except short connectives, but always
    # capitalize the first word.
    ws = []
    words = [w for w in s.split(' ') if w]
    for i, w in enumerate(words):
        if i == 0 or w not in SHORTWORDS:
            ws.append(w.capitalize())
        else:
            ws.append(w)
    return (' '.join(ws)).strip()

def clean_title(title):
    # Treat the last colon-separated segment, if any, as a subtitle.
    segments = title.split(':')
    subtitle = None
    if len(segments) > 1:
        subtitle = titlecase(segments[-1])
        title = ': '.join(segments[:-1])

    title = titlecase(title)
    return (title, subtitle)

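For example (a hypothetical title, chosen only to exercise the subtitle split and the SHORTWORDS handling):

clean_title("the art of computer programming: fundamental algorithms")
# -> ("The Art of Computer Programming", "Fundamental Algorithms")
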
def clean_url(url):
    # Normalize to a bare https://archive.org/details/IDENTIFIER URL;
    # identifiers may contain dots and hyphens as well as word characters.
    match = re.match(r'^.+?\:\/\/(archive\.org\/details\/[\w.-]+)', url)
    if not match:
        return url
    return 'https://' + match.group(1)

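A sketch of the intended normalization (the identifier is a made-up example):

clean_url("http://archive.org/details/some-book-1983/page/n5")
# -> "https://archive.org/details/some-book-1983"
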
def fetch_details(url):
    archive_page = requests.get(url).text
    sleep(10)  # be nice to the free online service.

    # Availability is inferred from collection links embedded in the page.
    inlibrary = "/details/inlibrary" in archive_page
    printdisabled = "/details/printdisabled" in archive_page
    if inlibrary:
        verb = "borrow"
    elif printdisabled:
        verb = "borrow-with-print-disabilities"
    else:
        verb = "online"

    # The item's metadata is embedded in the page as JSON, in a hidden
    # <input class="js-ia-metadata"> element.
    match = re.match(
        r'^.*?\<input\s+class\=\"js-ia-metadata\"\s+type\=\"hidden\"\s+value\=\'(\{.*?\})\'\/\>',
        archive_page,
        re.DOTALL | re.MULTILINE
    )
    if not match:
        print(archive_page)
        raise ValueError("couldn't find js-ia-metadata!")

    data = json.loads(match.group(1))

    try:
        metadata = data["metadata"]
        authors = clean_authors(metadata.get("creator", metadata.get("associated-names", "Unknown")))
        date = metadata["date"]
        (title, subtitle) = clean_title(metadata["title"])
        url = clean_url(url)
    except Exception:
        # Dump whatever metadata we did find, to aid debugging, then re-raise.
        print(json.dumps(data, indent=4, sort_keys=True))
        raise

    print("### {}".format(title))
    print("")
    if subtitle:
        print("* subtitle: {}".format(subtitle))
    print("* authors: {}".format(authors))
    print("* date: {}".format(date))
    print("* {} @ [archive.org]({})".format(verb, url))
    print("")

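The printed entry then has this Feedmark-style shape (every field value below is a placeholder, not real output):

### Some Title

* subtitle: Some Subtitle
* authors: Some Author
* date: 1984
* borrow @ [archive.org](https://archive.org/details/IDENTIFIER)
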
def fetch_mdlinks(f):
    # Scan markdown link lines like "[archive.org](URL)" from f and
    # collect every URL that points at archive.org.
    urls = []
    for line in f:
        line = line.strip()
        match = re.match(r'^\s*\[(.+?)\]\((.+?)\)\s*(.*?)\s*$', line)
        if not match:
            continue
        url = match.group(2)
        if 'archive.org' not in url:
            continue
        urls.append(url)

    for url in urls:
        fetch_details(url)

def main(args):
    if args and args[0] == 'url':
        for url in args[1:]:
            fetch_details(url)
    elif args and args[0] == 'mdlinks':
        fetch_mdlinks(sys.stdin)
    else:
        raise SystemExit("usage: url URL [URL ...] | mdlinks < links.md")


if __name__ == "__main__":
    main(sys.argv[1:])
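
Invocation, as implied by main(); the script path and the identifier here are placeholders:

python3 script/feedmarkize_archive_entry.py url https://archive.org/details/IDENTIFIER
python3 script/feedmarkize_archive_entry.py mdlinks < Bibliography.md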