git @ Cat's Eye Technologies: The-Glosscubator / 089fbdd
Commit "Checkpoint improving this script." by Chris Pressey, a month ago.
2 changed files with 152 additions and 86 deletions.
script/feedmarkize_archive_entry.py (deleted: +0, -86)
#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense


import json
import re
import os
from time import sleep

from feedmark.checkers import Schema
from feedmark.formats.markdown import feedmark_markdownize
from feedmark.loader import read_document_from
import requests


# Note: WIP!


def get_archive_url(link_markdown):
    match = re.match(r'^\s*\[(.+?)\]\((.+?)\)\s*(.*?)\s*$', link_markdown)
    site = match.group(1)
    url = match.group(2)
    comments = match.group(3)

    if site != 'archive.org':
        return None

    return url


def fetch_details(url):
    inlibrary = False
    printdisabled = False
    # print("fetching", url, "...")
    sleep(5)
    # print("fetched.\n")

    archive_page = requests.get(url).text

    inlibrary = "/details/inlibrary" in archive_page
    printdisabled = "/details/printdisabled" in archive_page
    if inlibrary:
        verb = "borrow"
    else:
        verb = "borrow-with-print-disabilities"

    match = re.match(
        r'^.*?\<input\s+class\=\"js-ia-metadata\"\s+type\=\"hidden\"\s+value\=\'(\{.*?\})\'\/\>',
        archive_page,
        re.DOTALL | re.MULTILINE
    )
    if not match:
        print(archive_page)
        raise ValueError("couldn't find js-ia-metadata!")

    data = json.loads(match.group(1))

    try:
        metadata = data["metadata"]
        creator = metadata["creator"]
        date = metadata["date"]
        title = metadata["title"]
    except:
        print(json.dumps(data, indent=4, sort_keys=True))
        raise

    print("### {}".format(title))
    print("")
    print("* authors: {}".format(creator))
    print("* date: {}".format(date))
    print("* {} @ [archive.org]({})".format(verb, url))
    print("")


def main(args):
    for url in args:
        fetch_details(url)


if __name__ == "__main__":
    import sys
    main(sys.argv[1:])
(new file: +152, -0)

#!/usr/bin/env python3

# SPDX-FileCopyrightText: Chris Pressey, the original author of this work, has dedicated it to the public domain.
# For more information, please refer to <https://unlicense.org/>
# SPDX-License-Identifier: Unlicense


import json
import re
import sys
from time import sleep

# These feedmark imports are not used yet; see the WIP note below.
from feedmark.checkers import Schema
from feedmark.formats.markdown import feedmark_markdownize
from feedmark.loader import read_document_from
import requests


# Note: WIP!

def clean_authors(data):
    # The metadata field may be a single string of semicolon-separated
    # names, or already a list of name strings.
    if not isinstance(data, list):
        data = data.split(';')

    return ', '.join([clean_author(a) for a in data])

def clean_author(author):
    # Convert "Last, First" (optionally with a trailing segment, such as
    # birth/death dates) into "First Last".
    author = author.strip()
    match = re.match(r'^(.+?)\,\s*(.+?)\,\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    match = re.match(r'^(.+?)\,\s*(.+?)\s*$', author)
    if match:
        return '{} {}'.format(match.group(2), match.group(1))
    return author

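A quick sketch of what these helpers yield; the names and dates below are hypothetical examples, not taken from the script:

clean_author("Pressey, Chris")                    # -> "Chris Pressey"
clean_author("Pressey, Chris, 1970-")             # -> "Chris Pressey"
clean_authors("Knuth, Donald; Dijkstra, Edsger")  # -> "Donald Knuth, Edsger Dijkstra"
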
# Short connective words that stay lowercase when they appear mid-title.
SHORTWORDS = ('for', 'in', 'the', 'of', 'a', 'with', 'to', 'from', 'and', 'on', 'can', 'do', 'over')

def titlecase(s):
    # Capitalize every word except short connectives, but always
    # capitalize the first word.
    ws = []
    words = [w for w in s.split(' ') if w]
    for i, w in enumerate(words):
        if i == 0 or w not in SHORTWORDS:
            ws.append(w.capitalize())
        else:
            ws.append(w)
    return (' '.join(ws)).strip()

def clean_title(title):
    # Treat the last colon-separated segment, if any, as a subtitle.
    segments = title.split(':')
    subtitle = None
    if len(segments) > 1:
        subtitle = titlecase(segments[-1])
        title = ': '.join(segments[:-1])

    title = titlecase(title)
    return (title, subtitle)

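For example (a hypothetical title, chosen only to exercise the subtitle split and the SHORTWORDS handling):

clean_title("the art of computer programming: fundamental algorithms")
# -> ("The Art of Computer Programming", "Fundamental Algorithms")
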
def clean_url(url):
    # Normalize to a bare https://archive.org/details/IDENTIFIER URL;
    # identifiers may contain dots and hyphens as well as word characters.
    match = re.match(r'^.+?\:\/\/(archive\.org\/details\/[\w.-]+)', url)
    if not match:
        return url
    return 'https://' + match.group(1)

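A sketch of the intended normalization (the identifier is a made-up example):

clean_url("http://archive.org/details/some-book-1983/page/n5")
# -> "https://archive.org/details/some-book-1983"
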
def fetch_details(url):
    archive_page = requests.get(url).text
    sleep(10)  # be nice to the free online service.

    # Availability is inferred from collection links embedded in the page.
    inlibrary = "/details/inlibrary" in archive_page
    printdisabled = "/details/printdisabled" in archive_page
    if inlibrary:
        verb = "borrow"
    elif printdisabled:
        verb = "borrow-with-print-disabilities"
    else:
        verb = "online"

    # The item's metadata is embedded in the page as JSON, in a hidden
    # <input class="js-ia-metadata"> element.
    match = re.match(
        r'^.*?\<input\s+class\=\"js-ia-metadata\"\s+type\=\"hidden\"\s+value\=\'(\{.*?\})\'\/\>',
        archive_page,
        re.DOTALL | re.MULTILINE
    )
    if not match:
        print(archive_page)
        raise ValueError("couldn't find js-ia-metadata!")

    data = json.loads(match.group(1))

    try:
        metadata = data["metadata"]
        authors = clean_authors(metadata.get("creator", metadata.get("associated-names", "Unknown")))
        date = metadata["date"]
        (title, subtitle) = clean_title(metadata["title"])
        url = clean_url(url)
    except Exception:
        # Dump whatever metadata we did find, to aid debugging, then re-raise.
        print(json.dumps(data, indent=4, sort_keys=True))
        raise

    print("### {}".format(title))
    print("")
    if subtitle:
        print("* subtitle: {}".format(subtitle))
    print("* authors: {}".format(authors))
    print("* date: {}".format(date))
    print("* {} @ [archive.org]({})".format(verb, url))
    print("")

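The printed entry then has this Feedmark-style shape (every field value below is a placeholder, not real output):

### Some Title

* subtitle: Some Subtitle
* authors: Some Author
* date: 1984
* borrow @ [archive.org](https://archive.org/details/IDENTIFIER)
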
def fetch_mdlinks(f):
    # Scan markdown link lines like "[archive.org](URL)" from f and
    # collect every URL that points at archive.org.
    urls = []
    for line in f:
        line = line.strip()
        match = re.match(r'^\s*\[(.+?)\]\((.+?)\)\s*(.*?)\s*$', line)
        if not match:
            continue
        url = match.group(2)
        if 'archive.org' not in url:
            continue
        urls.append(url)

    for url in urls:
        fetch_details(url)

def main(args):
    if args and args[0] == 'url':
        for url in args[1:]:
            fetch_details(url)
    elif args and args[0] == 'mdlinks':
        fetch_mdlinks(sys.stdin)
    else:
        raise SystemExit("usage: url URL [URL ...] | mdlinks < links.md")


if __name__ == "__main__":
    main(sys.argv[1:])
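
Invocation, as implied by main(); the script path and the identifier here are placeholders:

python3 script/feedmarkize_archive_entry.py url https://archive.org/details/IDENTIFIER
python3 script/feedmarkize_archive_entry.py mdlinks < Bibliography.md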