Commit c90be6f73606bb43a9d4d3e8ce756ce7eba831f7 - NaNoGenLab

Comes closer to the stated objective. Also, science. (Chris Pressey, 10 years ago)

2 changed file(s) with 74 addition(s) and 19 deletion(s). Raw diff Collapse all Expand all

+17

-8

wikimedia-illustrations/README.md less more

0	0	wikimedia-illustrations
1	1	=======================
2	2
3		Requirements
4		------------
	3	Hypothesis
	4	----------
	5
	6	We hypothesize that if we download some random public-domain images from
	7	Wikimedia Commons and inject them randomly into a text, it'll make just
	8	about any text look more interesting.
	9
	10	Apparatus
	11	---------
5	12
6	13	* Python 2.7.6 (probably works with older versions too)
7	14	* [requests](http://docs.python-requests.org/)
8	15	* [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/)
	16	* some kind of input text (uses lorem ipsum for now)
9	17
10		Basic Strategy
11		--------------
	18	Method
	19	------
12	20
13	21	* Get URLs for all images from all pages of a Wikimedia Commons category,
14		such as "PD_Gutenberg" or "PD-Art_(PD-Japan)", and write that list of
	22	such as `PD_Gutenberg` or `PD-Art_(PD-Japan)`, and write that list of
15	23	URLs to an index file.
16	24	* Select _n_ images randomly from that index and download them.
17	25	* (TODO) Inject those images as illustrations in a given text.
18	26
19		Usage
20		-----
	27	Observations
	28	------------
21	29
22	30	NOTE 1: to stay (IMO) well within Wikimedia's [Terms of use](http://meta.wikimedia.org/wiki/Terms_of_use),
23	31	this script sleeps for 8 seconds after making any major HTTP request.

43	51	Add a flag that looks for the "guaranteed public domain" text on the media
44	52	page and only downloads if it finds it.
45	53
46		Create a Markdown or HTML file with boilerplate text and embedded illustrations.
	54	Resize illustrations used in HTML (more research into using paper sizes
	55	in HTML might be necessary).

+57

-11

wikimedia-illustrations/wikimedia-illustrations.py less more

5	5
6	6	from bs4 import BeautifulSoup
7	7	import requests
	8
	9
	10	LOREM_IPSUM = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
8	11
9	12
10	13	def comply_with_terms_of_use():

39	42	for block in response.iter_content(1024):
40	43	f.write(block)
41	44
	45	comply_with_terms_of_use()
42	46	return True
	47
	48
	49	def load_index(filename):
	50	index = []
	51	with open(filename) as f:
	52	for line in f:
	53	index.append(line.strip())
	54	return index
	55
	56
	57	def get_random_image(index, dest_dir):
	58	media_url = random.choice(index)
	59	print media_url
	60	local_filename = os.path.join(dest_dir, media_url.split(':')[-1])
	61	get_image_from_page(media_url, local_filename)
	62	return local_filename
43	63
44	64
45	65	def main(argv):

84	104	media_url = line.strip()
85	105	local_filename = media_url.split(':')[-1]
86	106	get_image_from_page(media_url, local_filename)
87		comply_with_terms_of_use()
88	107
89	108	elif argv[1] == 'convertmany':
90	109	dest_dir = argv[2]

103	122	elif argv[1] == 'random':
104	123	count = int(argv[2])
105	124	dest_dir = argv[3]
106		index_filename = argv[4]
107		index = []
108		with open(index_filename) as f:
109		for line in f:
110		index.append(line.strip())
	125	index = load_index(argv[4])
111	126	for n in xrange(0, count):
112		media_url = random.choice(index)
113		print media_url
114		local_filename = os.path.join(dest_dir, media_url.split(':')[-1])
115		get_image_from_page(media_url, local_filename)
116		comply_with_terms_of_use()
	127	get_random_image(index, dest_dir)
	128
	129	elif argv[1] == 'render':
	130	count = int(argv[2])
	131	dest_dir = argv[3]
	132	index = load_index(argv[4])
	133	template = """\
	134	<!DOCTYPE html>
	135	<html>
	136	<head>
	137	<meta charset="utf-8">
	138	<title>Lorum Ipsem Shkoo</title>
	139	<style>
	140	hr { page-break-before: always; }
	141	</style>
	142	<body>
	143	$
	144	</body>
	145	</html>"""
	146
	147	body = ''
	148	for x in xrange(0, count):
	149	filename = get_random_image(index, dest_dir)
	150	if x != 0:
	151	body += '<hr>'
	152
	153	paras = ['<p>' + LOREM_IPSUM + '</p>'] * 4
	154	paras.append('<img src="%s">' % filename)
	155	random.shuffle(paras)
	156	body += ''.join(paras)
	157
	158	template = template.replace('$', body)
	159	with open('tmp.html', 'w') as f:
	160	f.write(template)
	161	import webbrowser
	162	webbrowser.open('tmp.html')
117	163
118	164	else:
119	165	raise KeyError('please read the source code')