git @ Cat's Eye Technologies NaNoGenLab / c90be6f
Comes closer to the stated objective. Also, science. Chris Pressey 10 years ago
2 changed file(s) with 74 addition(s) and 19 deletion(s). Raw diff Collapse all Expand all
00 wikimedia-illustrations
11 =======================
22
3 Requirements
4 ------------
3 Hypothesis
4 ----------
5
6 We hypothesize that if we download some random public-domain images from
7 Wikimedia Commons and inject them randomly into a text, it'll make just
8 about any text look more interesting.
9
10 Apparatus
11 ---------
512
613 * Python 2.7.6 (probably works with older versions too)
714 * [requests](http://docs.python-requests.org/)
815 * [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/)
16 * some kind of input text (uses lorem ipsum for now)
917
10 Basic Strategy
11 --------------
18 Method
19 ------
1220
1321 * Get URLs for all images from all pages of a Wikimedia Commons category,
14 such as "PD_Gutenberg" or "PD-Art_(PD-Japan)", and write that list of
22 such as `PD_Gutenberg` or `PD-Art_(PD-Japan)`, and write that list of
1523 URLs to an index file.
1624 * Select _n_ images randomly from that index and download them.
1725 * (TODO) Inject those images as illustrations in a given text.
1826
19 Usage
20 -----
27 Observations
28 ------------
2129
2230 NOTE 1: to stay (IMO) well within Wikimedia's [Terms of use](http://meta.wikimedia.org/wiki/Terms_of_use),
2331 this script sleeps for 8 seconds after making any major HTTP request.
4351 Add a flag that looks for the "guaranteed public domain" text on the media
4452 page and only downloads if it finds it.
4553
46 Create a Markdown or HTML file with boilerplate text and embedded illustrations.
54 Resize illustrations used in HTML (more research into using paper sizes
55 in HTML might be necessary)
55
66 from bs4 import BeautifulSoup
77 import requests
8
9
10 LOREM_IPSUM = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
811
912
1013 def comply_with_terms_of_use():
3942 for block in response.iter_content(1024):
4043 f.write(block)
4144
45 comply_with_terms_of_use()
4246 return True
47
48
def load_index(filename):
    """Read an index file containing one entry per line.

    Surrounding whitespace (including the trailing newline) is stripped
    from every line.  Lines that are empty after stripping are skipped, so
    a stray blank line in the file never ends up as an empty-string entry
    that a caller could pick by mistake.

    :param filename: path of the index file to read.
    :returns: list of the non-empty, stripped lines, in file order.
    """
    with open(filename) as f:
        stripped = (line.strip() for line in f)
        # Built inside the `with` so the file is still open while the
        # generator is consumed.
        return [entry for entry in stripped if entry]
55
56
def get_random_image(index, dest_dir):
    """Pick a random entry from `index` and pass it to get_image_from_page().

    The chosen entry is printed, then handed to get_image_from_page()
    together with a local path built from `dest_dir` and whatever follows
    the last ':' in the entry.

    :param index: sequence of entries to choose from.
    :param dest_dir: directory component of the local path.
    :returns: the local path that was passed to get_image_from_page().
    """
    chosen_url = random.choice(index)
    print(chosen_url)
    # Everything after the final ':' is used as the local file name.
    basename = chosen_url.rsplit(':', 1)[-1]
    target_path = os.path.join(dest_dir, basename)
    get_image_from_page(chosen_url, target_path)
    return target_path
4363
4464
4565 def main(argv):
84104 media_url = line.strip()
85105 local_filename = media_url.split(':')[-1]
86106 get_image_from_page(media_url, local_filename)
87 comply_with_terms_of_use()
88107
89108 elif argv[1] == 'convertmany':
90109 dest_dir = argv[2]
103122 elif argv[1] == 'random':
104123 count = int(argv[2])
105124 dest_dir = argv[3]
106 index_filename = argv[4]
107 index = []
108 with open(index_filename) as f:
109 for line in f:
110 index.append(line.strip())
125 index = load_index(argv[4])
111126 for n in xrange(0, count):
112 media_url = random.choice(index)
113 print media_url
114 local_filename = os.path.join(dest_dir, media_url.split(':')[-1])
115 get_image_from_page(media_url, local_filename)
116 comply_with_terms_of_use()
127 get_random_image(index, dest_dir)
128
129 elif argv[1] == 'render':
130 count = int(argv[2])
131 dest_dir = argv[3]
132 index = load_index(argv[4])
133 template = """\
134 <!DOCTYPE html>
135 <html>
136 <head>
137 <meta charset="utf-8">
138 <title>Lorum Ipsem Shkoo</title>
139 <style>
140 hr { page-break-before: always; }
141 </style>
142 <body>
143 $
144 </body>
145 </html>"""
146
147 body = ''
148 for x in xrange(0, count):
149 filename = get_random_image(index, dest_dir)
150 if x != 0:
151 body += '<hr>'
152
153 paras = ['<p>' + LOREM_IPSUM + '</p>'] * 4
154 paras.append('<img src="%s">' % filename)
155 random.shuffle(paras)
156 body += ''.join(paras)
157
158 template = template.replace('$', body)
159 with open('tmp.html', 'w') as f:
160 f.write(template)
161 import webbrowser
162 webbrowser.open('tmp.html')
117163
118164 else:
119165 raise KeyError('please read the source code')