git @ Cat's Eye Technologies yastasoti / ba7331a
Describe and implement "archive routers". Chris Pressey 1 year, 10 months ago
2 changed file(s) with 46 addition(s) and 10 deletion(s). Raw diff Collapse all Expand all
99 * input is a JSON list of objects containing links (such as those produced by Feedmark)
1010 * output is a JSON list of objects that could not be retrieved, which can be fed back
1111 into the script as input
12 * checks links with `HEAD` requests by default; if `--archive-to` is given,
13 fetches a copy of each resource with `GET` and saves it to disk
12 * checks links with `HEAD` requests by default. `--archive-to` causes each link to be
13 fetched with `GET` and saved to the specified directory. `--archive-via` specifies an
14 _archive router_ which causes each link to be fetched, and saved to a directory
15 which is selected based on the URL of the link.
1416 * tries to be idempotent and not create a new local file if the remote file hasn't changed
1517 * handles links that are local files; checks if the file exists locally
18
19 #### Archive routers ####
20
21 An archive router (used with `--archive-via`) is a JSON file that looks like this:
22
23 {
24 "http://catseye.tc/*": "/dev/null",
25 "https://footu.be/*": "footube/",
26 "*": "archive/"
27 }
28
Each key is a glob pattern that is matched against a link's URL, and each value is the directory that matching links are archived to. When more than one pattern matches, the longest (most specific) pattern wins; `*` acts as the catch-all default.
1630
1731 #### Planned features ####
1832
1933 * Archive youtube links with youtube-dl.
2034 * Handle failures (redirects, etc) better (detect 503 / "connection refused" better.)
2135 * Allow use of an external tool like `wget` or `curl` to do fetching.
22 * Allow categorization of downloaded stuff.
2336 * If the same link occurs more than once in the input, don't request it more than once.
2437
2538 ### Examples ###
55 #
66
77 from argparse import ArgumentParser
8 from fnmatch import fnmatch
89 import hashlib
910 import json
1011 import logging
150151
151152
152153 class LinkArchiver(LinkTraverser):
153 def __init__(self, links, dest_dir, missing_only=False, **kwargs):
154 super().__init__(links, **kwargs)
155 self.dest_dir = dest_dir
154 def __init__(self, links, router, missing_only=False, **kwargs):
155 super(LinkArchiver, self).__init__(links, **kwargs)
156 self.router = router
156157 self.missing_only = missing_only
157158
159 def select_dest_dir(self, url):
160 for key in sorted(self.router.keys(), key=lambda x: 0-len(x)):
161 if fnmatch(url, key):
162 return self.router[key]
163 raise NotImplementedError("archive router could not resolve {}".format(url))
164
158165 def handle_link(self, url):
159 logger.info(u"archiving {}".format(url).encode('utf-8'))
160166 dirname, filename = url_to_dirname_and_filename(url)
161 dirname = os.path.join(self.dest_dir, dirname)
167 dest_dir = self.select_dest_dir(url)
168 dirname = os.path.join(dest_dir, dirname)
169 logger.info(u"archiving {} to {}".format(url, dirname).encode('utf-8'))
162170 if not os.path.exists(dirname):
163171 os.makedirs(dirname)
164172 if self.missing_only and os.path.exists(os.path.join(dirname, filename)):
179187 )
180188
# Archiving options: --archive-to sends every fetched link to one fixed
# directory, while --archive-via consults a router file to pick the
# directory per-link.
argparser.add_argument(
    '--archive-to', metavar='DIRNAME', type=str, default=None,
    help='Download a copy of each of the links, if changed, to the given directory'
)
argparser.add_argument(
    '--archive-missing-only', action='store_true',
    help='When archiving links, only download the link if it is not already archived'
)
argparser.add_argument(
    '--archive-via', metavar='ROUTERFILE', type=str, default=None,
    help='Download links and save each in the directory given in the router file'
)
# Where to look for link targets that are local files rather than URLs.
argparser.add_argument(
    '--article-root', metavar='DIRNAME', type=str, default='.',
    help='Directory in which local files found when checking or archiving links should be located'
)
221232 fragile=options.fragile,
222233 )
223234
# --archive-to and --archive-via both cause links to be fetched, but they
# prescribe conflicting ways of choosing the destination, so refuse both.
# NOTE(review): NotImplementedError is an odd type for a CLI usage error
# (ValueError or argparser.error would be conventional) — kept as-is in
# case anything depends on it.
if options.archive_to and options.archive_via:
    raise NotImplementedError("Specify either --archive-to or --archive-via, not both")

# Normalize both options into a single router mapping: --archive-to is
# just a router whose catch-all pattern sends every link to one directory.
router = None
if options.archive_to:
    router = {
        "*": options.archive_to
    }
elif options.archive_via:
    with open(options.archive_via, 'r') as f:
        router = json.load(f)
247 if router:
225248 traverser = LinkArchiver(
226 links, options.archive_to, missing_only=options.archive_missing_only,
249 links, router, missing_only=options.archive_missing_only,
227250 **common_kwargs
228251 )
229252 else: