git @ Cat's Eye Technologies yastasoti / 5698417
Replace singular --article-root with plural --extant-path. Chris Pressey 6 years ago
3 changed file(s) with 20 addition(s) and 13 deletion(s). Raw diff Collapse all Expand all
00 yastasoti
11 =========
22
3 _Version 0.3_
3 _Version 0.4_
44
55 Yet another script to archive stuff off teh internets.
66
2727
2828 #### Check all links in a set of Feedmark documents ####
2929
30 feedmark --output-links article/*.md | yastasoti --article-root=article/ - | tee results.json
30 feedmark --output-links article/*.md | yastasoti --extant-path=article/ - | tee results.json
3131
3232 This will make only `HEAD` requests to check that the resources exist.
3333 It will not fetch them. The ones that could not be fetched will appear
3434 in `results.json`, and you can run yastasoti on that again to re-try:
3535
36 yastasoti --article-root=article/ results.json | tee results2.json
36 yastasoti --extant-path=article/ results.json | tee results2.json
3737
3838 #### Archive stuff off teh internets ####
3939
9595 * Archive youtube links with youtube-dl.
9696 * Handle failures (redirects, etc) better (detect 503 / "connection refused" better.)
9797 * Allow use of an external tool like `wget` or `curl` to do fetching.
98 * Multiple `--article-roots`.
9998
10099 [Feedmark]: http://catseye.tc/node/Feedmark
00 requests==2.21.0
1 tqdm==4.31.1
8585
8686
8787 class LinkTraverser(object):
88 def __init__(self, links, article_root=None, ignore_urls=None, delay_between_requests=0.0, fragile=False):
88 def __init__(self, links, extant_path=None, ignore_urls=None, delay_between_requests=0.0, fragile=False):
8989 self.links = links
90 self.article_root = article_root
90 self.extant_path = extant_path or []
9191 self.ignore_urls = ignore_urls or []
9292 self.delay_between_requests = delay_between_requests
9393 self.fragile = fragile
126126 else:
127127 filename = url
128128 filename = unquote(filename)
129 filename = os.path.join(self.article_root, filename)
130 if not os.path.exists(filename):
131 raise ValueError('Local file "{}" does not exist'.format(filename))
129 found = False
130 for extant_dirname in self.extant_path:
131 extant_filename = os.path.join(extant_dirname, filename)
132 if os.path.exists(extant_filename):
133 found = True
134 break
135 if not found:
136 raise ValueError('Local file "{}" not found in extant-path'.format(filename))
132137 continue
133138 else:
134139 response = self.handle_link(link)
223228 argparser.add_argument('--archive-via', metavar='ROUTERFILE', type=str, default=None,
224229 help='Download links and save each in the directory given in the router file'
225230 )
226 argparser.add_argument('--article-root', metavar='DIRNAME', type=str, default='.',
227 help='Directory in which local files found when checking or archiving links should be located'
228 )
229231 argparser.add_argument('--delay-between-requests', metavar='SECONDS', type=float, default=0.0,
230232 help='Delay (in seconds, fractions allowed) between successive network requests'
233 )
234 argparser.add_argument('--extant-path', metavar='DIRNAMES', type=str, default=None,
235 help='When finding a relative link that would resolve to local file, check that '
236 'that file exists in this search path, which should be given as a '
237 'comma-delimited list of directory names'
231238 )
232239 argparser.add_argument('--fragile', action='store_true', default=False,
233240 help='Exit on first error of any nature when processing links'
254261 links.extend(data)
255262
256263 common_kwargs = dict(
257 article_root=options.article_root,
264 extant_path=None if options.extant_path is None else options.extant_path.split(','),
258265 ignore_urls=[] if options.ignore_urls is None else options.ignore_urls.split(','),
259266 delay_between_requests=options.delay_between_requests,
260267 fragile=options.fragile,