git @ Cat's Eye Technologies yastasoti / e3b0a48
Commit message: "We really shouldn't need wget for just this. Save it for later." (Chris Pressey, 1 year, 10 months ago)
1 changed file(s) with 8 addition(s) and 21 deletion(s). Raw diff Collapse all Expand all
00 #!/usr/bin/env python
1
2 #
3 # yastasoti -- Yet another script to archive stuff off teh internets.
4 # This work is in the public domain.
5 #
16
27 from argparse import ArgumentParser
38 import hashlib
49 import json
510 import logging
611 import os
7 from subprocess import check_call, CalledProcessError
812 import sys
913 from time import sleep, localtime, strftime
1014 try:
135139
class LinkChecker(LinkTraverser):
    """Check a link by issuing a single HTTP HEAD request via ``requests``."""

    # User-Agent header sent with every check; subclasses may override it.
    # NOTE(review): presumably a browser-like value is used because some
    # servers reject the default client identification -- confirm.
    USER_AGENT = (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) '
        'Gecko/20100101 Firefox/62.0'
    )

    def handle_link(self, url):
        """Send a HEAD request for ``url`` and summarise the response.

        Redirects are followed (``allow_redirects=True``).

        Returns a dict with two keys:
          'url'         -- the URL reported by the response object
          'status_code' -- the HTTP status code of the response

        Exceptions raised by ``requests.head`` are not caught here.
        """
        # NOTE(review): no timeout is passed to requests.head, so an
        # unresponsive server may block this call -- confirm this is
        # acceptable for callers.
        response = requests.head(
            url,
            allow_redirects=True,
            headers={'User-Agent': self.USER_AGENT},
        )
        return {
            'url': response.url,
            'status_code': response.status_code,
        }
143
144
class WgetLinkChecker(LinkTraverser):
    """Check a link by running ``wget --spider`` as a subprocess."""

    def handle_link(self, url):
        """Probe ``url`` with wget and return a dict holding a status code.

        The result is {'status_code': 200} when wget exits successfully and
        {'status_code': 600} when it exits with a non-zero return code.
        A one-line OK/BAD report is printed in either case.
        """
        # TODO turn on redirects
        command = ['wget', '--spider', url]
        try:
            check_call(command)
        except CalledProcessError as e:
            print('BAD {} {}'.format(e.returncode, url))
            outcome = {
                'status_code': 600
            }
        else:
            print('OK', url)
            # TODO extract from headers
            outcome = {
                'status_code': 200
            }
        return outcome
160149
161150
162151 class LinkArchiver(LinkTraverser):
231220 links, options.archive_to, missing_only=options.archive_missing_only,
232221 **common_kwargs
233222 )
234 elif False:
235 traverser = WgetLinkChecker(links, **common_kwargs)
236223 else:
237224 traverser = LinkChecker(links, **common_kwargs)
238225