Handle Unicode URLs.
Chris Pressey
4 years ago
9 | 9 |
* input is a JSON list of objects containing links (such as those produced by Feedmark)
|
10 | 10 |
* output is a JSON list of objects that could not be retrieved, which can be fed back
|
11 | 11 |
into the script as input
|
12 | |
* checks links with `HEAD` requests by default; if `--archive-links-to` is given,
|
|
12 |
* checks links with `HEAD` requests by default; if `--archive-to` is given,
|
13 | 13 |
fetches a copy of each resource with `GET` and saves it to disk
|
14 | 14 |
* tries to be idempotent and not create a new local file if the remote file hasn't changed
|
15 | 15 |
* handles links that are local files; checks if the file exists locally
|
|
20 | 20 |
* Handle failures (redirects, etc.) better; in particular, detect 503 responses and "connection refused" errors more reliably.
|
21 | 21 |
* Allow use of an external tool like `wget` or `curl` to do fetching.
|
22 | 22 |
* Allow categorization of downloaded resources.
|
|
23 |
* If the same link occurs more than once in the input, don't request it more than once.
|
23 | 24 |
|
24 | 25 |
### Examples ###
|
25 | 26 |
|
139 | 139 |
|
140 | 140 |
class LinkChecker(LinkTraverser):
|
141 | 141 |
def handle_link(self, url):
|
142 | |
logger.info("checking {}".format(url))
|
|
142 |
logger.info(u"checking {}".format(url).encode('utf-8'))
|
143 | 143 |
response = requests.head(url, allow_redirects=True, headers={
|
144 | 144 |
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
|
145 | 145 |
})
|
|
156 | 156 |
self.missing_only = missing_only
|
157 | 157 |
|
158 | 158 |
def handle_link(self, url):
|
159 | |
logger.info("archiving {}".format(url))
|
|
159 |
logger.info(u"archiving {}".format(url).encode('utf-8'))
|
160 | 160 |
dirname, filename = url_to_dirname_and_filename(url)
|
161 | 161 |
dirname = os.path.join(self.dest_dir, dirname)
|
162 | 162 |
if not os.path.exists(dirname):
|