git @ Cat's Eye Technologies yastasoti / fffe16f
Handle Unicode URLs. Chris Pressey 1 year, 10 months ago
2 changed file(s) with 4 addition(s) and 3 deletion(s). Raw diff Collapse all Expand all
* input is a JSON list of objects containing links (such as those produced by Feedmark)
* output is a JSON list of objects that could not be retrieved, which can be fed back
  into the script as input
12 * checks links with `HEAD` requests by default; if `--archive-links-to` is given,
12 * checks links with `HEAD` requests by default; if `--archive-to` is given,
  fetches a copy of each resource with `GET` and saves it to disk
* tries to be idempotent and not create a new local file if the remote file hasn't changed
* handles links that are local files; checks if the file exists locally
* Handle failures (redirects, etc.) better (detect 503 / "connection refused" better).
* Allow use of an external tool like `wget` or `curl` to do fetching.
* Allow categorization of downloaded stuff.
* If the same link occurs more than once in the input, don't request it more than once.
2324
### Examples ###
2526
139139
140140 class LinkChecker(LinkTraverser):
141141 def handle_link(self, url):
142 logger.info("checking {}".format(url))
142 logger.info(u"checking {}".format(url).encode('utf-8'))
143143 response = requests.head(url, allow_redirects=True, headers={
144144 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
145145 })
156156 self.missing_only = missing_only
157157
158158 def handle_link(self, url):
159 logger.info("archiving {}".format(url))
159 logger.info(u"archiving {}".format(url).encode('utf-8'))
160160 dirname, filename = url_to_dirname_and_filename(url)
161161 dirname = os.path.join(self.dest_dir, dirname)
162162 if not os.path.exists(dirname):