handle_link() now takes the entire link dictionary.
Chris Pressey
6 years ago
89 | 89 | self.delay_between_requests = delay_between_requests |
90 | 90 | self.fragile = fragile |
91 | 91 | |
92 | def handle_link(self, url): | |
93 | """Given a URL, process that URL. Should either return None, meaning | |
92 | def handle_link(self, link): | |
93 | """Given a dict containing a URL under the key `url` (and possibly | |
94 | other information), process that URL. Should either return None, meaning | |
94 | 95 | it declined to process this URL (for whatever reason), or should return |
95 | 96 | a dict representing the response from processing the URL, which should |
96 | contain the following keys: | |
97 | contain (at least) the following keys: | |
97 | 98 | |
98 | 99 | status_code: an integer. 6xx can be used to indicate internal error. |
99 | 100 | |
127 | 128 | raise ValueError('Local file "{}" does not exist'.format(filename)) |
128 | 129 | continue |
129 | 130 | else: |
130 | response = self.handle_link(url) | |
131 | response = self.handle_link(link) | |
131 | 132 | if response is None: |
132 | 133 | continue |
133 | 134 | except Exception as e: |
152 | 153 | |
153 | 154 | |
154 | 155 | class LinkChecker(LinkTraverser): |
155 | def handle_link(self, url): | |
156 | def handle_link(self, link): | |
157 | url = link['url'] | |
156 | 158 | logger.info(u"checking {}".format(url).encode('utf-8')) |
157 | 159 | response = requests.head(url, allow_redirects=True, headers={ |
158 | 160 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0', |
175 | 177 | return self.router[key] |
176 | 178 | raise NotImplementedError("archive router could not resolve {}".format(url)) |
177 | 179 | |
178 | def handle_link(self, url): | |
180 | def handle_link(self, link): | |
181 | url = link['url'] | |
179 | 182 | dirname, filename = url_to_dirname_and_filename(url) |
180 | 183 | dest_dir = self.select_dest_dir(url) |
181 | 184 | if dest_dir == '/dev/null': |