git @ Cat's Eye Technologies yastasoti / 2c21ff1
The return value of handle_link() is not a Response object. Chris Pressey 4 years ago
1 changed file(s) with 24 addition(s) and 4 deletion(s). Raw diff Collapse all Expand all
7171 self.fragile = fragile
7272
7373 def handle_link(self, url):
74 """Given a URL, process that URL. Should either return None, meaning
75 it declined to process this URL (for whatever reason), or should return
76 a dict representing the response from processing the URL, which should
77 contain the following keys:
78
79 status_code: an integer. 6xx can be used to indicate internal error.
80
81 """
7482 raise NotImplementedError
7583
7684 def traverse(self):
96104 response = self.handle_link(url)
97105 if response is None:
98106 continue
99 status = response.status_code
107 status = response['status_code']
100108 except Exception as e:
101109 if self.fragile:
102110 raise
114122
115123 class LinkChecker(LinkTraverser):
116124 def handle_link(self, url):
117 return requests.head(url)
125 response = requests.head(url)
126 return {
127 'status_code': response.status_code
128 }
118129
119130
120131 class WgetLinkChecker(LinkTraverser):
121132 def handle_link(self, url):
122133 try:
134 # TODO turn on redirects
123135 check_call(['wget', '--spider', url])
124136 print('OK', url)
137 # TODO extract from headers
138 return {
139 'status_code': 200
140 }
125141 except CalledProcessError as e:
126142 print('BAD {} {}'.format(e.returncode, url))
127 return None
143 return {
144 'status_code': 600
145 }
128146
129147
130148 class LinkArchiver(LinkTraverser):
141159 if self.missing_only and os.path.exists(os.path.join(dirname, filename)):
142160 return None
143161 response = download(url, dirname, filename)
144 return response
162 return {
163 'status_code': response.status_code
164 }
145165
146166
147167 def main(args):