git @ Cat's Eye Technologies yastasoti / 144f432
Refactor again. Chris Pressey, 1 year, 10 months ago.
1 changed file with 54 additions and 49 deletions.
@@ -68,66 +68,72 @@
 
 delay_between_fetches = 0
 
+class LinkTraverser(object):
+    def __init__(self, links, article_root=None, missing_only=False, ignore_urls=None):
+        self.links = links
+        self.article_root = article_root
+        self.missing_only = missing_only
+        self.ignore_urls = ignore_urls or []
 
-def foreach_link(links, callback, article_root=None, missing_only=False, ignore_urls=[]):
+    def handle_link(self, url):
+        raise NotImplementedError
 
-    failures = []
-    for link in tqdm(links, total=len(links)):
-        url = link['url']
-        if url in ignore_urls:
-            continue
-        try:
-            if url.startswith(('#',)):
+    def traverse(self):
+        failures = []
+        for link in tqdm(self.links, total=len(self.links)):
+            url = link['url']
+            if url in self.ignore_urls:
                 continue
-            elif not url.startswith(('http://', 'https://')):
-                if '#' in url:
-                    filename = url.split('#')[0]
+            try:
+                if url.startswith(('#',)):
+                    continue
+                elif not url.startswith(('http://', 'https://')):
+                    if '#' in url:
+                        filename = url.split('#')[0]
+                    else:
+                        filename = url
+                    filename = urllib.unquote(filename)
+                    filename = os.path.join(self.article_root, filename)
+                    if not os.path.exists(filename):
+                        raise ValueError('Local file "{}" does not exist'.format(filename))
+                    continue
                 else:
-                    filename = url
-                filename = urllib.unquote(filename)
-                filename = os.path.join(article_root, filename)
-                if not os.path.exists(filename):
-                    raise ValueError('Local file "{}" does not exist'.format(filename))
-                continue
-            else:
-                response = callback(url)
-                if response is None:
-                    continue
-                status = response.status_code
-        except Exception as e:
-            status = str(e)
-        if status not in (200, 301, 302, 303):
-            failures.append({
-                'status': status,
-                'url': url,
-                'link': link,
-            })
-        if delay_between_fetches > 0:
-            sleep(delay_between_fetches)
-    return failures
+                    response = self.handle_link(url)
+                    if response is None:
+                        continue
+                    status = response.status_code
+            except Exception as e:
+                status = str(e)
+            if status not in (200, 301, 302, 303):
+                failures.append({
+                    'status': status,
+                    'url': url,
+                    'link': link,
+                })
+            if delay_between_fetches > 0:
+                sleep(delay_between_fetches)
+        return failures
 
 
-def check_links(links, **kwargs):
-    def callback(url):
+class LinkChecker(LinkTraverser):
+    def handle_link(self, url):
         return requests.head(url)
-    return foreach_link(links, callback, **kwargs)
 
 
-def archive_links_to(links, dest_dir, **kwargs):
+class LinkArchiver(LinkTraverser):
+    def __init__(self, links, dest_dir, **kwargs):
+        super(LinkArchiver, self).__init__(links, **kwargs)
+        self.dest_dir = dest_dir
 
-    missing_only = kwargs.get('missing_only', False)
-
-    def callback(url):
+    def handle_link(self, url):
         dirname, filename = url_to_dirname_and_filename(url)
-        dirname = os.path.join(dest_dir, dirname)
+        dirname = os.path.join(self.dest_dir, dirname)
         if not os.path.exists(dirname):
             os.makedirs(dirname)
-        if missing_only and os.path.exists(os.path.join(dirname, filename)):
+        if self.missing_only and os.path.exists(os.path.join(dirname, filename)):
             return None
         response = download(url, dirname, filename)
         return response
-
-    return foreach_link(links, callback, **kwargs)
 
 
 def main(args):
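Taken together, this hunk turns the callback plumbing of foreach_link into a small template-method hierarchy: LinkTraverser.traverse() owns the loop, the ignore list, the local-file check under article_root, failure collection, and the fetch delay, while subclasses override handle_link() to decide what to do with each remote URL. As a sketch of the extension point this creates (hypothetical code, not part of this commit), a traverser that merely lists remote links could be written as:

class LinkLister(LinkTraverser):
    # Hypothetical subclass for illustration: "handles" each remote
    # link by printing it. Returning None tells traverse() to skip
    # status checking for this URL, the same convention LinkArchiver
    # uses for its missing_only early-out.
    def handle_link(self, url):
        print(url)
        return None

Because the delay and the local-file validation stay in the base class, any such subclass inherits them for free.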
@@ -169,20 +175,19 @@
     options.ignore_urls = options.ignore_urls.split(',')
 
     if options.archive_links_to:
-        result = archive_links_to(
-            links,
-            options.archive_links_to,
+        traverser = LinkArchiver(links, options.archive_links_to,
             article_root=options.article_root,
             missing_only=options.archive_missing_only,
             ignore_urls=options.ignore_urls,
         )
     else:
-        result = check_links(
-            links,
+        traverser = LinkChecker(links,
             article_root=options.article_root,
             missing_only=options.archive_missing_only,
             ignore_urls=options.ignore_urls,
         )
+
+    result = traverser.traverse()
 
     sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))
 
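For reference, the new control flow in main() reduces to: construct a traverser, run traverse(), and serialize the returned failure list. A minimal usage sketch under the same assumptions (run inside the script, where json, requests, and tqdm are already imported; links is a list of dicts with a 'url' key, as traverse() expects; the URLs here are made up):

links = [{'url': 'https://example.com/'}, {'url': 'no-such-page.html'}]
checker = LinkChecker(links, article_root='.', ignore_urls=[])
failures = checker.traverse()
# Each entry is {'status': ..., 'url': ..., 'link': ...}, where status
# is either an HTTP status code outside (200, 301, 302, 303) or the
# stringified exception raised while handling the link.
print(json.dumps(failures, indent=4, sort_keys=True))

Note that the status whitelist means redirects count as successes; everything else, including exceptions such as the ValueError for a missing local file, lands in the report.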