git @ Cat's Eye Technologies yastasoti / 68b73c1
Refactor. Chris Pressey 4 years ago
1 changed file(s) with 44 addition(s) and 19 deletion(s). Raw diff Collapse all Expand all
6969 delay_between_fetches = 0
7070
7171
72 def archive_links(links, article_root=None, dest_dir=None, missing_only=False, ignore_urls=[]):
73 """If dest_dir is None, links will only be checked for existence, not downloaded."""
72 def foreach_link(links, callback, article_root=None, missing_only=False, ignore_urls=[]):
7473
7574 failures = []
7675 for link in tqdm(links, total=len(links)):
9089 if not os.path.exists(filename):
9190 raise ValueError('Local file "{}" does not exist'.format(filename))
9291 continue
93 elif dest_dir is not None:
94 dirname, filename = url_to_dirname_and_filename(url)
95 dirname = os.path.join(dest_dir, dirname)
96 if not os.path.exists(dirname):
97 os.makedirs(dirname)
98 if missing_only and os.path.exists(os.path.join(dirname, filename)):
99 continue
100 response = download(url, dirname, filename)
10192 else:
102 response = requests.head(url)
93 response = callback(url)
94 if response is None:
95 continue
10396 status = response.status_code
10497 except Exception as e:
10598 status = str(e)
112105 if delay_between_fetches > 0:
113106 sleep(delay_between_fetches)
114107 return failures
108
109
def check_links(links, **kwargs):
    """Run foreach_link over links, issuing an HTTP HEAD request per URL.

    Nothing is downloaded; every keyword argument is passed through to
    foreach_link unchanged, and its result is returned as-is.
    """

    def head_request(target_url):
        # The response object is handed back untouched for the caller to inspect.
        head_response = requests.head(target_url)
        return head_response

    return foreach_link(links, head_request, **kwargs)
114
115
def archive_links_to(links, dest_dir, **kwargs):
    """Run foreach_link over links, downloading each URL beneath dest_dir.

    Every keyword argument is passed through to foreach_link unchanged.
    The `missing_only` keyword (default False) is additionally read here:
    when it is true, a URL whose target file already exists under dest_dir
    is not downloaded again.
    """
    skip_existing = kwargs.get('missing_only', False)

    def fetch_into_archive(url):
        # Work out where this URL lives inside the archive directory.
        subdir, basename = url_to_dirname_and_filename(url)
        target_dir = os.path.join(dest_dir, subdir)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        # Returning None means no request was made for this URL.
        if skip_existing and os.path.exists(os.path.join(target_dir, basename)):
            return None
        return download(url, target_dir, basename)

    return foreach_link(links, fetch_into_archive, **kwargs)
115131
116132
117133 def main(args):
132148 argparser.add_argument('--article-root', metavar='DIRNAME', type=str, default='.',
133149 help='Directory in which local files found when checking or archiving links should be located'
134150 )
135 argparser.add_argument('--ignore-urls', metavar='TARGETS', type=str, default=None,
151 argparser.add_argument('--ignore-urls', metavar='URLS', type=str, default=None,
136152 help='Comma-separated list of link targets that should not even try to be fetched'
137153 )
138154
152168 else:
153169 options.ignore_urls = options.ignore_urls.split(',')
154170
155 result = archive_links(
156 links,
157 article_root=options.article_root,
158 dest_dir=options.archive_links_to,
159 missing_only=options.archive_missing_only,
160 ignore_urls=options.ignore_urls,
161 )
171 if options.archive_links_to:
172 result = archive_links_to(
173 links,
174 options.archive_links_to,
175 article_root=options.article_root,
176 missing_only=options.archive_missing_only,
177 ignore_urls=options.ignore_urls,
178 )
179 else:
180 result = check_links(
181 links,
182 article_root=options.article_root,
183 missing_only=options.archive_missing_only,
184 ignore_urls=options.ignore_urls,
185 )
186
162187 sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))
163188
164189