Refactor.
Chris Pressey
4 years ago
@@ -69,8 +69,7 @@
 delay_between_fetches = 0
 
 
-def archive_links(links, article_root=None, dest_dir=None, missing_only=False, ignore_urls=[]):
-    """If dest_dir is None, links will only be checked for existence, not downloaded."""
+def foreach_link(links, callback, article_root=None, missing_only=False, ignore_urls=[]):
 
     failures = []
     for link in tqdm(links, total=len(links)):
@@ -90,16 +89,10 @@
                 if not os.path.exists(filename):
                     raise ValueError('Local file "{}" does not exist'.format(filename))
                 continue
-            elif dest_dir is not None:
-                dirname, filename = url_to_dirname_and_filename(url)
-                dirname = os.path.join(dest_dir, dirname)
-                if not os.path.exists(dirname):
-                    os.makedirs(dirname)
-                if missing_only and os.path.exists(os.path.join(dirname, filename)):
-                    continue
-                response = download(url, dirname, filename)
             else:
-                response = requests.head(url)
+                response = callback(url)
+                if response is None:
+                    continue
             status = response.status_code
         except Exception as e:
             status = str(e)
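The contract of the new callback argument can be read off this hunk: foreach_link calls callback(url) for each remote link, expects back an object with a status_code attribute (as a requests response has), and treats a None return as "nothing to record for this link, move on". A minimal sketch of a conforming callback, assuming requests is imported as elsewhere in the module (the .zip skip rule is made up purely for illustration):

    def callback(url):
        # Returning None tells foreach_link to record nothing and continue.
        if url.endswith('.zip'):
            return None
        # foreach_link only reads .status_code off whatever comes back,
        # so any requests-style response object will do.
        return requests.head(url)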
@@ -112,6 +105,29 @@
         if delay_between_fetches > 0:
             sleep(delay_between_fetches)
     return failures
+
+
+def check_links(links, **kwargs):
+    def callback(url):
+        return requests.head(url)
+    return foreach_link(links, callback, **kwargs)
+
+
+def archive_links_to(links, dest_dir, **kwargs):
+
+    missing_only = kwargs.get('missing_only', False)
+
+    def callback(url):
+        dirname, filename = url_to_dirname_and_filename(url)
+        dirname = os.path.join(dest_dir, dirname)
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
+        if missing_only and os.path.exists(os.path.join(dirname, filename)):
+            return None
+        response = download(url, dirname, filename)
+        return response
+
+    return foreach_link(links, callback, **kwargs)
 
 
 def main(args):
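check_links and archive_links_to are deliberately thin: each supplies only a callback and forwards everything else to foreach_link, which keeps the iteration, error handling, and fetch throttling in one place. Adding another behaviour now takes a few lines. For instance, a hypothetical wrapper that fetches full bodies with GET instead of HEAD (get_links is not part of this change; it is sketched here only to show the pattern):

    def get_links(links, **kwargs):
        def callback(url):
            # GET instead of HEAD; foreach_link still only inspects status_code.
            return requests.get(url)
        return foreach_link(links, callback, **kwargs)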
@@ -132,7 +148,7 @@
     argparser.add_argument('--article-root', metavar='DIRNAME', type=str, default='.',
         help='Directory in which local files found when checking or archiving links should be located'
     )
-    argparser.add_argument('--ignore-urls', metavar='TARGETS', type=str, default=None,
+    argparser.add_argument('--ignore-urls', metavar='URLS', type=str, default=None,
         help='Comma-separated list of link targets that should not even try to be fetched'
     )
 
@@ -152,13 +168,22 @@
     else:
         options.ignore_urls = options.ignore_urls.split(',')
 
-    result = archive_links(
-        links,
-        article_root=options.article_root,
-        dest_dir=options.archive_links_to,
-        missing_only=options.archive_missing_only,
-        ignore_urls=options.ignore_urls,
-    )
+    if options.archive_links_to:
+        result = archive_links_to(
+            links,
+            options.archive_links_to,
+            article_root=options.article_root,
+            missing_only=options.archive_missing_only,
+            ignore_urls=options.ignore_urls,
+        )
+    else:
+        result = check_links(
+            links,
+            article_root=options.article_root,
+            missing_only=options.archive_missing_only,
+            ignore_urls=options.ignore_urls,
+        )
+
     sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))
 
 
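With the call sites in main() rewritten as above, the choice between checking and archiving is made once, up front, on --archive-links-to. The same two entry points could also be driven directly; a rough sketch, with made-up argument values, assuming the module's existing sys and json imports and some iterable of links from the caller:

    # 'links' is whatever iterable of link objects the rest of the module produces.
    failures = check_links(links, article_root='.', ignore_urls=[])

    # or, mirroring --archive-links-to / --archive-missing-only:
    failures = archive_links_to(links, 'archived', missing_only=True)

    sys.stdout.write(json.dumps(failures, indent=4, sort_keys=True))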