git @ Cat's Eye Technologies yastasoti / bfef88c
Allow configuring a list of URLs that are simply ignored. (Chris Pressey, 4 years ago)
1 changed file(s) with 12 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
6969 delay_between_fetches = 0
7070
7171
72 def archive_links(links, article_root=None, dest_dir=None, missing_only=False):
72 def archive_links(links, article_root=None, dest_dir=None, missing_only=False, ignore_urls=[]):
7373 """If dest_dir is None, links will only be checked for existence, not downloaded."""
7474
7575 failures = []
7676 for link in tqdm(links, total=len(links)):
7777 url = link['url']
78 if url in ignore_urls:
79 continue
7880 try:
7981 if url.startswith(('#',)):
8082 continue
130132 argparser.add_argument('--article-root', metavar='DIRNAME', type=str, default='.',
131133 help='Directory in which local files found when checking or archiving links should be located'
132134 )
135 argparser.add_argument('--ignore-urls', metavar='TARGETS', type=str, default=None,
136 help='Comma-separated list of link targets that should not even try to be fetched'
137 )
133138
134139 options = argparser.parse_args(sys.argv[1:])
135140
142147 data = json.loads(f.read())
143148 links.extend(data)
144149
150 if options.ignore_urls is None:
151 options.ignore_urls = []
152 else:
153 options.ignore_urls = options.ignore_urls.split(',')
154
145155 result = archive_links(
146156 links,
147157 article_root=options.article_root,
148158 dest_dir=options.archive_links_to,
149159 missing_only=options.archive_missing_only,
160 ignore_urls=options.ignore_urls,
150161 )
151162 sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))
152163