Allow configuring a list of URLs that are simply ignored.
Chris Pressey
4 years ago
@@ -69,12 +69,14 @@
 delay_between_fetches = 0
 
 
-def archive_links(links, article_root=None, dest_dir=None, missing_only=False):
+def archive_links(links, article_root=None, dest_dir=None, missing_only=False, ignore_urls=[]):
     """If dest_dir is None, links will only be checked for existence, not downloaded."""
 
     failures = []
     for link in tqdm(links, total=len(links)):
         url = link['url']
+        if url in ignore_urls:
+            continue
         try:
             if url.startswith(('#',)):
                 continue
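Worth noting about the new parameter: the `url in ignore_urls` test is an exact string comparison, so an ignored target has to match the link's URL byte-for-byte (scheme, trailing slash and all). A minimal, self-contained sketch of the skip behaviour (the link-dict shape and names mirror the diff; the URLs are made up):

links = [
    {'url': 'http://gone.example/dead-page'},
    {'url': 'http://ok.example/live-page'},
]
ignore_urls = ['http://gone.example/dead-page']

for link in links:
    url = link['url']
    if url in ignore_urls:
        continue  # skipped outright: never fetched, never counted as a failure
    print('would fetch', url)  # only http://ok.example/live-page reaches here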
@@ -130,6 +132,9 @@
     argparser.add_argument('--article-root', metavar='DIRNAME', type=str, default='.',
         help='Directory in which local files found when checking or archiving links should be located'
     )
+    argparser.add_argument('--ignore-urls', metavar='TARGETS', type=str, default=None,
+        help='Comma-separated list of link targets that should not even try to be fetched'
+    )
 
     options = argparser.parse_args(sys.argv[1:])
 
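The next hunk turns this comma-separated string into a list before handing it to archive_links. A small runnable sketch of how the option flows through argparse and that normalization, using the names from the diff (the URLs are invented for illustration):

import argparse

argparser = argparse.ArgumentParser()
argparser.add_argument('--ignore-urls', metavar='TARGETS', type=str, default=None,
    help='Comma-separated list of link targets that should not even try to be fetched'
)

options = argparser.parse_args(['--ignore-urls', 'http://a.example/,http://b.example/'])
# argparse stores the raw string under options.ignore_urls ...
assert options.ignore_urls == 'http://a.example/,http://b.example/'
# ... and the commit then normalizes it, exactly as in the hunk below:
if options.ignore_urls is None:
    options.ignore_urls = []
else:
    options.ignore_urls = options.ignore_urls.split(',')
assert options.ignore_urls == ['http://a.example/', 'http://b.example/']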
@@ -142,11 +147,17 @@
         data = json.loads(f.read())
         links.extend(data)
 
+    if options.ignore_urls is None:
+        options.ignore_urls = []
+    else:
+        options.ignore_urls = options.ignore_urls.split(',')
+
     result = archive_links(
         links,
         article_root=options.article_root,
         dest_dir=options.archive_links_to,
         missing_only=options.archive_missing_only,
+        ignore_urls=options.ignore_urls,
     )
     sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))
 
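One small edge in the normalization above (an observation about the behaviour, not something the commit addresses): an explicitly empty value such as `--ignore-urls=` is not None, so it takes the split branch and yields a one-element list containing the empty string rather than an empty list. Harmless in practice, since no real link target is the empty string:

value = ''                       # what argparse stores for `--ignore-urls=`
assert value is not None         # so the `is None` guard does not fire
assert value.split(',') == ['']  # one empty-string entry, not an empty list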