git @ Cat's Eye Technologies yastasoti / 3f089db
Deduplicate links when processing, and have more thorough logging. Chris Pressey 1 year, 10 months ago
2 changed files with 19 additions and 5 deletions.
@@ -68,3 +68,4 @@
 * Archive youtube links with youtube-dl.
 * Handle failures (redirects, etc) better (detect 503 / "connection refused" better.)
 * Allow use of an external tool like `wget` or `curl` to do fetching.
+* If the same link occurs more than once in the input, don't request it more than once.
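
The new TODO item is implemented further down in traverse() with a seen-set. As a standalone sketch of the same order-preserving pattern (the names fetch_each_once and fetch are hypothetical, not from the repo):

# Minimal sketch of the dedup pattern this commit adopts: remember each
# URL already handled during the run and skip repeats, preserving input
# order. `fetch_each_once` and `fetch` are hypothetical illustration names.
def fetch_each_once(links, fetch):
    processed_urls = set()
    for link in links:
        url = link['url']
        if url in processed_urls:
            continue  # same link appeared earlier in the input
        processed_urls.add(url)
        fetch(url)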
@@ -55,22 +55,28 @@
 def download(url, dirname, filename):
     response = requests.get(url, stream=True)
     partname = os.path.join(dirname, filename + '_part')
+    logger.info(u"downloading '{}' to {}".format(url, partname).encode('utf-8'))
     with open(partname, "wb") as f:
         for data in response.iter_content():
             f.write(data)
     destname = os.path.join(dirname, filename)
     if os.path.exists(destname):
+        logger.info(u"{} exists, computing hashes".format(destname).encode('utf-8'))
         desthash = compute_hash(destname)
         parthash = compute_hash(partname)
         if desthash == parthash:
+            logger.info(u"hash {} matches, deleting {}".format(desthash, partname).encode('utf-8'))
             os.unlink(partname)
         else:
+            logger.info(u"incoming hash {} does not match existing hash {}".format(parthash, desthash).encode('utf-8'))
             mtime = os.path.getmtime(destname)
             timestring = strftime('%Y.%m%d.%H%M%S', localtime(mtime))
             archname = '{}_REV{}'.format(destname, timestring)
             logger.info(u"moving {} to {} and {} to {}".format(desthash, archname, partname, destname).encode('utf-8'))
             os.rename(destname, archname)
             os.rename(partname, destname)
     else:
+        logger.info(u"moving {} to {}".format(partname, destname).encode('utf-8'))
         os.rename(partname, destname)
     return response
 
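compute_hash() is referenced above but is not part of this diff. A plausible shape for it, assuming a chunked digest over the file contents (the algorithm and chunk size here are guesses, not taken from the repo):

import hashlib

# Assumed implementation: digest the file in chunks so a large download
# never has to fit in memory. SHA-1 and 8 KB chunks are guesses.
def compute_hash(filename):
    h = hashlib.sha1()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            h.update(chunk)
    return h.hexdigest()

With something of this shape, download() leaves an existing file untouched when the fresh copy is byte-identical, and otherwise shelves the old copy under a `_REV<timestamp>` name before moving the new one into place.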
@@ -96,11 +102,18 @@
 
     def traverse(self):
         self.results = []
+        processed_urls = set()
         for link in tqdm(self.links, total=len(self.links)):
-            url = link['url']
-            if url in self.ignore_urls:
-                continue
             try:
+                url = link['url']
+                logger.info(u"processing '{}'".format(url).encode('utf-8'))
+                if url in self.ignore_urls:
+                    logger.info(u"URL is being ignored, skipping")
+                    continue
+                if url in processed_urls:
+                    logger.info(u"URL has already been processed, skipping")
+                    continue
+                processed_urls.add(url)
                 if url.startswith(('#',)):
                     continue
                 elif not url.startswith(('http://', 'https://')):
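
The u"...".encode('utf-8') idiom on the new log lines suggests a Python 2 target: logging a unicode message containing non-ASCII characters through a byte-oriented handler can raise UnicodeEncodeError, and encoding up front sidesteps that. A minimal sketch of the hazard, Python 2 semantics assumed:

# Python 2 semantics assumed. Writing a unicode string to a byte stream
# implicitly encodes it with the ASCII codec, which fails on non-ASCII
# URLs; encoding explicitly to UTF-8 bytes avoids the surprise.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

url = u'http://example.com/caf\xe9'
logger.info(u"processing '{}'".format(url).encode('utf-8'))  # logs bytes, safely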
@@ -169,7 +182,9 @@
         logger.info(u"archiving {} to {}".format(url, dirname).encode('utf-8'))
         if not os.path.exists(dirname):
             os.makedirs(dirname)
-        if self.missing_only and os.path.exists(os.path.join(dirname, filename)):
+        existing_file = os.path.join(dirname, filename)
+        if self.missing_only and os.path.exists(existing_file):
+            logger.info(u"file {} already exists, not downloading".format(existing_file).encode('utf-8'))
             return None
         response = download(url, dirname, filename)
         return {
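
The net effect of the missing_only branch, restated as a self-contained sketch (should_download is a hypothetical helper; only missing_only and the path logic mirror the diff):

import os
import logging
logger = logging.getLogger(__name__)

# Hypothetical harness around the skip-if-present rule above.
def should_download(dirname, filename, missing_only):
    existing_file = os.path.join(dirname, filename)
    if missing_only and os.path.exists(existing_file):
        logger.info(u"file {} already exists, not downloading".format(existing_file))
        return False
    return True

Binding os.path.join(dirname, filename) to existing_file once keeps the condition readable and lets the log message reuse the same path, which is the whole of this hunk's change besides the added logging.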