#!/usr/bin/env python
#
# yastasoti -- Yet another script to archive stuff off teh internets.
# This work is in the public domain.
#
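# The input files are JSON lists of link records; each record needs at least
# a "url" key and may also carry a "dest_filename" key to override the
# archived filename.  A router file maps fnmatch-style URL patterns to
# destination directories, with "/dev/null" meaning "do not archive".
# The file names below are purely illustrative:
#
#     links.json:   [{"url": "http://example.com/page.html"}]
#     router.json:  {"*example.com/*": "/tmp/archive", "*": "/dev/null"}
#
#     ./yastasoti links.json --archive-via router.json
#     ./yastasoti links.json >failures.json
#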
from argparse import ArgumentParser
from fnmatch import fnmatch
import hashlib
import json
import logging
import os
import sys
from time import sleep, localtime, strftime
try:
from urllib import unquote, quote_plus
except ImportError:
from urllib.parse import unquote, quote_plus
assert unquote and quote_plus  # mark the imports as used (placate linters)
import requests
try:
    from tqdm import tqdm
except ImportError:
    # fall back to a no-op progress wrapper when tqdm is not installed
    def tqdm(x, **kwargs): return x
logger = logging.getLogger("yastasoti")
CHUNK_SIZE = 8192
def url_to_dirname_and_filename(url):
    """Split a URL into a directory name (the percent-encoded domain) and a
    single flat, percent-encoded filename built from the rest of the path."""
parts = url.split(u'/')
parts = parts[2:]
domain_name = parts[0]
domain_name = quote_plus(domain_name)
parts = parts[1:]
filename = u'/'.join(parts)
filename = quote_plus(filename.encode('utf-8'))
if not filename:
filename = 'index.html'
return (domain_name, filename)
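# For illustration (the URL is hypothetical): url_to_dirname_and_filename
# maps 'http://example.com/a/b.html' to ('example.com', 'a%2Fb.html'), i.e.
# one directory per domain with the remaining path percent-encoded into a
# single flat filename; a bare 'http://example.com/' yields 'index.html'.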
def compute_hash(filename):
    """Return the SHA-1 digest (as a hex string) of the named file, read in
    CHUNK_SIZE pieces."""
collector = hashlib.sha1()
with open(filename, 'rb') as f:
while True:
data = f.read(CHUNK_SIZE)
if not data:
break
collector.update(data)
return collector.hexdigest()
def download(url, dirname, filename):
    """Stream `url` into a temporary `_part` file, then either discard it (if
    an identical copy is already archived), archive the old copy under a
    timestamped `_REV` name, or simply move it into place."""
response = requests.get(url, stream=True)
partname = os.path.join(dirname, filename + '_part')
logger.info(u"downloading '{}' to {}".format(url, partname).encode('utf-8'))
with open(partname, "wb") as f:
for data in response.iter_content(chunk_size=CHUNK_SIZE):
f.write(data)
destname = os.path.join(dirname, filename)
if os.path.exists(destname):
logger.info(u"{} exists, computing hashes".format(destname).encode('utf-8'))
desthash = compute_hash(destname)
parthash = compute_hash(partname)
if desthash == parthash:
logger.info(u"hash {} matches, deleting {}".format(desthash, partname).encode('utf-8'))
os.unlink(partname)
else:
logger.info(u"incoming hash {} does not match existing hash {}".format(parthash, desthash).encode('utf-8'))
mtime = os.path.getmtime(destname)
timestring = strftime('%Y.%m%d.%H%M%S', localtime(mtime))
archname = '{}_REV{}'.format(destname, timestring)
logger.info(u"moving {} to {} and {} to {}".format(desthash, archname, partname, destname).encode('utf-8'))
os.rename(destname, archname)
os.rename(partname, destname)
else:
logger.info(u"moving {} to {}".format(partname, destname).encode('utf-8'))
os.rename(partname, destname)
return response
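# When a changed copy of an already-archived file comes in, the old copy is
# kept under a timestamped name; for example (hypothetical time), index.html
# becomes index.html_REV2023.0105.143000 before the new copy takes its place.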
class LinkTraverser(object):
    """Base class that iterates over a list of link dicts and delegates the
    processing of each URL to `handle_link()`, which subclasses implement."""
def __init__(self, links, extant_path=None, ignore_urls=None, delay_between_requests=0.0, fragile=False):
self.links = links
self.extant_path = extant_path or []
self.ignore_urls = ignore_urls or []
self.delay_between_requests = delay_between_requests
self.fragile = fragile
def handle_link(self, link):
"""Given a dict containing a URL under the key `url` (and possibly
other information), process that URL. Should either return None, meaning
it declined to process this URL (for whatever reason), or should return
a dict representing the response from processing the URL, which should
contain (at least) the following keys:
status_code: an integer. 6xx can be used to indicate internal error.
"""
raise NotImplementedError
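    # A minimal sketch of the subclass contract (NullTraverser is
    # hypothetical, not part of this script): it declines URLs ending in
    # '.pdf' and reports success for everything else.
    #
    #     class NullTraverser(LinkTraverser):
    #         def handle_link(self, link):
    #             if link['url'].endswith('.pdf'):
    #                 return None                  # declined: link is skipped
    #             return {'status_code': 200}      # recorded as a success
    #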
    def traverse(self):
        """Process every link, skipping ignored, duplicate, and fragment-only
        URLs, checking relative links against the extant path, and collecting
        one result per processed link in `self.results`."""
        self.results = []
processed_urls = set()
for link in tqdm(self.links, total=len(self.links)):
try:
url = link['url']
logger.info(u"processing '{}'".format(url).encode('utf-8'))
if url in self.ignore_urls:
logger.info(u"URL is being ignored, skipping")
continue
if url in processed_urls:
logger.info(u"URL has already been processed, skipping")
continue
processed_urls.add(url)
if url.startswith(('#',)):
continue
elif not url.startswith(('http://', 'https://')):
if '#' in url:
filename = url.split('#')[0]
else:
filename = url
filename = unquote(filename)
found = False
for extant_dirname in self.extant_path:
extant_filename = os.path.join(extant_dirname, filename)
if os.path.exists(extant_filename):
found = True
break
if not found:
raise ValueError('Local file "{}" not found in extant-path'.format(filename))
continue
else:
response = self.handle_link(link)
if response is None:
continue
except Exception as e:
if self.fragile:
raise
response = {
"status_code": 600,
"error": "{}: {}".format(e.__class__.__name__, e)
}
self.results.append({
'response': response,
'url': url,
'link': link,
})
if self.delay_between_requests > 0.0:
sleep(self.delay_between_requests)
    def failures(self):
        """Yield every collected result whose response status code is not 200."""
for result in self.results:
if result['response']['status_code'] != 200:
yield result
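    # Each collected result (and hence each entry in the failures report that
    # main() prints) has the shape below; the values are only illustrative,
    # and the exact keys inside 'response' depend on the traverser:
    #
    #     {"url": "http://example.com/x",
    #      "link": {"url": "http://example.com/x"},
    #      "response": {"status_code": 404, "url": "http://example.com/x"}}
    #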
class LinkChecker(LinkTraverser):
    """Traverser that issues a HEAD request for each link and records the
    final URL and status code."""
def handle_link(self, link):
url = link['url']
logger.info(u"checking {}".format(url).encode('utf-8'))
response = requests.head(url, allow_redirects=True, headers={
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
})
return {
'url': response.url,
'status_code': response.status_code
}
class LinkArchiver(LinkTraverser):
    """Traverser that downloads each link into a destination directory chosen
    by a router: a mapping from fnmatch-style URL patterns to directory names,
    where the special destination '/dev/null' means "do not archive"."""
def __init__(self, links, router, missing_only=False, **kwargs):
super(LinkArchiver, self).__init__(links, **kwargs)
self.router = router
self.missing_only = missing_only
    def select_dest_dir(self, url):
        """Return the destination directory for `url`, trying the router's
        patterns longest-first so that more specific patterns win."""
        for key in sorted(self.router.keys(), key=len, reverse=True):
if fnmatch(url, key):
return self.router[key]
raise NotImplementedError("archive router could not resolve {}".format(url))
def handle_link(self, link):
url = link['url']
dirname, filename = url_to_dirname_and_filename(url)
if 'dest_filename' in link:
filename = link['dest_filename']
dest_dir = self.select_dest_dir(url)
if dest_dir == '/dev/null':
logger.info(u"{} routed to {}, skipping".format(url, dest_dir).encode('utf-8'))
return {
'status_code': 200
}
dirname = os.path.join(dest_dir, dirname)
logger.info(u"archiving {} to {}/{}".format(url, dirname, filename).encode('utf-8'))
if not os.path.exists(dirname):
os.makedirs(dirname)
existing_file = os.path.join(dirname, filename)
if self.missing_only and os.path.exists(existing_file):
logger.info(u"file {} already exists, not downloading".format(existing_file).encode('utf-8'))
return None
response = download(url, dirname, filename)
return {
'status_code': response.status_code
}
def main(args):
    argparser = ArgumentParser(
        description='Check whether the given links are resolvable, and '
                    'optionally download a copy of each.'
    )
    argparser.add_argument('input_files', nargs='+', metavar='FILENAME', type=str,
help='JSON files containing the links to archive'
)
argparser.add_argument('--archive-to', metavar='DIRNAME', type=str, default=None,
help='Download a copy of each of the links, if changed, to the given directory'
)
argparser.add_argument('--archive-missing-only', action='store_true',
help='When archiving links, only download the link if it is not already archived'
)
argparser.add_argument('--archive-via', metavar='ROUTERFILE', type=str, default=None,
help='Download links and save each in the directory given in the router file'
)
argparser.add_argument('--delay-between-requests', metavar='SECONDS', type=float, default=0.0,
help='Delay (in seconds, fractions allowed) between successive network requests'
)
argparser.add_argument('--extant-path', metavar='DIRNAMES', type=str, default=None,
        help='When finding a relative link that would resolve to a local file, assert that '
'a file by that name exists in this search path, which should be given as a '
'comma-delimited list of directory names'
)
argparser.add_argument('--fragile', action='store_true', default=False,
help='Exit on first error of any nature when processing links'
)
argparser.add_argument('--ignore-urls', metavar='URLS', type=str, default=None,
        help='Comma-separated list of link targets that should not be fetched at all'
)
argparser.add_argument('--log-to', metavar='FILENAME', type=str, default=None,
help='Enable logging and direct the messages to the specified file'
)
options = argparser.parse_args(args)
if options.log_to:
logging.basicConfig(level=logging.INFO, filename=options.log_to)
links = []
for filename in options.input_files:
if filename == '-':
data = json.loads(sys.stdin.read())
else:
with open(filename, 'r') as f:
data = json.loads(f.read())
links.extend(data)
common_kwargs = dict(
extant_path=None if options.extant_path is None else options.extant_path.split(','),
ignore_urls=[] if options.ignore_urls is None else options.ignore_urls.split(','),
delay_between_requests=options.delay_between_requests,
fragile=options.fragile,
)
if options.archive_to and options.archive_via:
raise NotImplementedError("Specify either --archive-to or --archive-via, not both")
router = None
if options.archive_to:
router = {
"*": options.archive_to
}
elif options.archive_via:
with open(options.archive_via, 'r') as f:
router = json.loads(f.read())
if router:
traverser = LinkArchiver(
links, router, missing_only=options.archive_missing_only,
**common_kwargs
)
else:
traverser = LinkChecker(links, **common_kwargs)
traverser.traverse()
result = list(traverser.failures())
sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))
if __name__ == '__main__':
main(sys.argv[1:])