#!/usr/bin/env python

#
# yastasoti -- Yet another script to archive stuff off teh internets.
# This work is in the public domain.
#

from argparse import ArgumentParser
from fnmatch import fnmatch
import hashlib
import json
import logging
import os
import sys
from time import sleep, localtime, strftime
try:
    from urllib import unquote, quote_plus
except ImportError:
    from urllib.parse import unquote, quote_plus
assert unquote and quote_plus

import requests
try:
    from tqdm import tqdm
except ImportError:
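    # tqdm (progress bars) is optional; fall back to a pass-through iterator.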
    def tqdm(x, **kwargs): return x


logger = logging.getLogger("yastasoti")


CHUNK_SIZE = 8192


def url_to_dirname_and_filename(url):
    parts = url.split(u'/')
    parts = parts[2:]
    domain_name = parts[0]
    domain_name = quote_plus(domain_name)
    parts = parts[1:]
    filename = u'/'.join(parts)
    filename = quote_plus(filename.encode('utf-8'))
    if not filename:
        filename = 'index.html'
    return (domain_name, filename)


def compute_hash(filename):
    collector = hashlib.sha1()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(CHUNK_SIZE)
            if not data:
                break
            collector.update(data)
    return collector.hexdigest()


def download(url, dirname, filename):
    response = requests.get(url, stream=True)
    partname = os.path.join(dirname, filename + '_part')
    logger.info(u"downloading '{}' to {}".format(url, partname).encode('utf-8'))
    with open(partname, "wb") as f:
        for data in response.iter_content(chunk_size=CHUNK_SIZE):
            f.write(data)
    destname = os.path.join(dirname, filename)
    if os.path.exists(destname):
        logger.info(u"{} exists, computing hashes".format(destname).encode('utf-8'))
        desthash = compute_hash(destname)
        parthash = compute_hash(partname)
        if desthash == parthash:
            logger.info(u"hash {} matches, deleting {}".format(desthash, partname).encode('utf-8'))
            os.unlink(partname)
        else:
            logger.info(u"incoming hash {} does not match existing hash {}".format(parthash, desthash).encode('utf-8'))
            mtime = os.path.getmtime(destname)
            timestring = strftime('%Y.%m%d.%H%M%S', localtime(mtime))
            archname = '{}_REV{}'.format(destname, timestring)
            logger.info(u"moving {} to {} and {} to {}".format(desthash, archname, partname, destname).encode('utf-8'))
            os.rename(destname, archname)
            os.rename(partname, destname)
    else:
        logger.info(u"moving {} to {}".format(partname, destname).encode('utf-8'))
        os.rename(partname, destname)
    return response


class LinkTraverser(object):
    def __init__(self, links, extant_path=None, ignore_urls=None, delay_between_requests=0.0, fragile=False):
        self.links = links
        self.extant_path = extant_path or []
        self.ignore_urls = ignore_urls or []
        self.delay_between_requests = delay_between_requests
        self.fragile = fragile

    def handle_link(self, link):
        """Given a dict containing a URL under the key `url` (and possibly
        other information), process that URL.  Should either return None, meaning
        it declined to process this URL (for whatever reason), or should return
        a dict representing the response from processing the URL, which should
        contain (at least) the following keys:

            status_code: an integer.  6xx can be used to indicate internal error.

        """
        raise NotImplementedError
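        # A minimal subclass might look like the following hypothetical
        # sketch (LinkChecker and LinkArchiver below are the concrete
        # implementations actually used by this script):
        #
        #     class StatusOnlyChecker(LinkTraverser):
        #         def handle_link(self, link):
        #             response = requests.head(link['url'])
        #             return {'status_code': response.status_code}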

    def traverse(self):
        self.results = []
        processed_urls = set()
        for link in tqdm(self.links, total=len(self.links)):
            try:
                url = link['url']
                logger.info(u"processing '{}'".format(url).encode('utf-8'))
                if url in self.ignore_urls:
                    logger.info(u"URL is being ignored, skipping")
                    continue
                if url in processed_urls:
                    logger.info(u"URL has already been processed, skipping")
                    continue
                processed_urls.add(url)
                if url.startswith(('#',)):
                    continue
                elif not url.startswith(('http://', 'https://')):
                    if '#' in url:
                        filename = url.split('#')[0]
                    else:
                        filename = url
                    filename = unquote(filename)
                    found = False
                    for extant_dirname in self.extant_path:
                        extant_filename = os.path.join(extant_dirname, filename)
                        if os.path.exists(extant_filename):
                            found = True
                            break
                    if not found:
                        raise ValueError('Local file "{}" not found in extant-path'.format(filename))
                    continue
                else:
                    response = self.handle_link(link)
                if response is None:
                    continue
            except Exception as e:
                if self.fragile:
                    raise
                response = {
                    "status_code": 600,
                    "error": "{}: {}".format(e.__class__.__name__, e)
                }
            self.results.append({
                'response': response,
                'url': url,
                'link': link,
            })
            if self.delay_between_requests > 0.0:
                sleep(self.delay_between_requests)

    def failures(self):
        for result in self.results:
            if result['response']['status_code'] != 200:
                yield result


class LinkChecker(LinkTraverser):
    def handle_link(self, link):
        url = link['url']
        logger.info(u"checking {}".format(url).encode('utf-8'))
        response = requests.head(url, allow_redirects=True, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
        })
        return {
            'url': response.url,
            'status_code': response.status_code
        }


class LinkArchiver(LinkTraverser):
    def __init__(self, links, router, missing_only=False, **kwargs):
        super(LinkArchiver, self).__init__(links, **kwargs)
        self.router = router
        self.missing_only = missing_only

    def select_dest_dir(self, url):
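        # The router maps fnmatch-style URL patterns to destination
        # directories; longer (more specific) patterns are tried first, and
        # a destination of "/dev/null" is treated as a successful no-op
        # (nothing is downloaded).  A hypothetical router might look like:
        #
        #     {
        #         "https://example.com/*": "/archive/example",
        #         "*": "/archive/everything-else"
        #     }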
        for key in sorted(self.router.keys(), key=len, reverse=True):
            if fnmatch(url, key):
                return self.router[key]
        raise NotImplementedError("archive router could not resolve {}".format(url))

    def handle_link(self, link):
        url = link['url']
        dirname, filename = url_to_dirname_and_filename(url)
        if 'dest_filename' in link:
            filename = link['dest_filename']
        dest_dir = self.select_dest_dir(url)
        if dest_dir == '/dev/null':
            logger.info(u"{} routed to {}, skipping".format(url, dest_dir).encode('utf-8'))
            return {
                'status_code': 200
            }
        dirname = os.path.join(dest_dir, dirname)
        logger.info(u"archiving {} to {}/{}".format(url, dirname, filename).encode('utf-8'))
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        existing_file = os.path.join(dirname, filename)
        if self.missing_only and os.path.exists(existing_file):
            logger.info(u"file {} already exists, not downloading".format(existing_file).encode('utf-8'))
            return None
        response = download(url, dirname, filename)
        return {
            'status_code': response.status_code
        }


def main(args):
    argparser = ArgumentParser()

    # Checks if the links are resolvable, and optionally downloads a copy of each
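    #
    # Example invocations (hypothetical filenames; assumes the script is on
    # the PATH as "yastasoti"):
    #
    #     yastasoti links.json
    #     yastasoti --archive-to=downloads links.json
    #     cat links.json | yastasoti --archive-via=router.json -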

    argparser.add_argument('input_files', nargs='+', metavar='FILENAME', type=str,
        help='JSON files containing the links to archive'
    )

    argparser.add_argument('--archive-to', metavar='DIRNAME', type=str, default=None,
        help='Download a copy of each of the links, if changed, to the given directory'
    )
    argparser.add_argument('--archive-missing-only', action='store_true',
        help='When archiving links, only download the link if it is not already archived'
    )
    argparser.add_argument('--archive-via', metavar='ROUTERFILE', type=str, default=None,
        help='Download links and save each in the directory given in the router file'
    )
    argparser.add_argument('--delay-between-requests', metavar='SECONDS', type=float, default=0.0,
        help='Delay (in seconds, fractions allowed) between successive network requests'
    )
    argparser.add_argument('--extant-path', metavar='DIRNAMES', type=str, default=None,
        help='When finding a relative link that would resolve to a local file, assert that '
             'a file by that name exists in this search path, which should be given as a '
             'comma-delimited list of directory names'
    )
    argparser.add_argument('--fragile', action='store_true', default=False,
        help='Exit on first error of any nature when processing links'
    )
    argparser.add_argument('--ignore-urls', metavar='URLS', type=str, default=None,
        help='Comma-separated list of link targets that the script should not even try to fetch'
    )
    argparser.add_argument('--log-to', metavar='FILENAME', type=str, default=None,
        help='Enable logging and direct the messages to the specified file'
    )

    options = argparser.parse_args(args)

    if options.log_to:
        logging.basicConfig(level=logging.INFO, filename=options.log_to)

    links = []
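    # Each input file (or standard input, when the filename is "-") should
    # contain a JSON list of link objects, for example (hypothetical):
    #
    #     [{"url": "http://example.com/page.html", "dest_filename": "page.html"}]
    #
    # where "dest_filename" is optional and overrides the archived filename.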
    for filename in options.input_files:
        if filename == '-':
            data = json.loads(sys.stdin.read())
        else:
            with open(filename, 'r') as f:
                data = json.loads(f.read())
        links.extend(data)

    common_kwargs = dict(
        extant_path=None if options.extant_path is None else options.extant_path.split(','),
        ignore_urls=[] if options.ignore_urls is None else options.ignore_urls.split(','),
        delay_between_requests=options.delay_between_requests,
        fragile=options.fragile,
    )

    if options.archive_to and options.archive_via:
        raise NotImplementedError("Specify either --archive-to or --archive-via, not both")

    router = None
    if options.archive_to:
        router = {
            "*": options.archive_to
        }
    elif options.archive_via:
        with open(options.archive_via, 'r') as f:
            router = json.loads(f.read())

    if router:
        traverser = LinkArchiver(
            links, router, missing_only=options.archive_missing_only,
            **common_kwargs
        )
    else:
        traverser = LinkChecker(links, **common_kwargs)

    traverser.traverse()
    result = list(traverser.failures())
    sys.stdout.write(json.dumps(result, indent=4, sort_keys=True))


if __name__ == '__main__':
    main(sys.argv[1:])