git @ Cat's Eye Technologies yastasoti / ec38d56
Merge pull request #1 from catseye/develop-0.2 ("Develop 0.2"). Chris Pressey authored 6 years ago; GitHub committed 6 years ago.
2 changed file(s) with 35 addition(s) and 11 deletion(s). Raw diff Collapse all Expand all
00 yastasoti
11 =========
22
3 _Version 0.1_
3 _Version 0.2_
44
55 Yet another script to archive stuff off teh internets.
66
4646 EOF
4747 yastasoti --archive-to=downloads links.json
4848
49 #### Override the filename the stuff is archived as ####
50
51 By default, the subdirectory and filename to which the stuff is archived are
52 based on the site's domain name and the stuff's path. The filename (but not
53 the subdirectory) can be overridden by adding a `dest_filename` field to the link's entry in the input JSON.
54
55 cat >links.json << EOF
56 [
57 {
58 "url": "http://catseye.tc/",
59 "dest_filename": "home_page.html"
60 }
61 ]
62 EOF
63 yastasoti --archive-to=downloads links.json
64
4965 #### Categorize archived materials with a router ####
5066
5167 An archive router (used with `--archive-via`) is a JSON file that looks like this:
2828 logger = logging.getLogger("yastasoti")
2929
3030
31 CHUNK_SIZE = 8192
32
33
3134 def url_to_dirname_and_filename(url):
3235 parts = url.split(u'/')
3336 parts = parts[2:]
4548 collector = hashlib.sha1()
4649 with open(filename, 'rb') as f:
4750 while True:
48 data = f.read(1024)
51 data = f.read(CHUNK_SIZE)
4952 if not data:
5053 break
5154 collector.update(data)
5760 partname = os.path.join(dirname, filename + '_part')
5861 logger.info(u"downloading '{}' to {}".format(url, partname).encode('utf-8'))
5962 with open(partname, "wb") as f:
60 for data in response.iter_content():
63 for data in response.iter_content(chunk_size=CHUNK_SIZE):
6164 f.write(data)
6265 destname = os.path.join(dirname, filename)
6366 if os.path.exists(destname):
8992 self.delay_between_requests = delay_between_requests
9093 self.fragile = fragile
9194
92 def handle_link(self, url):
93 """Given a URL, process that URL. Should either return None, meaning
95 def handle_link(self, link):
96 """Given a dict containing a URL under the key `url` (and possibly
97 other information), process that URL. Should either return None, meaning
9498 it declined to process this URL (for whatever reason), or should return
9599 a dict representing the response from processing the URL, which should
96 contain the following keys:
100 contain (at least) the following keys:
97101
98102 status_code: an integer. 6xx can be used to indicate internal error.
99103
127131 raise ValueError('Local file "{}" does not exist'.format(filename))
128132 continue
129133 else:
130 response = self.handle_link(url)
134 response = self.handle_link(link)
131135 if response is None:
132136 continue
133137 except Exception as e:
152156
153157
154158 class LinkChecker(LinkTraverser):
155 def handle_link(self, url):
159 def handle_link(self, link):
160 url = link['url']
156161 logger.info(u"checking {}".format(url).encode('utf-8'))
157162 response = requests.head(url, allow_redirects=True, headers={
158163 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
175180 return self.router[key]
176181 raise NotImplementedError("archive router could not resolve {}".format(url))
177182
178 def handle_link(self, url):
183 def handle_link(self, link):
184 url = link['url']
179185 dirname, filename = url_to_dirname_and_filename(url)
186 if 'dest_filename' in link:
187 filename = link['dest_filename']
180188 dest_dir = self.select_dest_dir(url)
181189 if dest_dir == '/dev/null':
182190 logger.info(u"{} routed to {}, skipping".format(url, dest_dir).encode('utf-8'))
184192 'status_code': 200
185193 }
186194 dirname = os.path.join(dest_dir, dirname)
187 logger.info(u"archiving {} to {}".format(url, dirname).encode('utf-8'))
195 logger.info(u"archiving {} to {}/{}".format(url, dirname, filename).encode('utf-8'))
188196 if not os.path.exists(dirname):
189197 os.makedirs(dirname)
190198 existing_file = os.path.join(dirname, filename)
231239 help='Enable logging and direct the messages to the specified file'
232240 )
233241
234 options = argparser.parse_args(sys.argv[1:])
242 options = argparser.parse_args(args)
235243
236244 if options.log_to:
237245 logging.basicConfig(level=logging.INFO, filename=options.log_to)