@@ -28,6 +28,9 @@
 logger = logging.getLogger("yastasoti")
 
 
+CHUNK_SIZE = 8192
+
+
 def url_to_dirname_and_filename(url):
     parts = url.split(u'/')
     parts = parts[2:]
@@ -45,7 +48,7 @@
     collector = hashlib.sha1()
     with open(filename, 'rb') as f:
         while True:
-            data = f.read(1024)
+            data = f.read(CHUNK_SIZE)
             if not data:
                 break
             collector.update(data)
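The hunk above replaces the hard-coded 1024-byte read with the new CHUNK_SIZE constant. For reference, the whole chunked-hashing pattern in isolation looks roughly like the sketch below; the helper name sha1_of_file is assumed for illustration and may not match the surrounding function in yastasoti.

# A minimal, self-contained sketch of the chunked-hashing pattern shown above.
# The helper name sha1_of_file is an assumption, not necessarily the project's.
import hashlib

CHUNK_SIZE = 8192

def sha1_of_file(filename):
    # Read the file in fixed-size chunks so memory stays bounded
    # no matter how large the archived file is.
    collector = hashlib.sha1()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(CHUNK_SIZE)
            if not data:
                break
            collector.update(data)
    return collector.hexdigest()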
@@ -57,7 +60,7 @@
     partname = os.path.join(dirname, filename + '_part')
     logger.info(u"downloading '{}' to {}".format(url, partname).encode('utf-8'))
     with open(partname, "wb") as f:
-        for data in response.iter_content():
+        for data in response.iter_content(chunk_size=CHUNK_SIZE):
             f.write(data)
     destname = os.path.join(dirname, filename)
     if os.path.exists(destname):
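Passing chunk_size to iter_content matters because, without it, requests yields the body one byte at a time. A hedged sketch of the surrounding pattern, assuming the response was opened with stream=True; the function name download_to and the final rename are illustrative, not the project's exact code, which handles an existing destination differently as shown in the context above.

# Sketch of the streaming-download pattern, assuming response comes from
# requests.get(url, stream=True); download_to and os.rename are illustrative.
import os
import requests

CHUNK_SIZE = 8192

def download_to(url, dirname, filename):
    response = requests.get(url, stream=True)
    partname = os.path.join(dirname, filename + '_part')
    with open(partname, "wb") as f:
        # Without chunk_size, iter_content() yields one byte per iteration;
        # a larger chunk keeps loop overhead negligible on big files.
        for data in response.iter_content(chunk_size=CHUNK_SIZE):
            f.write(data)
    destname = os.path.join(dirname, filename)
    os.rename(partname, destname)
    return destname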
@@ -89,11 +92,12 @@
         self.delay_between_requests = delay_between_requests
         self.fragile = fragile
 
-    def handle_link(self, url):
-        """Given a URL, process that URL. Should either return None, meaning
+    def handle_link(self, link):
+        """Given a dict containing a URL under the key `url` (and possibly
+        other information), process that URL. Should either return None, meaning
         it declined to process this URL (for whatever reason), or should return
         a dict representing the response from processing the URL, which should
-        contain the following keys:
+        contain (at least) the following keys:
 
         status_code: an integer. 6xx can be used to indicate internal error.
 
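To make the revised contract concrete: handle_link now receives a dict with the URL under 'url' and returns either None or a dict carrying at least 'status_code'. A hedged sketch of a subclass honoring that contract follows; the subclass name is hypothetical, and only the link-dict argument and the status_code key come from the docstring above.

# Hypothetical subclass illustrating the new handle_link(link) contract.
import requests

class StatusLogger(LinkTraverser):
    def handle_link(self, link):
        url = link['url']
        try:
            response = requests.head(url, allow_redirects=True)
        except requests.RequestException:
            # 6xx is the range the docstring reserves for internal errors.
            return {'status_code': 600}
        return {'status_code': response.status_code}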
@@ -127,7 +131,7 @@
                         raise ValueError('Local file "{}" does not exist'.format(filename))
                     continue
                 else:
-                    response = self.handle_link(url)
+                    response = self.handle_link(link)
                 if response is None:
                     continue
             except Exception as e:
@@ -152,7 +156,8 @@
 
 
 class LinkChecker(LinkTraverser):
-    def handle_link(self, url):
+    def handle_link(self, link):
+        url = link['url']
         logger.info(u"checking {}".format(url).encode('utf-8'))
         response = requests.head(url, allow_redirects=True, headers={
             'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
@@ -175,16 +180,19 @@
                 return self.router[key]
         raise NotImplementedError("archive router could not resolve {}".format(url))
 
-    def handle_link(self, url):
+    def handle_link(self, link):
+        url = link['url']
         dirname, filename = url_to_dirname_and_filename(url)
+        if 'dest_filename' in link:
+            filename = link['dest_filename']
         dest_dir = self.select_dest_dir(url)
         if dest_dir == '/dev/null':
             logger.info(u"{} routed to {}, skipping".format(url, dest_dir).encode('utf-8'))
             return {
                 'status_code': 200
             }
         dirname = os.path.join(dest_dir, dirname)
-        logger.info(u"archiving {} to {}".format(url, dirname).encode('utf-8'))
+        logger.info(u"archiving {} to {}/{}".format(url, dirname, filename).encode('utf-8'))
         if not os.path.exists(dirname):
             os.makedirs(dirname)
         existing_file = os.path.join(dirname, filename)
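The new dest_filename key lets an individual link entry override the filename that url_to_dirname_and_filename() derives from the URL. A small usage illustration follows; the URLs and filenames are invented for the example, and only 'url' is required.

# Hypothetical link entries showing the new optional key.
link_with_default = {'url': 'http://example.com/reports/2018'}
link_with_override = {
    'url': 'http://example.com/reports/2018',
    'dest_filename': 'report-2018.html',
}

for link in (link_with_default, link_with_override):
    dirname, filename = url_to_dirname_and_filename(link['url'])
    if 'dest_filename' in link:
        # An explicit destination filename wins over the one derived from the URL.
        filename = link['dest_filename']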
@@ -231,7 +239,7 @@
         help='Enable logging and direct the messages to the specified file'
     )
 
-    options = argparser.parse_args(sys.argv[1:])
+    options = argparser.parse_args(args)
 
     if options.log_to:
         logging.basicConfig(level=logging.INFO, filename=options.log_to)
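Taking args as a parameter instead of reading sys.argv inside the function makes the command-line entry point directly callable from tests. A hedged sketch of the shape this enables; the function name main and the --log-to flag spelling are inferred from options.log_to, not confirmed by the diff.

# Sketch of an args-parameterized entry point; main and --log-to are assumptions.
import argparse
import logging
import sys

def main(args):
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        '--log-to', metavar='FILENAME', type=str, default=None,
        help='Enable logging and direct the messages to the specified file'
    )
    # Parsing an explicit list means tests can call main([...]) directly.
    options = argparser.parse_args(args)
    if options.log_to:
        logging.basicConfig(level=logging.INFO, filename=options.log_to)
    return options

if __name__ == '__main__':
    main(sys.argv[1:])

A test can then invoke main(['--log-to', 'test.log']) without patching sys.argv.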