import os
import urllib.parse

from bs4 import BeautifulSoup
import markdown
import requests

try:
    from tqdm import tqdm
except ImportError:
    # Fall back to a no-op wrapper when tqdm is not installed.
    def tqdm(x, **kwargs):
        return x


def extract_links(html_text):
    """Return all absolute http(s) links found in an HTML fragment."""
    links = []
    soup = BeautifulSoup(html_text, 'html.parser')
    for link in soup.find_all('a'):
        url = link.get('href')
        # Skip anchors with no href as well as relative or non-HTTP links.
        if not url or not url.startswith(('http://', 'https://')):
            print('skipping url', url)
            continue
        links.append(url)
    return links


def extract_links_from_documents(documents):
    """Collect (url, section) pairs for every image and markdown link in the documents."""
    links = []
    for document in documents:
        for section in document.sections:
            for (name, url) in section.images:
                links.append((url, section))
            for key, value in section.properties.items():
                if isinstance(value, list):
                    for subitem in value:
                        links.extend([(url, section) for url in extract_links(markdown.markdown(subitem))])
                else:
                    links.extend([(url, section) for url in extract_links(markdown.markdown(value))])
            links.extend([(url, section) for url in extract_links(markdown.markdown(section.body))])
    return links


def url_to_dirname_and_filename(url):
    """Map a URL to a (directory, file) pair, e.g.
    'https://example.com/img/a.png' -> ('example.com', 'img%2Fa.png')."""
    parts = url.split('/')
    # Drop the scheme ('https:') and the empty string after '//'.
    parts = parts[2:]
    domain_name = parts[0]
    domain_name = urllib.parse.quote_plus(domain_name)
    # Everything after the domain collapses into a single quoted filename.
    parts = parts[1:]
    filename = '/'.join(parts)
    filename = urllib.parse.quote_plus(filename)
    return (domain_name, filename)


def download(url, filename):
    """Stream url to filename, writing to a temporary '_part' file first."""
    response = requests.get(url, stream=True)
    part_filename = filename + '_part'
    with open(part_filename, "wb") as f:
        for data in response.iter_content(chunk_size=8192):
            f.write(data)
    # Only move the file into place once the download has completed.
    os.rename(part_filename, filename)
    return response


def archive_links(documents, dest_dir):
    """If dest_dir is None, links will only be checked for existence, not downloaded."""
    links = extract_links_from_documents(documents)

    failures = []
    for url, section in tqdm(links, total=len(links)):
        try:
            if dest_dir is not None:
                dirname, filename = url_to_dirname_and_filename(url)
                dirname = os.path.join(dest_dir, dirname)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                filename = os.path.join(dirname, filename)
                response = download(url, filename)
            else:
                response = requests.head(url)
            status = response.status_code
        except Exception as e:
            # Network errors are recorded as the exception message instead of a status code.
            status = str(e)
        if status not in (200, 301, 302, 303):
            failures.append({
                'status': status,
                'url': url,
                'section': section.title,
            })
    return failures
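

if __name__ == '__main__':
    # Small self-check sketch, not part of the original module: it exercises the
    # two helpers that need no document objects. archive_links() itself expects
    # documents exposing .sections / .images / .properties / .body / .title and
    # is not demonstrated here.
    sample_html = '<a href="https://example.com/img/a.png">img</a><a href="/local">local</a>'
    print(extract_links(sample_html))  # expected: ['https://example.com/img/a.png']
    print(url_to_dirname_and_filename('https://example.com/img/a.png'))  # expected: ('example.com', 'img%2Fa.png')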