Cat's Eye Technologies / Feedmark @ a19e0c4
Merge pull request #1 from catseye/check-links: --archive-links-to and --check-links options.
Chris Pressey authored and GitHub committed, 7 years ago.
4 changed files with 105 additions and 0 deletions.
* dump an inverted index of each property found, and its entries
* write out an Atom (née RSS) feed containing the parsed entries
* parse all of the "Items of Note" lists in The Dossier
* archive locally all the web objects linked to in the entries

Example Feedmark documents can be found in the `eg/` directory.
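
For example, one might run `feedmark --check-links eg/*.md` to verify that linked web objects exist, or `feedmark --archive-links-to=downloads eg/*.md` to download a local copy of each of them. The flags come from this change, but the `feedmark` command name and the positional Markdown-document arguments are assumptions about how the CLI is invoked. `--check-links` only issues HEAD requests and reports any failures as JSON, while `--archive-links-to` also saves each linked object into the named directory.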

atomize==0.2.0
Markdown==2.6.8
beautifulsoup4==4.6.0
requests==2.17.3
import os
import urllib

from bs4 import BeautifulSoup
import markdown
import requests

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, **kwargs): return x


def extract_links(html_text):

    links = []
    soup = BeautifulSoup(html_text, 'html.parser')
    for link in soup.find_all('a'):
        url = link.get('href')
        if not url.startswith(('http://', 'https://')):
            print('skipping url', url)
            continue
        links.append(url)

    return links


def extract_links_from_documents(documents):
    links = []
    for document in documents:
        for section in document.sections:
            for (name, url) in section.images:
                links.append((url, section))
            for key, value in section.properties.iteritems():
                if isinstance(value, list):
                    for subitem in value:
                        links.extend([(url, section) for url in extract_links(markdown.markdown(subitem))])
                else:
                    links.extend([(url, section) for url in extract_links(markdown.markdown(value))])
            links.extend([(url, section) for url in extract_links(markdown.markdown(section.body))])
    return links


def url_to_dirname_and_filename(url):
    parts = url.split('/')
    parts = parts[2:]
    domain_name = parts[0]
    domain_name = urllib.quote_plus(domain_name)
    parts = parts[1:]
    filename = '/'.join(parts)
    filename = urllib.quote_plus(filename)
    return (domain_name, filename)
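
# Worked example (illustrative values):
#     url_to_dirname_and_filename('http://example.com/docs/page.html')
# drops the 'http:' scheme and empty component, quotes the host and the
# remainder of the path separately, and returns
#     ('example.com', 'docs%2Fpage.html')
# so each archived object is stored as a single flat file under a
# per-domain directory.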


def download(url, filename):
    response = requests.get(url, stream=True)
    part_filename = filename + '_part'
    with open(part_filename, "wb") as f:
        for data in response.iter_content():
            f.write(data)
    os.rename(part_filename, filename)
    return response


def archive_links(documents, dest_dir):
    """If dest_dir is None, links will only be checked for existence, not downloaded."""
    links = extract_links_from_documents(documents)

    failures = []
    for url, section in tqdm(links, total=len(links)):
        try:
            if dest_dir is not None:
                dirname, filename = url_to_dirname_and_filename(url)
                dirname = os.path.join(dest_dir, dirname)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                filename = os.path.join(dirname, filename)
                response = download(url, filename)
            else:
                response = requests.head(url)
            status = response.status_code
        except Exception as e:
            status = str(e)
        if status not in (200, 301, 302, 303):
            failures.append({
                'status': status,
                'url': url,
                'section': section.title
            })
    return failures
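
For orientation, here is a minimal sketch of driving the checker above directly from Python (Python 2, to match the `iteritems` call). The stub classes, the entry title, and the example URL are invented for illustration; only the `archive_links(documents, dest_dir)` signature, the check-only behaviour when `dest_dir` is `None`, and the `status`/`url`/`section` keys of each failure record come from the code above.

```python
from feedmark.checkers import archive_links

# Minimal stand-ins exposing only the attributes that
# extract_links_from_documents() reads from parsed Feedmark documents.
class StubSection(object):
    def __init__(self, title, body):
        self.title = title
        self.body = body              # Markdown text; links in it get checked
        self.images = []              # list of (name, url) pairs
        self.properties = {}          # property name -> string or list of strings

class StubDocument(object):
    def __init__(self, sections):
        self.sections = sections

section = StubSection(
    'Example entry',
    'See [the spec](http://example.com/spec.html) for details.',
)
document = StubDocument([section])

# dest_dir=None: links are only checked (HEAD requests), not downloaded.
# Anything not answering 200/301/302/303 comes back as a failure record.
failures = archive_links([document], None)
for failure in failures:
    print(failure['status'], failure['url'], failure['section'])
```

Passing a directory name instead of `None` exercises the download path instead, which is what the new `--archive-links-to` option below wires up.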

from argparse import ArgumentParser
import codecs
import json
import sys

from feedmark.atomizer import feedmark_atomize

...

    )
    argparser.add_argument('--dump-entries', action='store_true',
        help='Display a summary of the entries on standard output'
    )
    argparser.add_argument('--archive-links-to', metavar='DIRNAME', type=str, default=None,
        help='Download a copy of all web objects linked to from the entries'
    )
    argparser.add_argument('--check-links', action='store_true',
        help='Check if web objects linked to from the entries exist'
    )
    argparser.add_argument('--output-atom', metavar='FILENAME', type=str,
        help='Construct an Atom XML feed from the entries and write it out to this file'

...

    def write(s):
        print(s.encode('utf-8'))

    if options.check_links or options.archive_links_to is not None:
        from feedmark.checkers import archive_links
        result = archive_links(documents, options.archive_links_to)
        write(json.dumps(result, indent=4))

    if options.dump_entries:
        for document in documents: