git @ Cat's Eye Technologies Feedmark / master src / feedmark / parser.py
master

Tree @master (Download .tar.gz)

parser.py @masterraw · history · blame

#!/usr/bin/env python

# Copyright (c) 2019-2024 Chris Pressey, Cat's Eye Technologies
# This file is distributed under an MIT license.  See LICENSES/ directory.
# SPDX-License-Identifier: LicenseRef-MIT-X-Feedmark

# encoding: UTF-8

from datetime import datetime
from collections import OrderedDict
import re

from feedmark.formats.markdown import markdown_to_html5, markdown_to_html5_deep
from feedmark.utils import quote


def rewrite_reference_links(refdex, reference_links):
    new_reference_links = []
    seen_names = set()
    for (name, url) in reference_links:
        if name in seen_names:
            continue
        seen_names.add(name)
        if name in refdex:
            entry = refdex[name]
            if 'filename' in entry and 'anchor' in entry:
                filename = quote(entry['filename'].encode('utf-8'))
                anchor = quote(entry['anchor'].encode('utf-8'))
                url = u'{}#{}'.format(filename, anchor)
            elif 'filenames' in entry and 'anchor' in entry:
                # pick the last one, for compatibility with single-refdex style
                filename = quote(entry['filenames'][-1].encode('utf-8'))
                anchor = quote(entry['anchor'].encode('utf-8'))
                url = u'{}#{}'.format(filename, anchor)
            elif 'url' in entry:
                url = entry['url']
            else:
                raise ValueError("Badly formed refdex entry: {}".format(entry))
        new_reference_links.append((name, url))
    return new_reference_links


class Document(object):
    def __init__(self, title):
        self.title = title
        self.properties = OrderedDict()

        self.preamble = None
        self.sections = []

    def __str__(self):
        return "document '{}'".format(self.title.encode('utf-8'))

    def rewrite_reference_links(self, refdex):
        self.reference_links = rewrite_reference_links(refdex, self.reference_links)
        for section in self.sections:
            section.reference_links = rewrite_reference_links(refdex, section.reference_links)

    def global_reference_links(self):
        reference_links = []
        reference_links.extend(self.reference_links)
        for section in self.sections:
            reference_links.extend(section.reference_links)
        return reference_links

    def to_json_data(self, **kwargs):

        if kwargs.get('htmlize', False):
            if 'reference_links' not in kwargs:
                kwargs['reference_links'] = self.global_reference_links()
            preamble = markdown_to_html5(self.preamble, reference_links=kwargs['reference_links'])
            properties = markdown_to_html5_deep(self.properties, reference_links=kwargs['reference_links'])
        else:
            preamble = self.preamble
            properties = self.properties

        if kwargs.get('ordered', False):
            properties_list = []
            for key, value in properties.items():
                properties_list.append([key, value])
            properties = properties_list
        else:
            properties = dict(properties)

        return {
            'filename': self.filename,
            'title': self.title,
            'properties': properties,
            'preamble': preamble,
            'sections': [s.to_json_data(**kwargs) for s in self.sections],
        }


class Section(object):
    def __init__(self, title):
        self.document = None
        self.title = title
        self.properties = OrderedDict()

        self.lines = []

    def __str__(self):
        s = "section '{}'".format(self.title.encode('utf-8'))
        if self.document:
            s += " of " + str(self.document)
        return s

    @property
    def body(self):
        return '\n'.join(self.lines)

    @property
    def publication_date(self):
        formats = (
            "%b %d %Y %H:%M:%S",
            "%a, %d %b %Y %H:%M:%S GMT",
        )
        for format in formats:
            try:
                return datetime.strptime(self.properties['date'], format)
            except KeyError:
                raise KeyError("could not find 'date' on {}".format(self))
            except ValueError:
                pass
        raise NotImplementedError

    @property
    def anchor(self):
        return re.sub(
            r'[-\s]',
            '-',
            re.sub(r'[^\w\s-]', '', self.title).strip().lower()
        )

    def to_json_data(self, **kwargs):

        if kwargs.get('htmlize', False):
            body = markdown_to_html5(self.body, reference_links=kwargs['reference_links'])
            properties = markdown_to_html5_deep(self.properties, reference_links=kwargs['reference_links'])
        else:
            body = self.body
            properties = self.properties

        if kwargs.get('ordered', False):
            properties_list = []
            for key, value in properties.items():
                properties_list.append([key, value])
            properties = properties_list
        else:
            properties = dict(properties)

        return {
            'title': self.title,
            'anchor': self.anchor,
            'images': self.images,
            'properties': properties,
            'body': body,
        }


class Parser(object):

    def __init__(self, doc):
        self.lines = doc.split('\n')
        self.index = 0
        self.line = self.lines[self.index]
        self.index += 1

    def scan(self):
        self.line = self.lines[self.index]
        self.index += 1

    def eof(self):
        return self.index > len(self.lines) - 1

    def is_blank_line(self):
        return re.match(r'^\s*$', self.line)

    def is_image_line(self):
        return (
            re.match(r'^\!\[.*?\]\(.*?\)\s*$', self.line) or
            re.match(r'^\[\!\[.*?\]\(.*?\)\]\(.*?\)\s*$', self.line)
        )

    def is_property_line(self):
        return re.match(r'^\*\s+(.*?)\s*(\:|\@)\s*(.*?)\s*$', self.line)

    def is_heading_line(self):
        return re.match(r'^\#\#\#\s+(.*?)\s*$', self.line)

    def is_reference_link_line(self):
        return re.match(r'^\[(.*?)\]\:\s*(.*?)\s*$', self.line)

    def parse_document(self):
        # Feed       ::= :Title Properties Body {Section}.
        # Section    ::= {:Blank} :Heading {Image} Properties Body.
        # Properties ::= {:Blank | :Property}.
        # Body       ::= {:NonHeadingLine}.

        title = self.parse_title()
        document = Document(title)
        document.properties = self.parse_properties()
        lines, reference_links = self.parse_body()
        document.preamble = u'\n'.join(lines)
        document.reference_links = reference_links
        while not self.eof():
            section = self.parse_section()
            section.document = document
            document.sections.append(section)
        return document

    def parse_title(self):
        match = re.match(r'^\#\s+(.*?)\s*$', self.line)
        if match:
            title = match.group(1)
            self.scan()
            return title
        match = re.match(r'^\s*(.*?)\s*$', self.line)
        if match:
            title = match.group(1)
            self.scan()
            match = re.match(r'^\s*(\=+)\s*$', self.line)
            if match:
                self.scan()
                return title
        raise ValueError('Expected title')

    def parse_property(self):
        match = re.match(r'^\*\s+(.*?)\s*\@\s*(.*?)\s*$', self.line)
        if match:
            (key, val) = (match.group(1), match.group(2))
            self.scan()
            return ('@', key, val)
        match = re.match(r'^\*\s+(.*?)\s*\:\s*(.*?)\s*$', self.line)
        if match:
            (key, val) = (match.group(1), match.group(2))
            self.scan()
            return (':', key, val)
        raise ValueError('Expected property')

    def parse_properties(self):
        properties = OrderedDict()
        while self.is_blank_line() or self.is_property_line():
            if self.is_property_line():
                kind, key, val = self.parse_property()
                if kind == ':':
                    if key in properties:
                        raise KeyError('{} already given'.format(key))
                    properties[key] = val
                elif kind == '@':
                    properties.setdefault(key, []).append(val)
                else:
                    raise NotImplementedError(kind)
            else:
                self.scan()
        return properties

    def parse_section(self):
        while self.is_blank_line():
            self.scan()

        match = re.match(r'^\#\#\#\s+(.*?)\s*(\#\#\#)?\s*$', self.line)
        if not match:
            raise ValueError('Expected section, found "{}"'.format(self.line))

        section = Section(match.group(1))
        self.scan()
        section.images = self.parse_images()
        section.properties = self.parse_properties()
        lines, reference_links = self.parse_body()
        section.lines = lines
        section.reference_links = reference_links
        return section

    def parse_images(self):
        images = []
        while self.is_blank_line() or self.is_image_line():
            if self.is_image_line():
                match = re.match(r'^\!\[(.*?)\]\((.*?)\)\s*$', self.line)
                if match:
                    images.append({
                        'description': match.group(1),
                        'source': match.group(2),
                    })
                else:
                    match = re.match(r'^\[\!\[(.*?)\]\((.*?)\)\]\((.*?)\)\s*$', self.line)
                    images.append({
                        'description': match.group(1),
                        'source': match.group(2),
                        'link': match.group(3),
                    })
            self.scan()
        return images

    def parse_body(self):
        lines = []
        reference_links = []
        while not self.eof() and not self.is_heading_line() and not self.is_reference_link_line():
            lines.append(self.line)
            self.scan()
        while not self.eof() and (self.is_reference_link_line() or self.is_blank_line()):
            if self.is_reference_link_line():
                match = re.match(r'^\[(.*?)\]\:\s*(.*?)\s*$', self.line)
                if match:
                    reference_links.append((match.group(1), match.group(2)))
            self.scan()
        return (lines, reference_links)


if __name__ == "__main__":
    import doctest
    doctest.testmod()