#!/usr/bin/env python3

import struct
import sys

# Inefficient-but-public-domain Commodore BASIC 2.0 tokenizer.
# This work is in the public domain, covered under the UNLICENSE;
# see the file UNLICENSE in the root directory of this distribution,
# or http://www.unlicense.org/ for full details.

# references:
#   http://justsolve.archiveteam.org/wiki/Commodore_BASIC_tokenized_file
#   http://www.c64-wiki.com/index.php/BASIC_token
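#
# usage sketch (file names here are only illustrative): the tokenizer reads
# plain-text BASIC from standard input and writes a tokenized .prg to
# standard output, with the load address defaulting to 0x0801:
#
#     ./hatoucan < program.bas > program.prg
#     ./hatoucan -l 1001 < program.bas > program.prg    # hex load address override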

DEBUG = False

TOKENS = (
    ('restore', 140),
    ('input#', 132),
    ('return', 142),
    ('verify', 149),
    ('print#', 152),
    ('right$', 201),
    ('input', 133),
    ('gosub', 141),
    ('print', 153),
    ('close', 160),
    ('left$', 200),
    ('next', 130),
    ('data', 131),
    ('read', 135),
    ('goto', 137),
    ('stop', 144),
    ('wait', 146),
    ('load', 147),
    ('save', 148),
    ('poke', 151),
    ('cont', 154),
    ('list', 155),
    ('open', 159),
    ('tab(', 163),
    ('spc(', 166),
    ('then', 167),
    ('step', 169),
    ('peek', 194),
    ('str$', 196),
    ('chr$', 199),
    ('mid$', 202),
    ('end', 128),
    ('for', 129),
    ('dim', 134),
    ('let', 136),
    ('run', 138),
    ('rem', 143),
    ('def', 150),
    ('clr', 156),
    ('cmd', 157),
    ('sys', 158),
    ('get', 161),
    ('new', 162),
    ('not', 168),
    ('and', 175),
    ('sgn', 180),
    ('int', 181),
    ('abs', 182),
    ('usr', 183),
    ('fre', 184),
    ('pos', 185),
    ('sqr', 186),
    ('rnd', 187),
    ('log', 188),
    ('exp', 189),
    ('cos', 190),
    ('sin', 191),
    ('tan', 192),
    ('atn', 193),
    ('len', 195),
    ('val', 197),
    ('asc', 198),
    ('if', 139),
    ('on', 145),
    ('to', 164),
    ('fn', 165),
    ('or', 176),
    ('go', 203),
    ('+', 170),
    ('-', 171),
    ('*', 172),
    ('/', 173),
    ('^', 174),
    ('>', 177),
    ('=', 178),
    ('<', 179),
)
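# Note: scan() below tries TOKENS in order, so longer keywords must appear
# before their prefixes (e.g. 'print#' before 'print', 'input#' before
# 'input', and 'goto'/'gosub' before 'go'), as they do above.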

SPECIAL = (
    ('{rvs off}',  0x92),
    ('{rvof}',     0x92),
    ('{SHIFT-@}',  0xba),
    ('{rvs on}',   0x12),
    ('{rvon}',     0x12),
    ('{CBM-+}',    0xa6),
    ('{CBM-E}',    0xb1),
    ('{CBM-R}',    0xb2),
    ('{CBM-T}',    0xa3),
    ('{down}',     0x11),
    ('{home}',     0x13),
    ('{lblu}',     0x9a),
    ('{left}',     0x9d),
    ('{rght}',     0x1d),
    ('{blk}',      0x90),
    ('{blu}',      0x1f),
    ('{clr}',      0x93),
    ('{cyn}',      0x9f),
    ('{grn}',      0x1e),
    ('{pur}',      0x9c),
    ('{red}',      0x1c),
    ('{wht}',      0x05),
    ('{yel}',      0x9e),
    ('{up}',       0x91),
)


def ascii_to_petscii(o):  # int -> int
    if o <= ord('@') or o in (ord('['), ord(']')):
        return o
    if o >= ord('a') and o <= ord('z'):
        return o - ord('a') + 0x41
    if o >= ord('A') and o <= ord('Z'):
        # adding 0x61 alone would be a plausible PETSCII offset, but tokenized
        # programs appear to store shifted letters at 0xc1-0xda, hence the
        # extra 0x60.
        return o - ord('A') + 0x61 + 0x60
    # TODO:
    # pound sign? 0x5c
    # up arrow? 0x5e
    # left arrow? 0x5f
    # hatched box? 0x60
    # pi? 0x7e
    # upper right triangle? 0x7f
    raise NotImplementedError("Cannot PETSCII: %s" % hex(o))


def scan(s, tokenize=True):
    # so inefficient.  I don't care.
    if tokenize:
        for (token, value) in TOKENS:
            if s.startswith(token):
                return (value, s[len(token):])
    if s[0] == '{':
        for (token, value) in SPECIAL:
            if s.startswith(token):
                return (value, s[len(token):])
        raise NotImplementedError(s)
    return (ascii_to_petscii(ord(s[0])), s[1:])


def scan_line_number(s):
    s = s.lstrip()
    acc = []
    while s and s[0].isdigit():
        acc.append(s[0])
        s = s[1:]
    return (int(''.join(acc)), s.lstrip())


def tokenize(s):
    (line_number, s) = scan_line_number(s)
    bytes = []
    in_quotes = False
    in_remark = False
    while s:
        (byte, s) = scan(s, tokenize=not (in_quotes or in_remark))
        bytes.append(byte)
        if byte == ord('"'):
            in_quotes = not in_quotes
        if byte == 143:
            in_remark = True
    return (line_number, bytes)
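# A small worked example of tokenize(), traced from the code above:
#
#     tokenize('10 print "hi"')
#     ->  (10, [153, 32, 34, 72, 73, 34])
#
# PRINT becomes token 153; once the opening quote (34) is seen, in_quotes
# suppresses keyword matching, so 'h' and 'i' go through ascii_to_petscii.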


def packbyte(byte):
    return struct.pack('B', byte)


def write_word(f, word):
    """f being a file-like object, word being an integer 0-65535"""
    low = word & 255
    high = (word >> 8) & 255
    f.write(packbyte(low))
    f.write(packbyte(high))
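# For example, write_word(f, 0x0801) writes the byte 0x01 and then 0x08:
# words go out low byte first, as the tokenized-file format expects.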


class TokenizedLine(object):
    def __init__(self, s, addr):
        (line_number, bytes) = tokenize(s)
        self.line_number = line_number
        self.bytes = bytes
        self.addr = addr
        self.next_addr = None

    def __str__(self):
        return '{} @{}: {}'.format(self.line_number, self.addr, self.bytes)

    def __len__(self):
        return len(self.bytes) + 5

    def write_to(self, f):
        """f being a file-like object"""
        assert self.next_addr is not None
        write_word(f, self.next_addr)
        write_word(f, self.line_number)
        for byte in self.bytes:
            f.write(packbyte(byte))
        f.write(packbyte(0))


class Sentinel(object):
    def __init__(self, addr):
        self.addr = addr
        self.next_addr = None

    def __len__(self):
        return 2

    def write_to(self, f):
        """f being a file-like object"""
        write_word(f, 0)


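# The overall output produced by main() below: a two-byte load address,
# then one record per line (a TokenizedLine: two-byte pointer to the next
# line, two-byte line number, the tokenized bytes, a 0x00 terminator), and
# finally a Sentinel's two zero bytes marking the end of the program.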
def main(argv):
    start_addr = 0x0801

    # parse command line
    while argv:
        switch = argv.pop(0)
        if switch == '-l':
            start_addr = int(argv.pop(0), 16)
        else:
            raise NotImplementedError(switch)

    # set sys.stdout to binary mode for Windows folks
    # reference: http://code.activestate.com/recipes/65443-sending-binary-data-to-stdout-under-windows/
    if sys.platform == "win32":
        import os, msvcrt
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

    # tokenize all lines of input, and terminate with a sentinel
    tokenized_lines = []
    addr = start_addr
    for line in sys.stdin:
        if not line.strip():
            continue
        tokenized_line = TokenizedLine(line.rstrip(), addr)
        addr += len(tokenized_line)
        if DEBUG:
            sys.stderr.write('{}\n'.format(tokenized_line))
        tokenized_lines.append(tokenized_line)
    tokenized_lines.append(Sentinel(addr))

    # make second pass to resolve each line's pointer to start of next line
    i = 1
    while i < len(tokenized_lines):
        tokenized_lines[i - 1].next_addr = tokenized_lines[i].addr
        i += 1

    # obtain a file-like object for stdout to which we can write bytes
    try:
        # Python 3
        outfile = sys.stdout.buffer
    except AttributeError:
        # Python 2
        outfile = sys.stdout

    # write tokenized lines to output
    write_word(outfile, start_addr)
    for tokenized_line in tokenized_lines:
        tokenized_line.write_to(outfile)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))