git @ Cat's Eye Technologies Tamsin / fafd601
Unescape lit strs in parser, not in scanner; not yet in tamsin. Cat's Eye Technologies 11 years ago
5 changed file(s) with 54 addition(s) and 24 deletion(s). Raw diff Collapse all Expand all
00 TODO
11 ----
22
3 * `$:deep_reverse`
3 * unescape scanned atoms/""'s in tamsin_parser -- `$:unescape`?
4 * `$:unescape` must support \xXX codes
45 * use Tamsin repr in error messages
6 * __str__ should be Tamsin repr()?
57 * `$:substr` and/or `$:atom_to_list`
8 * should not really need `$:substr` if we implement `@`... just parse it!
69 * finish implementing compiler (to C) for a subset of Tamsin, in Tamsin
710 * `$:alpha`
811 * `$:digit`
9 * remove `$.alnum`
10 * `$:add`, `$:sub`, `$:mul`, `$:div`, `$:rem`, for atoms which look like
11 integers: `["-"] & {$:digit}`.
12 * `$:tell` and `$:seek` the implicit buffer -- for VM's etc -- although
13 note, this may have scary consequences when combined with backtracking
14 * non-backtracking versions of `|` and `{}`: `|!` and `{}!`
12 * remove `$.alnum`?
1513 * scanner should probably not be responsible for escaping characters;
1614 the token `"\n"` should turn into the term whose repr is `'"\\n"'`.
1715 ("scanner sanity")
16 * analyzer → fold all rules with same name into a linked list
1817
1918 ### testing ###
2019
2524
2625 ### lower-priority ###
2726
27 * `$:add`, `$:sub`, `$:mul`, `$:div`, `$:rem`, for atoms which look like
28 integers: `["-"] & {$:digit}`.
29 * `$:tell` and `$:seek` the implicit buffer -- for VMs etc. -- although
30 note, this may have scary consequences when combined with backtracking
31 * non-backtracking versions of `|` and `{}`: `|!` and `{}!`
32 * `$:deep_reverse`
2833 * find different way to match variables in libtamsin, so that
2934 struct term's can be const all the way down — then share terms
3035 * and maybe even garbage-collect them, ooh.
409409
410410 Note that in the above, "printable" means ASCII characters between 32 ` `
411411 (space) and 126 `~`. It is not dependent on locale.
412
413 Also, `\xXX` escapes will always be output in lowercase, e.g. `\x0a`, not
414 `\x0A`.
412415
413416 The input to a Tamsin production is, in fact, an atom (although it's hardly
414417 atomic; "atom" is sort of a quaint moniker for the role these objects play.)
10891092
10901093 | main = $:repr(a(b(c('qu\'are\\')))).
10911094 = a(b(c('qu\'are\\')))
1095
1096 | main = $:repr('\x99').
1097 = '\x99'
10921098
10931099 Here's `$:reverse`, which takes a term E, and a term of the form
10941100 `X(a, X(b, ... X(z, E)) ... )`, and returns a term of the form
1313 | "/" | "," | "." | "@" | "+" | "$" | "→" | "←" | "«" | "»".
1414 str(Q) = «Q» → T & {(escape | !«Q» & any) → S & T ← T + S} & «Q» &
1515 return T + Q.
16 # TODO: don't unescape these here yet -- scanner should return what it sees
17 # TODO: when you do unescape them -- handle `\xXX` sequences too
1618 escape = "\\" & "n" & '\n'
1719 | "\\" & "r" & '\r'
1820 | "\\" & "t" & '\t'
161161 self.expect('}')
162162 return While(e)
163163 elif self.peek()[0] == '"':
164 s = self.consume_any()[1:-1]
164 s = unescape(self.consume_any()[1:-1])
165165 return Call(Prodref('$', 'expect'), [Atom(s)], None)
166166 elif self.consume(u'«') or self.consume('<<'):
167167 t = self.texpr()
257257 self.peek()[0] == "'"):
258258 atom = self.consume_any()
259259 if atom[0] in ('\'',):
260 atom = atom[1:-1]
260 atom = unescape(atom[1:-1])
261261 subs = []
262262 if self.consume('('):
263263 if self.peek() != ')':
270270 return Atom(atom)
271271 else:
272272 self.error('term')
273
# Maps the character following a backslash to the character it denotes.
# `\xXX` escapes are not listed here; `unescape` decodes them separately.
ESCAPE_SEQUENCE = {
    'r': "\r",
    'n': "\n",
    't': "\t",
    "'": "'",
    '"': '"',
    '\\': '\\',
}

# Characters accepted in the two-digit payload of a `\xXX` escape.
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def unescape(s):
    """Return `s` with its backslash escape sequences decoded.

    Recognized escapes are the single-character ones listed in
    ESCAPE_SEQUENCE plus `\\xXX`, where XX is exactly two hexadecimal
    digits (either case).  All other characters are copied unchanged.

    Raises ValueError when `s` ends with a lone backslash (the exception
    carries `s` itself), when a backslash is followed by an unknown
    character, or when `\\x` is not followed by two hex digits.
    """
    out = []
    i = 0
    n = len(s)
    while i < n:
        char = s[i]
        if char == '\\':
            i += 1
            if i == n:
                # Backslash with nothing after it.
                raise ValueError(s)
            char = s[i]
            if char in ESCAPE_SEQUENCE:
                char = ESCAPE_SEQUENCE[char]
            elif char == 'x':
                # Slice instead of indexing so a truncated escape yields a
                # short string rather than an IndexError, and check the
                # digits explicitly because int() would also accept a sign
                # or surrounding whitespace.
                digits = s[i + 1:i + 3]
                if len(digits) != 2 or not all(d in _HEX_DIGITS for d in digits):
                    raise ValueError("bad escape")
                i += 2
                char = chr(int(digits, 16))
            else:
                raise ValueError("bad escape")
        out.append(char)
        i += 1
    return ''.join(out)
195195 '\'': '\'',
196196 }
197197
198 ESCAPE_SEQUENCE = {
199 'r': "\r",
200 'n': "\n",
201 't': "\t",
202 "'": "'",
203 '"': '"',
204 '\\': '\\',
205 }
206
207198 class TamsinScannerEngine(ScannerEngine):
208199 def scan_impl(self, scanner):
209200 while not scanner.is_at_eof() and scanner.startswith(('#', ' ', '\t', '\r', '\n')):
242233 not scanner.startswith((CLOSE_QUOTE[quote],))):
243234 char = scanner.chop(1)
244235 if char == '\\':
236 token += char
245237 char = scanner.chop(1)
246 if char in ESCAPE_SEQUENCE:
247 char = ESCAPE_SEQUENCE[char]
248 elif char == 'x':
249 char = chr(int(scanner.chop(2), 16))
250 else:
251 scanner.error('legal escape sequence')
252238 token += char
253239 scanner.chop(1) # chop ending quote
254240 # we add the specific close quote we expect, in case it was EOF