Unescape literal strings in parser, not in scanner; not yet in tamsin.
Cat's Eye Technologies
11 years ago
0 | 0 | TODO |
1 | 1 | ---- |
2 | 2 | |
3 | * `$:deep_reverse` | |
3 | * unescape scanned atoms/""'s in tamsin_parser -- `$:unescape`? | |
4 | * `$:unescape` must support \xXX codes | |
4 | 5 | * use Tamsin repr in error messages |
6 | * `__str__` should be Tamsin `repr()`? | |
5 | 7 | * `$:substr` and/or `$:atom_to_list` |
8 | * should not really need `$:substr` if we implement `@`... just parse it! | |
6 | 9 | * finish implementing compiler (to C) for a subset of Tamsin, in Tamsin |
7 | 10 | * `$:alpha` |
8 | 11 | * `$:digit` |
9 | * remove `$.alnum` | |
10 | * `$:add`, `$:sub`, `$:mul`, `$:div`, `$:rem`, for atoms which look like | |
11 | integers: `["-"] & {$:digit}`. | |
12 | * `$:tell` and `$:seek` the implicit buffer -- for VM's etc -- although | |
13 | note, this may have scary consequences when combined with backtracking | |
14 | * non-backtracking versions of `|` and `{}`: `|!` and `{}!` | |
12 | * remove `$.alnum`? | |
15 | 13 | * scanner should probably not be responsible for escaping characters; |
16 | 14 | the token `"\n"` should turn into the term whose repr is `'"\\n"'`. |
17 | 15 | ("scanner sanity") |
16 | * analyzer → fold all rules with same name into a linked list | |
18 | 17 | |
19 | 18 | ### testing ### |
20 | 19 | |
25 | 24 | |
26 | 25 | ### lower-priority ### |
27 | 26 | |
27 | * `$:add`, `$:sub`, `$:mul`, `$:div`, `$:rem`, for atoms which look like | |
28 | integers: `["-"] & {$:digit}`. | |
29 | * `$:tell` and `$:seek` the implicit buffer -- for VM's etc -- although | |
30 | note, this may have scary consequences when combined with backtracking | |
31 | * non-backtracking versions of `|` and `{}`: `|!` and `{}!` | |
32 | * `$:deep_reverse` | |
28 | 33 | * find different way to match variables in libtamsin, so that |
29 | 34 | `struct term`s can be const all the way down — then share terms |
30 | 35 | * and maybe even garbage-collect them, ooh. |
409 | 409 | |
410 | 410 | Note that in the above, "printable" means ASCII characters between 32 ` ` |
411 | 411 | (space) and 126 `~`. It is not dependent on locale. |
412 | ||
413 | Also, `\xXX` escapes will always be output in lowercase, e.g. `\x0a`, not | |
414 | `\x0A`. | |
412 | 415 | |
413 | 416 | The input to a Tamsin production is, in fact, an atom (although it's hardly |
414 | 417 | atomic; "atom" is sort of a quaint moniker for the role these objects play.) |
1089 | 1092 | |
1090 | 1093 | | main = $:repr(a(b(c('qu\'are\\')))). |
1091 | 1094 | = a(b(c('qu\'are\\'))) |
1095 | ||
1096 | | main = $:repr('\x99'). | |
1097 | = '\x99' | |
1092 | 1098 | |
1093 | 1099 | Here's `$:reverse`, which takes a term E, and a term of the form |
1094 | 1100 | `X(a, X(b, ... X(z, E)) ... )`, and returns a term of the form |
13 | 13 | | "/" | "," | "." | "@" | "+" | "$" | "→" | "←" | "«" | "»". |
14 | 14 | str(Q) = «Q» → T & {(escape | !«Q» & any) → S & T ← T + S} & «Q» & |
15 | 15 | return T + Q. |
16 | # TODO: don't unescape these here yet -- scanner should return what it sees | |
17 | # TODO: when you do unescape them -- handle `\xXX` sequences too | |
16 | 18 | escape = "\\" & "n" & '\n' |
17 | 19 | | "\\" & "r" & '\r' |
18 | 20 | | "\\" & "t" & '\t' |
161 | 161 | self.expect('}') |
162 | 162 | return While(e) |
163 | 163 | elif self.peek()[0] == '"': |
164 | s = self.consume_any()[1:-1] | |
164 | s = unescape(self.consume_any()[1:-1]) | |
165 | 165 | return Call(Prodref('$', 'expect'), [Atom(s)], None) |
166 | 166 | elif self.consume(u'«') or self.consume('<<'): |
167 | 167 | t = self.texpr() |
257 | 257 | self.peek()[0] == "'"): |
258 | 258 | atom = self.consume_any() |
259 | 259 | if atom[0] in ('\'',): |
260 | atom = atom[1:-1] | |
260 | atom = unescape(atom[1:-1]) | |
261 | 261 | subs = [] |
262 | 262 | if self.consume('('): |
263 | 263 | if self.peek() != ')': |
270 | 270 | return Atom(atom) |
271 | 271 | else: |
272 | 272 | self.error('term') |
273 | ||
274 | ESCAPE_SEQUENCE = { | |
275 | 'r': "\r", | |
276 | 'n': "\n", | |
277 | 't': "\t", | |
278 | "'": "'", | |
279 | '"': '"', | |
280 | '\\': '\\', | |
281 | } | |
282 | ||
283 | def unescape(s): | |
284 | t = '' | |
285 | i = 0 | |
286 | while i < len(s): | |
287 | char = s[i] | |
288 | if char == '\\': | |
289 | i += 1 | |
290 | if i == len(s): | |
291 | raise ValueError(s) | |
292 | char = s[i] | |
293 | if char in ESCAPE_SEQUENCE: | |
294 | char = ESCAPE_SEQUENCE[char] | |
295 | elif char == 'x': | |
296 | k = s[i + 1] + s[i + 2] | |
297 | i += 2 | |
298 | char = chr(int(k, 16)) | |
299 | else: | |
300 | raise ValueError("bad escape") | |
301 | t += char | |
302 | i += 1 | |
303 | return t |
195 | 195 | '\'': '\'', |
196 | 196 | } |
197 | 197 | |
198 | ESCAPE_SEQUENCE = { | |
199 | 'r': "\r", | |
200 | 'n': "\n", | |
201 | 't': "\t", | |
202 | "'": "'", | |
203 | '"': '"', | |
204 | '\\': '\\', | |
205 | } | |
206 | ||
207 | 198 | class TamsinScannerEngine(ScannerEngine): |
208 | 199 | def scan_impl(self, scanner): |
209 | 200 | while not scanner.is_at_eof() and scanner.startswith(('#', ' ', '\t', '\r', '\n')): |
242 | 233 | not scanner.startswith((CLOSE_QUOTE[quote],))): |
243 | 234 | char = scanner.chop(1) |
244 | 235 | if char == '\\': |
236 | token += char | |
245 | 237 | char = scanner.chop(1) |
246 | if char in ESCAPE_SEQUENCE: | |
247 | char = ESCAPE_SEQUENCE[char] | |
248 | elif char == 'x': | |
249 | char = chr(int(scanner.chop(2), 16)) | |
250 | else: | |
251 | scanner.error('legal escape sequence') | |
252 | 238 | token += char |
253 | 239 | scanner.chop(1) # chop ending quote |
254 | 240 | # we add the specific close quote we expect, in case it was EOF |