Fix pathological memory use of scanner on large input files.
Chris Pressey
6 years ago
5 | 5 | self.text = text |
6 | 6 | self.token = None |
7 | 7 | self.type = None |
8 | self.pos = 0 | |
8 | 9 | self.scan() |
9 | 10 | |
10 | def scan_pattern(self, pattern, type, token_group=1, rest_group=2): | |
11 | pattern = r'^(' + pattern + r')(.*?)$' | |
12 | match = re.match(pattern, self.text, re.DOTALL) | |
11 | def scan_pattern(self, pattern, type_, token_group=1): | |
12 | pattern = r'(' + pattern + r')' | |
13 | regexp = re.compile(pattern, flags=re.DOTALL) | |
14 | match = regexp.match(self.text, pos=self.pos) | |
13 | 15 | if not match: |
14 | 16 | return False |
15 | 17 | else: |
16 | self.type = type | |
18 | self.type = type_ | |
17 | 19 | self.token = match.group(token_group) |
18 | self.text = match.group(rest_group) | |
19 | #print self.type, self.token | |
20 | self.pos += len(match.group(0)) | |
20 | 21 | return True |
21 | 22 | |
22 | 23 | def scan(self): |
23 | 24 | self.scan_pattern(r'[ \t\n\r]*', 'whitespace') |
24 | 25 | while self.text.startswith('/*'): |
25 | 26 | self.scan_pattern(r'\/\*.*?\*\/[ \t\n\r]*', 'comment') |
26 | if not self.text: | |
27 | if self.pos >= len(self.text): | |
27 | 28 | self.token = None |
28 | 29 | self.type = 'EOF' |
29 | 30 | return |
40 | 41 | return |
41 | 42 | if self.scan_pattern(r'\d+', 'integer literal'): |
42 | 43 | return |
43 | if self.scan_pattern(r'\"(.*?)\"', 'string literal', | |
44 | token_group=2, rest_group=3): | |
44 | if self.scan_pattern(r'\"(.*?)\"', 'string literal', token_group=2): | |
45 | 45 | return |
46 | 46 | if self.scan_pattern(r'[a-zA-Z][a-zA-Z0-9]*', 'identifier'): |
47 | 47 | return |
48 | 48 | if self.scan_pattern(r'.', 'unknown character'): |
49 | 49 | return |
50 | 50 | else: |
51 | raise ValueError("this should never happen, " | |
52 | "self.text=(%s)" % self.text) | |
51 | raise AssertionError( | |
52 | "this should never happen, self.text=(%s), self.pos=(%s)" % | |
53 | (self.text, self.pos) | |
54 | ) | |
53 | 55 | |
54 | 56 | def scan_playfield(self): |
55 | 57 | """Called when the token which introduces the playfield has |
60 | 62 | self.scan_pattern(r'[\n\r]', 'eol') |
61 | 63 | elems = [] |
62 | 64 | y = 0 |
63 | while self.text: | |
65 | while self.pos < len(self.text): | |
64 | 66 | x = 0 |
65 | 67 | while self.scan_pattern(r'[^\n\r]', 'arbitrary character'): |
66 | 68 | #print repr((x, y, self.token)) |