git @ Cat's Eye Technologies ALPACA / fffcad2
Fix pathological memory use of scanner on large input files. (Chris Pressey, 6 years ago)
1 changed file with 14 additions and 12 deletions. [Raw diff] [Collapse all] [Expand all]
55 self.text = text
66 self.token = None
77 self.type = None
8 self.pos = 0
89 self.scan()
910
10 def scan_pattern(self, pattern, type, token_group=1, rest_group=2):
11 pattern = r'^(' + pattern + r')(.*?)$'
12 match = re.match(pattern, self.text, re.DOTALL)
11 def scan_pattern(self, pattern, type_, token_group=1):
12 pattern = r'(' + pattern + r')'
13 regexp = re.compile(pattern, flags=re.DOTALL)
14 match = regexp.match(self.text, pos=self.pos)
1315 if not match:
1416 return False
1517 else:
16 self.type = type
18 self.type = type_
1719 self.token = match.group(token_group)
18 self.text = match.group(rest_group)
19 #print self.type, self.token
20 self.pos += len(match.group(0))
2021 return True
2122
2223 def scan(self):
2324 self.scan_pattern(r'[ \t\n\r]*', 'whitespace')
2425 while self.text.startswith('/*'):
2526 self.scan_pattern(r'\/\*.*?\*\/[ \t\n\r]*', 'comment')
26 if not self.text:
27 if self.pos >= len(self.text):
2728 self.token = None
2829 self.type = 'EOF'
2930 return
4041 return
4142 if self.scan_pattern(r'\d+', 'integer literal'):
4243 return
43 if self.scan_pattern(r'\"(.*?)\"', 'string literal',
44 token_group=2, rest_group=3):
44 if self.scan_pattern(r'\"(.*?)\"', 'string literal', token_group=2):
4545 return
4646 if self.scan_pattern(r'[a-zA-Z][a-zA-Z0-9]*', 'identifier'):
4747 return
4848 if self.scan_pattern(r'.', 'unknown character'):
4949 return
5050 else:
51 raise ValueError("this should never happen, "
52 "self.text=(%s)" % self.text)
51 raise AssertionError(
52 "this should never happen, self.text=(%s), self.pos=(%s)" %
53 (self.text, self.pos)
54 )
5355
5456 def scan_playfield(self):
5557 """Called when the token which introduces the playfield has
6062 self.scan_pattern(r'[\n\r]', 'eol')
6163 elems = []
6264 y = 0
63 while self.text:
65 while self.pos < len(self.text):
6466 x = 0
6567 while self.scan_pattern(r'[^\n\r]', 'arbitrary character'):
6668 #print repr((x, y, self.token))