git @ Cat's Eye Technologies T-Rext / 0.1
Initial import of T-Rext sources. Chris Pressey 4 years ago
8 changed file(s) with 413 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 *.pyc
0 syntax: glob
1
2 *.pyc
0 T-Rext
1 ======
2
3 T-Rext is a command-line filter that attempts to clean up spaces and
4 punctuation in a text file. It exists so that, when you are writing
5 a text generator, such as a Markov processor, you need not worry too much
6 about its output format; just toss its output through T-Rext when you're
7 done to make it more presentable.
8
9 Usage
10 -----
11
12 bin/t-rext raw_output.txt > cleaned_output.txt
13
14 This will take lines that look like this:
15
16 " Well , " said the king , , " no . "
17
18 and reformat them to look like this:
19
20 “Well,” said the king, “no.”
21
22 To use T-Rext from any working directory, add the `bin` directory in this
23 repository to your `PATH`. For example, you might add this line to your
24 `.bashrc`:
25
26 export PATH=/path/to/this/repo/bin:$PATH
27
28 T-Rext is built on an over-engineered library of pipeline processors, which
29 you can use directly (note that its interface is not stable and is liable to change).
30 To use the T-Rext Python modules in other Python programs, make sure the
31 `src` directory of this repository is on your `PYTHONPATH`. For example,
32 you might add this line to your `.bashrc`:
33
34 export PYTHONPATH=/path/to/this/repo/src:$PYTHONPATH
35
36 Then you can add imports like this to the top of your script:
37
38 from t_rext.processors import TrailingWhitespaceProcessor
39
40 An easy way to accomplish the above two things is to dock T-Rext using
41 [toolshelf][]:
42
43 toolshelf dock gh:catseye/t-rext
44
45 Tests
46 -----
47
48 This is a test suite, written in [Falderal][] format, for the `t-rext`
49 utility. It also serves as documentation for said utility.
50
51 -> Functionality "Clean up punctuation and spaces" is implemented by
52 -> shell command "bin/t-rext %(test-body-file)"
53
54 -> Tests for functionality "Clean up punctuation and spaces"
55
56 Spaces before commas and periods are elided.
57
58 | Well , that is good .
59 = Well, that is good.
60
61 Multiple commas are collapsed into a single comma.
62
63 | Well , , that is good .
64 = Well, that is good.
65
66 Multiple periods are not collapsed into a single period.
67
68 | Well . . . that is good.
69 = Well... that is good.
70
71 Quotes are oriented.
72
73 | "Yes," he said.
74 = “Yes,” he said.
75
76 Spaces after opening quotes and before closing quotes are elided.
77
78 | " Yes , " he said.
79 = “Yes,” he said.
80
81 But not the other way 'round.
82
83 | Muttering "Yes," he turned around.
84 = Muttering “Yes,” he turned around.
85
86 Quotes do not match across paragraphs.
87
88 | Turbid "Waters" that "leak.
89 |
90 | You "don't" have a clue.
91 = Turbid “Waters” that “leak.
92 =
93 = You “don't” have a clue.
94
95 [Falderal]: http://catseye.tc/node/Falderal
96 [toolshelf]: http://catseye.tc/node/toolshelf
0 This is free and unencumbered software released into the public domain.
1
2 Anyone is free to copy, modify, publish, use, compile, sell, or
3 distribute this software, either in source code form or as a compiled
4 binary, for any purpose, commercial or non-commercial, and by any
5 means.
6
7 In jurisdictions that recognize copyright laws, the author or authors
8 of this software dedicate any and all copyright interest in the
9 software to the public domain. We make this dedication for the benefit
10 of the public at large and to the detriment of our heirs and
11 successors. We intend this dedication to be an overt act of
12 relinquishment in perpetuity of all present and future rights to this
13 software under copyright law.
14
15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 OTHER DEALINGS IN THE SOFTWARE.
22
23 For more information, please refer to <http://unlicense.org/>
#!/usr/bin/env python

"""Usage: t-rext [OPTIONS] FILES

Adjusts spacing (and other factors) to make a generated text more presentable.
"""

from os.path import realpath, dirname, join
import sys

# Make the sibling `src` directory importable; this has to happen before
# the `t_rext` import further down.
script_dir = dirname(realpath(sys.argv[0]))
sys.path.insert(0, join(script_dir, '..', 'src'))

# ----------------------------------------------------------------- #

import codecs
from optparse import OptionParser
import sys

from t_rext.processors import (
    TidyPunctuationLineFilter,
    QuoteOrienterLineFilter,
    LinesToParagraphsRegrouper,
)


optparser = OptionParser(__doc__.strip())
(options, args) = optparser.parse_args(sys.argv[1:])

for filename in args:
    with codecs.open(filename, 'r', encoding='UTF-8') as infile:
        # Each group (a paragraph or a run of blank lines) gets its own
        # quote orienter, so quote state never carries across groups.
        for group in LinesToParagraphsRegrouper(infile):
            oriented = QuoteOrienterLineFilter(group)
            for tidied in TidyPunctuationLineFilter(oriented):
                sys.stdout.write(tidied.encode('UTF-8') + '\n')
(New empty file)
0 # encoding: UTF-8
1
2 import re
3
4
class Processor(object):
    """Abstract base class laying out the Processor protocol.

    A Processor wraps an iterable and is itself meant to be iterated;
    concrete subclasses supply `__iter__`.
    """

    def __init__(self, iterable):
        """Wrap `iterable`, whose items this Processor will turn into
        other items when iterated.

        Input and output items need not be of the same type.  A
        file-like object qualifies as an iterable of lines.
        """
        self.errors = []
        self._iterable = iterable

    @property
    def iterable(self):
        """Lazily yield each wrapped item, passing it through
        `check_input_value` first.
        """
        for item in self._iterable:
            self.check_input_value(item)
            yield item

    def check_input_value(self, value):
        """Validation hook called on every input item; accepts anything
        here.
        """
        return None

    def has_failed(self, original, result):
        """Report whether this Processor should be considered to have
        failed, given its input (`original`) and output (`result`)
        iterables.  The base implementation always returns False.
        """
        return False

    def __iter__(self):
        raise NotImplementedError

    def __str__(self):
        return type(self).__name__
40
41
class LineProcessor(Processor):
    """A Processor whose input items must each be a `unicode` string."""

    def check_input_value(self, value):
        # Reject anything that is not a unicode string (byte strings
        # included).  Being an assert, this check vanishes under `-O`.
        assert isinstance(value, unicode)
46
47
class TrailingWhitespaceProcessor(LineProcessor):
    """Yields every input line with its trailing whitespace removed."""

    def __iter__(self):
        return (text.rstrip() for text in self.iterable)
53
54
class SentinelProcessor(LineProcessor):
    """Passes through only the lines lying strictly between a start
    sentinel and an end sentinel.

    The start sentinel is "super-exclusive": the sentinel line itself,
    every line after it up to the next blank line, and that blank line
    are all dropped.  The end sentinel line is dropped too, as is
    everything after it.

    Sentinels are recognised by matching `self.START_RE` and
    `self.END_RE` against the upper-cased line.  Neither pattern is
    defined on this class, so they must be supplied elsewhere (for
    instance by a subclass).

    Lines that are emitted have had trailing whitespace stripped.
    """

    def __iter__(self):
        self.state = 'pre'
        for raw_line in self.iterable:
            line = raw_line.rstrip()
            current = self.state
            if current == 'pre':
                # Still looking for the start sentinel.
                if re.match(self.START_RE, line.upper()):
                    self.state = 'consuming-start'
            elif current == 'consuming-start':
                # Discard the rest of the sentinel's block, up to and
                # including the first blank line.
                if not line:
                    self.state = 'mid'
            elif current == 'mid':
                if re.match(self.END_RE, line.upper()):
                    self.state = 'post'
                else:
                    yield line
            else:
                # Past the end sentinel: keep consuming input, emit nothing.
                assert current == 'post'
86
87
class ComposedProcessor(LineProcessor):
    """Runs the input through a sequence of Processor classes, one
    after another.  When a stage reports failure, an error message is
    recorded in `self.errors`, the remaining stages are skipped, and
    the output is whatever the stages before the failing one produced.
    """

    def __init__(self, lines, classes, name=''):
        LineProcessor.__init__(self, lines)
        self.name = name
        self.classes = classes

    def __iter__(self):
        current = list(self.iterable)
        for stage_class in self.classes:
            stage = stage_class(current)
            produced = list(stage)
            if stage.has_failed(current, produced):
                message = "%s failed to clean '%s'" % (stage, self.name)
                self.errors.append(message)
                break
            current = produced

        for out_line in current:
            yield out_line
111
112
class RewritingProcessor(LineProcessor):
    """Applies a table of regular-expression substitutions to each line.

    Subclasses set `SUBSTITUTIONS` to a sequence of
    `(pattern, replacement)` pairs.  For every input line (after
    trailing whitespace is stripped) the pairs are applied in order,
    each one repeatedly until it no longer changes the line.
    """
    SUBSTITUTIONS = ()

    def rewrite_line(self, subject, replacement, line):
        """Return `line` with `subject` replaced by `replacement`,
        re-applying the substitution until the line stops changing.

        Stopping at a fixed point (rather than only when nothing
        matched) keeps a rule whose replacement still matches its own
        pattern, without altering the text, from looping forever.  A
        rule whose replacement makes the line grow on every pass will
        still not terminate.
        """
        while True:
            (rewritten, count) = re.subn(subject, replacement, line)
            if count == 0 or rewritten == line:
                return rewritten
            line = rewritten

    def __iter__(self):
        for line in self.iterable:
            line = line.rstrip()
            for (subject, replacement) in self.SUBSTITUTIONS:
                line = self.rewrite_line(subject, replacement, line)
            yield line
128
129
class TidyPunctuationLineFilter(RewritingProcessor):
    """Substitution rules that tidy the spacing around punctuation marks
    and curly quotes, and collapse doubled-up commas.
    """
    SUBSTITUTIONS = (
        (ur'- ', u'-'),    # remove the space after a hyphen
        (ur' ,', u','),    # remove the space before a comma
        (ur' \.', u'.'),   # ... before a period
        (ur' \;', u';'),   # ... before a semicolon
        (ur' \:', u':'),   # ... before a colon
        (ur' \?', u'?'),   # ... before a question mark
        (ur' \!', u'!'),   # ... before an exclamation mark
        (ur',,', u','),    # collapse repeated commas into one
        (ur',\.', u'.'),   # a comma directly before a period is dropped
        (ur'“ ', u'“'),    # remove the space after an opening curly quote
        (ur' ”', u'”'),    # remove the space before a closing curly quote
    )
144
145
class FixProductiveEndingsLineFilter(RewritingProcessor):
    """Substitution rules that patch up mis-spelled word endings.

    The patterns are not anchored to word boundaries, so they match
    wherever the letter sequence occurs in a line.
    """
    SUBSTITUTIONS = (
        (r'olfs ', 'olves '),    # "-olfs " becomes "-olves "
        (r'xs', 'xes'),          # "-xs" becomes "-xes"
        (r'ullly', 'ully'),      # triple "l" before "y" becomes double
        (r'yly', 'ily'),         # "-yly" becomes "-ily"
        (r'icly', 'ically'),     # "-icly" becomes "-ically"
        (r'lely', 'ly'),         # "-lely" becomes "-ly"
        # NOTE(review): presumably undoes the 'yly' rule above for the
        # word "coyly" -- confirm.
        (r' coily', ' coyly'),
    )
156
157
class FixIndefiniteArticlesLineFilter(RewritingProcessor):
    """Substitution rules that replace "an" with "a" before the words
    "unique" and "unicorn", in both capitalised and lower-case forms.

    Every pattern begins with a space, so an occurrence at the very
    start of a line is not matched.
    """
    SUBSTITUTIONS = (
        (r' An unique', ' A unique'),
        (r' an unique', ' a unique'),
        (r' An unicorn', ' A unicorn'),
        (r' an unicorn', ' a unicorn'),
    )
165
166
class QuoteOrienterLineFilter(LineProcessor):
    """Replaces straight double quotes with alternating opening and
    closing curly quotes.

    Feed it one paragraph at a time: the open/closed state carries
    over from line to line, so given several paragraphs it will pair
    up quotes across paragraph boundaries, which is probably not what
    you want.
    """

    def __iter__(self):
        # 0: the next straight quote opens; 1: the next one closes.
        self.state = 0
        for line in self.iterable:
            pieces = []
            for character in line:
                character = unicode(character)
                if character == u'"':
                    if self.state == 0:
                        character, self.state = u'“', 1
                    else:
                        assert self.state == 1
                        character, self.state = u'”', 0
                pieces.append(character)
            yield u''.join(pieces)
190
191
class Regrouper(Processor):
    """An abstract class that defines the protocol for Regrouper objects."""
    # Adds nothing to Processor; it is the common base class of the
    # regrouping processors defined below.
    pass
195
196
class LinesToParagraphsRegrouper(Regrouper):
    """A Regrouper that turns a stream of lines into a stream of lists.

    Each list is either a paragraph (a maximal run of consecutive
    non-blank lines) or a maximal run of consecutive blank lines.
    Trailing whitespace is stripped from every line before it is
    classified, so the lines in a blank run are all empty strings.
    """

    def __iter__(self):
        group = []
        group_is_para = None
        for raw_line in self.iterable:
            line = raw_line.rstrip()
            is_para = bool(line)
            # Switching between text and blank lines closes the current group.
            if group and is_para != group_is_para:
                yield group
                group = []
            group_is_para = is_para
            group.append(line)
        if group:
            yield group
233
234
class ParagraphsToLinesRegrouper(Regrouper):
    """A Regrouper that flattens a stream of line lists (paragraphs and
    runs of blank lines) back into a stream of individual lines.
    """

    def check_input_value(self, value):
        # Each input item must be a list made up solely of unicode strings.
        assert isinstance(value, list)
        assert all(isinstance(element, unicode) for element in value)

    def __iter__(self):
        for group in self.iterable:
            for text in group:
                yield text
0 #!/bin/sh
1
2 falderal README.md