git @ Cat's Eye Technologies relwrite / 6270d90
Initial import of files for relwrite. Chris Pressey 4 months ago
8 changed file(s) with 277 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 __pycache__/
1 *.pyc
2
0 `relwrite`
1 ==========
2
3 `relwrite` relates strings to strings via a grammar in the Chomsky hierarchy.
4
5 What does "relate" mean in this context?
6
7 * Given a grammar and a string of terminals, it can _parse_ that string, and
8   report if it is in the language of the grammar or not.
9 * Given a grammar and a nonterminal, it can _generate_ a string of terminals
10   from it.
11
12 The relational engine in `relwrite` is a very general one, based on string rewriting.
13 There are therefore no restrictions on the input grammar -- it may be **regular**,
14 **context-free**, **context-sensitive**, or **unrestricted**. If the grammar is
15 ambiguous, then all possible parses (or generations) can be returned.
16
17 It should be understood that `relwrite` trades off performance and small
18 memory footprint in favour of generality. There is, however, a feature to improve
19 the performance in the case of very long derivations. Specifying a search strategy
20 enables a **beam search** algorithm which aggressively focuses on derivations with a
21 desirable character, e.g. a particular minimum length.
22
23 The grammar must be provided in the form of a JSON file. There are example
24 grammar files in the `eg/` directory of this repo.
25
26 ### Example usage
27
28 Generate a string from a non-terminal in a grammar:
29
30 ```
31 ./bin/relwrite eg/recursive-grammar.json --start "<Sentence>" --max-derivations=1
32 ```
33
34 Parse a string w.r.t. a grammar:
35
36 ```
37 ./bin/relwrite eg/recursive-grammar.json --parse --start "a penguin sees a penguin then sees a dog"
38 ```
39
40 Generate a really long string from a non-terminal in a grammar:
41
42 (TK)
43
44 ### Notes
45
46 `relwrite` uses the term "derivation" as a generic term meaning "parse or generation".
47
48 `relwrite` also uses the term "utterance" to mean "any string of terminals and non-terminals".
49
50 ### TODO
51
52 * specify output filename
53 * try heuristic for contraction phase: highest proportion of terminals
#!/usr/bin/env python3

from os.path import realpath, dirname, join
import sys

# Make the sibling src/ directory importable before pulling in the package,
# so this launcher works straight from a source checkout.
script_dir = dirname(realpath(sys.argv[0]))
sys.path.insert(0, join(script_dir, '..', 'src'))

from relwrite.main import main


if __name__ == '__main__':
    main(sys.argv[1:])
0 [
1 [["<Sentence>"], ["<NP>", "<VP>"]],
2 [["<Sentence>"], ["<Sentence>", "then", "<VP>"]],
3 [["<NP>"], ["<Det>", "<Noun>"]],
4 [["<VP>"], ["<Verb>", "<NP>"]],
5 [["<Det>"], ["the"]],
6 [["<Det>"], ["a"]],
7 [["<Noun>"], ["dog"]],
8 [["<Noun>"], ["penguin"]],
9 [["<Verb>"], ["sees"]]
10 ]
0 [
1 [["<Sentence>"], ["<NP>", "<VP>"]],
2 [["<NP>"], ["<Det>", "<Noun>"]],
3 [["<VP>"], ["<Verb>", "<NP>"]],
4 [["<Det>"], ["the"]],
5 [["<Det>"], ["a"]],
6 [["<Noun>"], ["dog"]],
7 [["<Noun>"], ["penguin"]],
8 [["<Verb>"], ["sees"]]
9 ]
(New empty file)
def generate(rules, working_utterances, max_matches=None):
    """Apply every rule once, everywhere it matches, to every working utterance.

    Note that an "utterance" can be any mix of terminals and non-terminals.
    An utterance that no rule can rewrite is considered final.

    Returns a pair of sets: (new working utterances, final utterances).
    """
    rewritten = set()
    finals = set()
    for utterance in working_utterances:
        produced_any = False
        for (pattern, replacement) in rules:
            matches = get_match_indices(utterance, pattern, max_matches=max_matches)
            for index in matches:
                rewritten.add(
                    replace_at_index(utterance, pattern, replacement, index)
                )
                produced_any = True
        # Nothing matched anywhere: this utterance can't be rewritten further.
        if not produced_any:
            finals.add(utterance)

    return rewritten, finals
20
21
def get_match_indices(utterance, pattern, max_matches=None):
    """Return the start indices at which *pattern* occurs as a contiguous
    slice of *utterance*.

    If *max_matches* is truthy, stop collecting once that many indices
    have been found.
    """
    window = len(pattern)
    found = []
    for start in range(len(utterance)):
        if utterance[start:start + window] == pattern:
            found.append(start)
            if max_matches and len(found) >= max_matches:
                break
    return found
31
32
def replace_at_index(utterance, pattern, replacement, index):
    """Return a new utterance tuple in which the span of *pattern* starting
    at *index* has been replaced by *replacement*.
    """
    span = len(pattern)
    before = utterance[:index]
    after = utterance[index + span:]
    return tuple(before) + tuple(replacement) + tuple(after)
38
39
def derive(rules, working_utterances, max_derivations=None, max_matches=None, verbose=False, strategy=None, expand_until=None, beam_width=10):
    """Repeatedly rewrite the working utterances until none remain (or enough
    derivations have been produced).

    Each sweep calls `generate`; utterances that no rule could rewrite are
    "final" — they are printed and collected, and the collected list is
    returned.  When `strategy` is given ('expansion' or 'contraction') a beam
    search trims the working set to the best `beam_width` utterances after
    each sweep.  With 'expansion', once the shortest working utterance
    reaches `expand_until` the strategy flips to 'contraction'.
    """
    final_utterances = None
    collected_utterances = []
    num_derivations = 0

    # Beam-search scoring functions: lower scores sort first and survive
    # the [:beam_width] trim below.
    def score_expansion(u):
        # Negate length so longer utterances score lower (are preferred).
        return 0 - len(u)

    def score_contraction(u):
        # Shorter utterances score lower (are preferred).
        return len(u)

    while working_utterances:

        length = len(working_utterances)
        lengths = [len(u) for u in working_utterances]
        min_length = min(lengths)
        if verbose:
            print('{} working utterances, min length = {}'.format(
                length, min_length
            ))
        # Once every surviving utterance is at least expand_until long,
        # switch from growing utterances to shrinking them toward terminals.
        if strategy == 'expansion' and min_length >= (expand_until or 0):
            if verbose:
                print('Reached {} threshold'.format(expand_until))
            strategy = 'contraction'

        working_utterances, final_utterances = generate(rules, working_utterances, max_matches=max_matches)

        # beam search: sort by score and trim before continuing
        if strategy:
            scoring_function = score_contraction if strategy == 'contraction' else score_expansion
            working_utterances = sorted(working_utterances, key=scoring_function)[:beam_width]

        for utterance in final_utterances:
            print(' '.join(utterance))
            collected_utterances.append(utterance)
            num_derivations += 1
            if max_derivations and num_derivations >= max_derivations:
                # Stop the outer loop too: clear the working set, then
                # break out of the final-utterance loop.
                working_utterances = []
                break

    return collected_utterances
0 from argparse import ArgumentParser
1 import json
2
3 from .engine import derive
4
5
def main(args):
    """Command-line entry point: parse options, load the grammar, run
    `derive`, and write the resulting derivations to `out.json`.

    :param args: list of command-line arguments (e.g. ``sys.argv[1:]``).
    """
    argparser = ArgumentParser()

    # NOTE: these options are provisional and will change

    argparser.add_argument(
        'grammar_filename', metavar='FILENAME', type=str,
        help='JSON file containing the grammar to use'
    )
    argparser.add_argument(
        "--parse", action="store_true", default=False,
        help="Process rules from right to left"
    )
    argparser.add_argument(
        "--verbose", action="store_true", default=False,
        help="Display some vital statistics while processing"
    )
    argparser.add_argument(
        "--start", metavar='UTTERANCE', type=str, default=None,
        help="A single utterance to use as "
             "the starting point of the derivation"
    )
    argparser.add_argument(
        "--start-set-file", metavar='FILENAME', type=str, default=None,
        help="Use the set of utterances in this JSON file as "
             "the starting point of the derivation"
    )

    argparser.add_argument(
        "--max-rewrites-per-utterance", metavar='COUNT', type=int, default=None,
        help="If given, limits the number of times a pattern can rewrite "
             "any particular utterance during a single sweep "
             "(default: no limit, unless beam search is applied, in which case 10)"
    )
    argparser.add_argument(
        "--max-derivations", metavar='COUNT', type=int, default=None,
        help="The maximum number of derivations to produce "
             "(default: no limit)"
    )
    argparser.add_argument(
        "--expand-until", metavar='SIZE', type=int, default=None,
        help="Implies the `expansion` strategy. Specifies that the "
             "resulting derivations must be at least this long"
    )
    argparser.add_argument(
        "--beam-width", metavar='SIZE', type=int, default=10,
        help="When traversing with a strategy, specify the beam width "
             "for the beam search"
    )

    argparser.add_argument(
        "--strategy", metavar='STRATEGY', type=str, default=None,
        help="Will apply a particular strategy (`expansion` or `contraction`) "
             "under beam search"
    )

    options = argparser.parse_args(args)

    # Grammar file format: a JSON list of [lhs, rhs] pairs, each side a
    # list of symbol strings (see the eg/ directory for examples).
    with open(options.grammar_filename, 'r') as f:
        grammar = json.loads(f.read())

    rules = [(tuple(lhs), tuple(rhs)) for [lhs, rhs] in grammar]

    # Parsing is generation run backwards: flip every rule.
    if options.parse:
        rules = [(b,a) for (a,b) in rules]

    # NOTE(review): --start silently takes precedence when both --start and
    # --start-set-file are given.
    if options.start:
        working_utterances = [tuple(options.start.split())]
    elif options.start_set_file:
        with open(options.start_set_file, 'r') as f:
            working_utterances = json.loads(f.read())
        working_utterances = [tuple(x) for x in working_utterances]
    else:
        # No usable start set: fall through with an empty working set, so
        # derive() produces nothing and out.json is still written.
        print("No start set given, please supply --start or --start-set-file")
        working_utterances = []

    # NOTE(review): truthiness check means --expand-until=0 does NOT select
    # the expansion strategy — confirm whether 0 should be meaningful.
    if options.expand_until:
        options.strategy = 'expansion'
    max_matches = options.max_rewrites_per_utterance
    if options.strategy:
        # Beam search without a rewrite cap explodes; default the cap to 10.
        max_matches = max_matches or 10

    result = derive(
        rules,
        working_utterances,
        max_derivations=options.max_derivations,
        max_matches=max_matches,
        verbose=options.verbose,
        strategy=options.strategy,
        expand_until=options.expand_until,
        beam_width=options.beam_width,
    )

    # Derivations are tuples of strings; json serializes them as arrays.
    with open('out.json', 'w') as f:
        f.write(json.dumps(result, indent=4))
102
if __name__ == '__main__':
    # Allow running this module directly as a script (normally invoked
    # via bin/relwrite instead).
    import sys
    main(sys.argv[1:])