`relwrite`
==========

`relwrite` relates strings to strings via a grammar in the Chomsky hierarchy.

What does "relate" mean in this context?

* Given a grammar and a string of terminals, it can _parse_ that string, and
  report whether it is in the language of the grammar or not.
* Given a grammar and a non-terminal, it can _generate_ a string of terminals
  from it.

The relational engine in `relwrite` is a very general one, based on string rewriting.
There are therefore no restrictions on the input grammar -- it may be **regular**,
**context-free**, **context-sensitive**, or **unrestricted**. If the grammar is
ambiguous, then all possible parses (or generations) can be returned.
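
Since the engine just rewrites strings, a rule's left-hand side may contain
more than one symbol. For example, a hypothetical context-sensitive rule
(in the JSON rule format described below) that rewrites `<B>` to `b` only
when it follows an `a`:

```
[["a", "<B>"], ["a", "b"]]
```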

It should be understood that `relwrite` trades off performance and small
memory footprint in favour of generality. There is, however, a feature to improve
performance in the case of very long derivations. Specifying a search strategy
enables a **beam search** algorithm which aggressively focuses on derivations with a
desirable character, e.g. a particular minimum length.
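
Concretely, when a strategy is in effect, the set of working utterances is
sorted after every rewriting sweep -- longest first under `expansion`,
shortest first under `contraction` -- and then trimmed to the beam width
(10 by default; see `derive` in `src/relwrite/engine.py`).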

The grammar must be provided in the form of a JSON file. There are example
grammar files in the `eg/` directory of this repo.
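
Each rule in such a file is a two-element array `[LHS, RHS]`, where both
sides are arrays of symbols. For instance, the first rule of
`eg/recursive-grammar.json` is:

```
[["<Sentence>"], ["<NP>", "<VP>"]]
```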

### Example usage

Generate a string from a non-terminal in a grammar:

```
./bin/relwrite eg/recursive-grammar.json --start "<Sentence>" --max-derivations=1
```
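
This prints a single sentence in the language of the grammar, for example
`a dog sees the penguin` (exactly which sentence appears may vary from run
to run, since candidate utterances are kept in unordered sets).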

Parse a string w.r.t. a grammar:

```
./bin/relwrite eg/recursive-grammar.json --parse --start "a penguin sees a penguin then sees a dog"
```
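
With `--parse`, each rule is applied right-to-left, so parsing proceeds by
contracting the given string; if it is in the grammar's language,
`<Sentence>` appears among the reported derivations.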

Generate a really long string from a non-terminal in a grammar:

(TK)
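
A plausible invocation, using the flags defined in `src/relwrite/main.py`
(the particular values here are illustrative only):

```
./bin/relwrite eg/recursive-grammar.json --start "<Sentence>" \
    --expand-until=50 --beam-width=10 --max-derivations=1
```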

### Notes

`relwrite` uses the term "derivation" as a generic term meaning "parse or generation".

`relwrite` also uses the term "utterance" to mean "any string of terminals and non-terminals".

### TODO

* specify output filename
* try heuristic for contraction phase: highest proportion of terminals (sketched below)
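
A sketch of that heuristic, assuming the convention (used in the example
grammars, but not enforced by the engine) that non-terminals are exactly
the `<...>`-wrapped symbols:

```
def score_contraction(utterance):
    # Hypothetical scoring function: utterances with a higher proportion
    # of terminals sort first, and so survive the beam trim.
    terminals = sum(
        1 for symbol in utterance
        if not (symbol.startswith('<') and symbol.endswith('>'))
    )
    return -terminals / len(utterance)
```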

`bin/relwrite`:

```
#!/usr/bin/env python3

from os.path import realpath, dirname, join
import sys

# Make the adjacent src/ directory importable, relative to this script.
sys.path.insert(0, join(dirname(realpath(sys.argv[0])), '..', 'src'))

from relwrite.main import main


if __name__ == '__main__':
    main(sys.argv[1:])
```

`eg/recursive-grammar.json`:

```
[
[["<Sentence>"], ["<NP>", "<VP>"]],
[["<Sentence>"], ["<Sentence>", "then", "<VP>"]],
[["<NP>"], ["<Det>", "<Noun>"]],
[["<VP>"], ["<Verb>", "<NP>"]],
[["<Det>"], ["the"]],
[["<Det>"], ["a"]],
[["<Noun>"], ["dog"]],
[["<Noun>"], ["penguin"]],
[["<Verb>"], ["sees"]]
]
```

A second example grammar, a non-recursive variant of the above:

```
[
[["<Sentence>"], ["<NP>", "<VP>"]],
[["<NP>"], ["<Det>", "<Noun>"]],
[["<VP>"], ["<Verb>", "<NP>"]],
[["<Det>"], ["the"]],
[["<Det>"], ["a"]],
[["<Noun>"], ["dog"]],
[["<Noun>"], ["penguin"]],
[["<Verb>"], ["sees"]]
]
```

`src/relwrite/engine.py`:

```
def generate(rules, working_utterances, max_matches=None):
    """Note that an "utterance" can be any mix of terminals and non-terminals.
    The "final utterances" will consist of only one or the other.
    """
    new_working_utterances = set()
    final_utterances = set()
    for utterance in working_utterances:
        num_rewrites_of_this_utterance = 0
        for (pattern, replacement) in rules:
            indices = get_match_indices(utterance, pattern, max_matches=max_matches)
            for index in indices:
                new_utterance = replace_at_index(
                    utterance, pattern, replacement, index
                )
                new_working_utterances.add(new_utterance)
                num_rewrites_of_this_utterance += 1
        # An utterance that no rule can rewrite is finished rewriting.
        if num_rewrites_of_this_utterance == 0:
            final_utterances.add(utterance)

    return new_working_utterances, final_utterances
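
# Illustrative behaviour (hypothetical, traced by hand): with
#     rules = [(("<Det>",), ("the",)), (("<Det>",), ("a",))]
# a first sweep over {("<Det>",)} returns ({("the",), ("a",)}, set());
# a second sweep over that result returns (set(), {("the",), ("a",)}),
# since neither utterance can be rewritten any further.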


def get_match_indices(utterance, pattern, max_matches=None):
    # Return the indices of every occurrence of `pattern` in `utterance`,
    # stopping after `max_matches` occurrences if that limit is given.
    length = len(pattern)
    matches = []
    for index, _ in enumerate(utterance):
        if pattern == utterance[index:index + length]:
            matches.append(index)
        if max_matches and len(matches) >= max_matches:
            break
    return matches


def replace_at_index(utterance, pattern, replacement, index):
    # Splice `replacement` over the occurrence of `pattern` at `index`,
    # returning a new tuple.
    length = len(pattern)
    new_utterance = list(utterance)
    new_utterance[index:index + length] = replacement
    return tuple(new_utterance)
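
# For example (hypothetical, doctest-style):
#
#     >>> get_match_indices(("<NP>", "<VP>"), ("<NP>",))
#     [0]
#     >>> replace_at_index(("<NP>", "<VP>"), ("<NP>",), ("<Det>", "<Noun>"), 0)
#     ('<Det>', '<Noun>', '<VP>')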


def derive(rules, working_utterances, max_derivations=None, max_matches=None,
           verbose=False, strategy=None, expand_until=None, beam_width=10):
    final_utterances = None
    collected_utterances = []
    num_derivations = 0

    # Beam-search scoring: `expansion` keeps the longest utterances,
    # `contraction` the shortest (sorted ascending, then trimmed).
    def score_expansion(u):
        return 0 - len(u)

    def score_contraction(u):
        return len(u)

    while working_utterances:

        length = len(working_utterances)
        lengths = [len(u) for u in working_utterances]
        min_length = min(lengths)
        if verbose:
            print('{} working utterances, min length = {}'.format(
                length, min_length
            ))
        # Once every working utterance is long enough, stop expanding
        # and start contracting towards finished derivations.
        if strategy == 'expansion' and min_length >= (expand_until or 0):
            if verbose:
                print('Reached {} threshold'.format(expand_until))
            strategy = 'contraction'

        working_utterances, final_utterances = generate(
            rules, working_utterances, max_matches=max_matches
        )

        # beam search: sort by score and trim before continuing
        if strategy:
            scoring_function = score_contraction if strategy == 'contraction' else score_expansion
            working_utterances = sorted(working_utterances, key=scoring_function)[:beam_width]

        for utterance in final_utterances:
            print(' '.join(utterance))
            collected_utterances.append(utterance)
            num_derivations += 1
            if max_derivations and num_derivations >= max_derivations:
                working_utterances = []
                break

    return collected_utterances
```
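
A minimal sketch of driving the engine directly (the grammar here is
hypothetical, and `src/` is assumed to be on the import path; `derive`
prints each finished derivation to standard output as it is found):

```
from relwrite.engine import derive

# a^n b^n, the classic context-free example
rules = [
    (("<S>",), ("a", "<S>", "b")),
    (("<S>",), ("a", "b")),
]

# Prints "a b" and "a a b b", then returns both as tuples of symbols.
results = derive(rules, [("<S>",)], max_derivations=2)
```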

`src/relwrite/main.py`:

```
from argparse import ArgumentParser
import json

from .engine import derive


def main(args):
    argparser = ArgumentParser()

    # NOTE: these options are provisional and will change

    argparser.add_argument(
        'grammar_filename', metavar='FILENAME', type=str,
        help='JSON file containing the grammar to use'
    )
    argparser.add_argument(
        "--parse", action="store_true", default=False,
        help="Process rules from right to left"
    )
    argparser.add_argument(
        "--verbose", action="store_true", default=False,
        help="Display some vital statistics while processing"
    )
    argparser.add_argument(
        "--start", metavar='UTTERANCE', type=str, default=None,
        help="A single utterance to use as "
             "the starting point of the derivation"
    )
    argparser.add_argument(
        "--start-set-file", metavar='FILENAME', type=str, default=None,
        help="Use the set of utterances in this JSON file as "
             "the starting point of the derivation"
    )

    argparser.add_argument(
        "--max-rewrites-per-utterance", metavar='COUNT', type=int, default=None,
        help="If given, limits the number of times a pattern can rewrite "
             "any particular utterance during a single sweep "
             "(default: no limit, unless beam search is applied, in which case 10)"
    )
    argparser.add_argument(
        "--max-derivations", metavar='COUNT', type=int, default=None,
        help="The maximum number of derivations to produce "
             "(default: no limit)"
    )
    argparser.add_argument(
        "--expand-until", metavar='SIZE', type=int, default=None,
        help="Implies the `expansion` strategy. Specifies that the "
             "resulting derivations must be at least this long"
    )
    argparser.add_argument(
        "--beam-width", metavar='SIZE', type=int, default=10,
        help="When traversing with a strategy, specify the beam width "
             "for the beam search"
    )

    argparser.add_argument(
        "--strategy", metavar='STRATEGY', type=str, default=None,
        help="Will apply a particular strategy (`expansion` or `contraction`) "
             "under beam search"
    )

    options = argparser.parse_args(args)

    with open(options.grammar_filename, 'r') as f:
        grammar = json.loads(f.read())

    # Each rule becomes a (pattern, replacement) pair of symbol tuples.
    rules = [(tuple(lhs), tuple(rhs)) for [lhs, rhs] in grammar]

    # Parsing is generation with the rules run backwards.
    if options.parse:
        rules = [(b, a) for (a, b) in rules]

    if options.start:
        working_utterances = [tuple(options.start.split())]
    elif options.start_set_file:
        with open(options.start_set_file, 'r') as f:
            working_utterances = json.loads(f.read())
        working_utterances = [tuple(x) for x in working_utterances]
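        # The file is expected to hold a JSON array of utterances, each
        # itself an array of symbols -- e.g. (a hypothetical file):
        #     [
        #         ["<Sentence>"],
        #         ["the", "dog", "<VP>"]
        #     ]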
    else:
        print("No start set given, please supply --start or --start-set-file")
        working_utterances = []

    # --expand-until implies the expansion strategy, and any strategy
    # implies a default cap on rewrites per utterance per sweep.
    if options.expand_until:
        options.strategy = 'expansion'
    max_matches = options.max_rewrites_per_utterance
    if options.strategy:
        max_matches = max_matches or 10

    result = derive(
        rules,
        working_utterances,
        max_derivations=options.max_derivations,
        max_matches=max_matches,
        verbose=options.verbose,
        strategy=options.strategy,
        expand_until=options.expand_until,
        beam_width=options.beam_width,
    )

    # TODO: make the output filename configurable (see README).
    with open('out.json', 'w') as f:
        f.write(json.dumps(result, indent=4))


if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
```