|
0 |
#!/usr/bin/env python
|
|
1 |
|
|
2 |
import random
|
|
3 |
import re
|
|
4 |
import sys
|
|
5 |
|
|
6 |
from gutenberg import GutenbergCleaner
|
|
7 |
|
|
8 |
# tqdm is an optional dependency: when it is not installed, fall back to a
# pass-through so code that wraps an iterable in tqdm(...) still works.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, *args, **kwargs):
        """Return *x* unchanged.

        Extra positional and keyword arguments (the options the real tqdm
        accepts, such as ``desc=`` or ``total=``) are accepted and ignored so
        that call sites do not break when tqdm is missing.
        """
        return x


# Debug switch; nothing in the visible part of this file reads it.
DEBUG = False
|
|
16 |
|
|
17 |
|
|
18 |
# Punctuation stripped from the edges of a token by _clean().
_TRAILING_PUNCTUATION = ('.', '!', '?', ';', ',')
_OPENING_MARKS = ('"', "'", '(')
_CLOSING_MARKS = ('"', "'", ')')

# Number of words generated after the randomly chosen starting word.
_GENERATED_WORDS = 100


def _clean(word):
    """Strip one layer of surrounding punctuation from *word* and lowercase it.

    Trailing sentence punctuation is removed both before and after the
    closing quote/parenthesis so that tokens such as ``word."`` and
    ``word",`` both reduce to ``word``.
    """
    if word.endswith(_TRAILING_PUNCTUATION):
        word = word[:-1]
    if word.startswith(_OPENING_MARKS):
        word = word[1:]
    if word.endswith(_CLOSING_MARKS):
        word = word[:-1]
    if word.endswith(_TRAILING_PUNCTUATION):
        word = word[:-1]
    return word.lower()


def _read_words(filenames):
    """Return the raw whitespace/``--`` separated tokens of every file."""
    tokens = []
    for filename in filenames:
        with open(filename, 'r') as f:
            # GutenbergCleaner comes from the project's gutenberg module; it
            # is handed the open file and asked for the extracted text.
            text = GutenbergCleaner(f).extract_text()
            for line in text.split('\n'):
                for bit in line.split():
                    # "--" joins words without whitespace, so split on it too.
                    tokens.extend(bit.split('--'))
    return tokens


def _build_chain(words):
    """Map each word to a ``{successor: count}`` dict of the words after it."""
    chain = {}
    previous = None
    for word in words:
        if previous is not None:
            followers = chain.setdefault(previous, {})
            followers[word] = followers.get(word, 0) + 1
        previous = word
    return chain


def _next_word(chain, word, vocabulary):
    """Pick a successor of *word*, weighted by how often it followed *word*.

    A word with no recorded successor (it only occurred as the very last
    token) is a dead end; in that case restart from a random vocabulary word
    instead of raising ``KeyError``.
    """
    followers = chain.get(word)
    if not followers:
        return random.choice(vocabulary)

    # Roll against the successor total (not the word's overall frequency) so
    # the roll can never exceed the accumulated counts.
    threshold = random.randint(1, sum(followers.values()))
    running = 0
    chosen = None
    for chosen, count in followers.items():
        running += count
        if running >= threshold:
            break
    return chosen


def main(argv):
    """Print random text generated by a first-order Markov chain.

    ``argv[1:]`` are paths of text files. Their words are extracted, cleaned
    of surrounding punctuation and lowercased, and used to build a
    word -> successor frequency table. A random starting word followed by
    100 generated words is written to standard output, separated by spaces.

    If the inputs yield no words, a message is written to standard error and
    nothing is generated.
    """
    raw_tokens = _read_words(argv[1:])

    # Drop tokens that are empty after cleaning ("--" at a token edge or
    # tokens consisting only of punctuation); they are not words.
    words = [word for word in map(_clean, raw_tokens) if word]
    if not words:
        sys.stderr.write('no words found in input\n')
        return

    chain = _build_chain(words)
    # random.choice needs a sequence, so materialise the distinct words.
    vocabulary = list(set(words))

    word = random.choice(vocabulary)
    sys.stdout.write(word + ' ')

    for _ in range(_GENERATED_WORDS):
        word = _next_word(chain, word, vocabulary)
        sys.stdout.write(word + ' ')
|
|
75 |
|
|
76 |
|
|
77 |
# Script entry point: generate text only when run directly, not on import.
if __name__ == '__main__':
    main(argv=sys.argv)
|