git @ Cat's Eye Technologies NaNoGenLab / master quick-and-dirty-markov / quick-and-dirty-markov.py
master

Tree @master (Download .tar.gz)

quick-and-dirty-markov.py @master · raw · history · blame

#!/usr/bin/env python

import random
import re
import sys

from gutenberg import GutenbergCleaner

try:
    from tqdm import tqdm
except ImportError:
    # tqdm is an optional dependency: without it, fall back to a stand-in
    # that shows no progress bar.
    def tqdm(x, *args, **kwargs):
        """Return *x* unchanged; stand-in for ``tqdm.tqdm`` when it is absent.

        Extra positional and keyword arguments are accepted and ignored so
        that calls passing progress-bar options (e.g. ``desc=...``) behave
        the same whether or not the real package is installed.
        """
        return x


# Debug switch; nothing in the code visible here reads it.
DEBUG = False


def _clean(word):
    """Strip one layer of surrounding punctuation from *word* and lowercase it."""
    trailing = ('.', '!', '?', ';', ',')
    if word.endswith(trailing):
        word = word[:-1]
    if word.startswith(('"', "'", '(')):
        word = word[1:]
    if word.endswith(('"', "'", ')')):
        word = word[:-1]
    # Second pass: punctuation that sat inside a closing quote or bracket,
    # e.g. 'done."' -> 'done.' -> 'done'.
    if word.endswith(trailing):
        word = word[:-1]
    return word.lower()


def _read_words(filenames):
    """Return the raw whitespace/'--'-separated tokens of every input file."""
    words = []
    for filename in filenames:
        with open(filename, 'r') as f:
            # NOTE(review): GutenbergCleaner comes from the project's
            # `gutenberg` module; presumably it strips Project Gutenberg
            # boilerplate -- confirm against that module.
            text = GutenbergCleaner(f).extract_text()
        for bit in text.split():
            # Dashes written as '--' join two words without whitespace.
            words.extend(bit.split('--'))
    return words


def _build_chain(words):
    """Map each word to a ``{successor: count}`` dict of the words following it."""
    wordmap = {}
    for last, word in zip(words, words[1:]):
        successors = wordmap.setdefault(last, {})
        successors[word] = successors.get(word, 0) + 1
    return wordmap


def _pick_next(successors):
    """Pick a key of *successors* at random, weighted by its count."""
    # Draw against the sum of the successor counts themselves so that a
    # pick is always made; the word's overall frequency can be one larger
    # (for the final word of the corpus), which would leave a gap.
    num = random.randint(1, sum(successors.values()))
    acc = 0
    for key, value in successors.items():
        acc += value
        if acc >= num:
            return key


def main(argv):
    """Print a short random text from a first-order Markov chain of words.

    ``argv[1:]`` are the names of the text files to read.  Their words are
    cleaned of surrounding punctuation, lowercased, and used to count which
    word follows which.  A start word is chosen uniformly among the distinct
    words, then up to 100 further words are drawn, each weighted by how
    often it followed the previous one.  Every word is written to stdout
    followed by a single space.

    If the inputs contain no words, a message is written to stderr and
    nothing is generated.  Generation stops early if the walk reaches a
    word that has no recorded successor (the final word of the corpus when
    it occurs nowhere else).
    """
    # Tokens consisting only of punctuation clean down to '', as do the
    # empty pieces produced by splitting on '--'; they are not words.
    words = [word for word in map(_clean, _read_words(argv[1:])) if word]
    if not words:
        sys.stderr.write('no words found in input; nothing to generate\n')
        return

    wordmap = _build_chain(words)

    # Sorted so that a seeded `random` gives reproducible output.
    word = random.choice(sorted(set(words)))
    sys.stdout.write(word + ' ')

    for _ in range(100):
        successors = wordmap.get(word)
        if not successors:
            # Dead end: nothing ever followed this word.
            break
        word = _pick_next(successors)
        sys.stdout.write(word + ' ')


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main(sys.argv)