git @ Cat's Eye Technologies NaNoGenLab / master join-at-pivot / join-at-pivot.py
master

Tree @master (Download .tar.gz)

join-at-pivot.py @master raw · history · blame

#!/usr/bin/env python

import random
import re
import sys

# tqdm is an optional dependency: when it cannot be imported, fall back to a
# stand-in with the same one-argument call shape so the loops below still run.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x):
        """Fallback for a missing tqdm: return the iterable *x* unchanged."""
        return x


# When true, main() additionally prints the pivot-frequency table, the pivot
# chosen for each generated sentence, and every fragment stored per pivot.
DEBUG = False


def sentencify(words):
    """Return the strings in *words* joined by single spaces."""
    separator = ' '
    return separator.join(words)


def _read_words(filenames):
    """Return every token found in the files named by *filenames*, in order.

    Each line is split on whitespace and each resulting token is split
    again on ``--``; a token that starts or ends with ``--`` therefore
    contributes an empty string.
    """
    words = []
    for filename in tqdm(filenames):
        with open(filename, 'r') as fh:
            for line in fh:
                for bit in line.split():
                    words.extend(bit.split('--'))
    return words


def _split_sentences(words):
    """Group *words* into sentences, each a list of words.

    At most one leading and one trailing quote character (``"`` or ``'``)
    is removed from every word.  A sentence ends at a word that finishes
    with ``.``, ``!`` or ``?``, unless that word is ``Mr.``, ``Mrs.`` or
    ``Dr.``.  The words after the last terminator are returned as a final,
    possibly empty, sentence.
    """
    sentences = []
    sentence = []
    for word in tqdm(words):
        if word.startswith(('"', "'")):
            word = word[1:]
        if word.endswith(('"', "'")):
            word = word[:-1]
        sentence.append(word)
        if word not in ('Mr.', 'Mrs.', 'Dr.') and word.endswith(('.', '!', '?')):
            sentences.append(sentence)
            sentence = []
    sentences.append(sentence)
    return sentences


def _index_by_pivot(sentences):
    """Split each sentence at its middle word and index the halves by it.

    Returns a ``(beginners, enders)`` pair of dicts.  Both map a pivot word
    to a list of word lists: the words before the pivot and the words after
    it.  The two dicts always have the same keys, and for a given key the
    two lists have the same length.
    """
    beginners = {}
    enders = {}
    for sentence in tqdm(sentences):
        if not sentence:
            continue
        # Floor division picks the exact middle of an odd-length sentence and
        # the later of the two middle words of an even-length one.  A plain
        # `/` only floors on Python 2; on Python 3 it yields a float, which
        # cannot be used as an index.
        i = len(sentence) // 2
        middle = sentence[i]
        if not middle:
            # Empty token left behind by quote stripping or a `--` split.
            continue
        beginners.setdefault(middle, []).append(sentence[:i])
        enders.setdefault(middle, []).append(sentence[i + 1:])
    return beginners, enders


def main(argv):
    """Generate text by joining sentence halves at a shared middle word.

    ``argv[0]`` is ignored; ``argv[1:]`` are the names of the text files to
    read.  Every sentence in the input is split at its middle ("pivot")
    word.  Ten paragraphs of ten sentences are then written to standard
    output, each sentence built from a randomly chosen common pivot, a
    random beginning recorded for that pivot and a random ending recorded
    for it.  A pivot is "common" when it is the middle word of at least 100
    sentences.

    Exits via ``sys.exit`` with an explanatory message when the input holds
    no common pivot, instead of failing inside ``random.choice``.
    """
    min_occurrences = 100
    rows = 10
    per_row = 10
    # sys.stdout.write behaves the same on Python 2 and Python 3, unlike the
    # print statement / print function.
    out = sys.stdout.write

    words = _read_words(argv[1:])
    sentences = _split_sentences(words)
    beginners, enders = _index_by_pivot(sentences)

    frequency = {}
    common_keys = []
    for middle in tqdm(beginners):
        n_begin = len(beginners[middle])
        n_end = len(enders[middle])
        frequency.setdefault(n_begin + n_end, []).append(middle)
        if n_begin >= min_occurrences or n_end >= min_occurrences:
            common_keys.append(middle)

    if DEBUG:
        for freq in sorted(frequency, reverse=True):
            out('%s %s\n' % (freq, frequency[freq]))

    if not common_keys:
        sys.exit(
            'no middle word occurs in at least %d sentences; '
            'supply more input text' % min_occurrences
        )

    for _row in range(rows):
        pieces = []
        for _col in range(per_row):
            middle = random.choice(common_keys)
            beginner = random.choice(beginners[middle])
            ender = random.choice(enders[middle])
            text = sentencify(beginner + [middle] + ender)
            if DEBUG:
                # Debug line first, then the sentence, matching the order in
                # which the two were historically printed.
                text = '%r %d %d\n%s' % (
                    middle, len(beginners[middle]), len(enders[middle]), text
                )
            pieces.append(text)
        # One paragraph per row, followed by a blank line.
        out(' '.join(pieces) + '\n\n')

    if DEBUG:
        for middle in common_keys:
            out('=======\n')
            out('%r\n' % (middle,))
            out('----\n')
            for begin in beginners[middle]:
                out(sentencify(begin) + '\n')
            out('----\n')
            for end in enders[middle]:
                out(sentencify(end) + '\n')


# Run the generator on the command-line arguments only when this file is
# executed as a script, not when it is imported.
if __name__ == '__main__':
    main(sys.argv)