Palindrome Program: pal2.py

import random, re, bisect, time

"""Produce Panama-ish Palindromes. Copyright (C) 2002-2008, Peter Norvig."""

################ Checking for Palindromes

def is_panama(s):
    "Test if string s is a Panama-ish palindrome."
    return is_palindrome(s) and is_unique(phrases(s))

def is_palindrome(s):
    "Test if a string is a palindrome."
    s1 = canonical(s)
    return s1 == reversestr(s1)

def phrases(s):
    "Break a string s into comma-separated phrases."
    return [phrase.strip() for phrase in s.split(',')]

def canonical(word, sub=re.compile('''[-* \t\n\r.,;!?:()`"']''').sub):
    "The canonical form for comparing: lowercase, no blanks or punctuation."
    return sub('', word).lower()

################ Utilities

def reversestr(x):
    "Reverse a string."
    return x[::-1]

def is_unique(seq):
    "Return true if seq has no duplicate elements."
    return len(seq) == len(set(seq))

def update(obj, **entries):
    "Change attributes of obj, according to the keyword args."
    obj.__dict__.update(entries)
    return obj

################ Reading in a dictionary

class PalDict:
    """A dictionary from which you can find canonical words that start or end
    with a given canonical substring, and find the true name of a
    canonical word with d.truename[canonicalword]."""
    
    def __init__(self, k=1000, filename='npdict.txt'):
        words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}
        for tword in open(filename).read().splitlines():
            word = canonical(tword)
            words.append(word)
            rwords.append(reversestr(word))
            truename[word] = tword
        words.sort()
        rwords.sort()
        update(self, k=k, words=words, rwords=rwords, truename=truename,
               reversibles={}, rangek=range(k), tryharder=False)

    def startswith(self, prefix):
        """Return up to k canonical words that start with prefix.
        If there are more than k, choose from them at random."""
        return self._k_startingwith(self.words, prefix)

    def endswith(self, rsuffix):
        """Return up to k canonical words that end with the reversed suffix.
        If you want words ending in 'ing', ask for d.endswith('gni').
        If there are more than k, choose from them at random."""
        return map(reversestr, self._k_startingwith(self.rwords, rsuffix))

    def __contains__(self, word):
        return word in self.truename

    def reversible_words(self):
        "Find words that have a reverse in the dict, like {'Camus': 'Sumac'}"
        if not self.reversibles:
            reversibles = self.reversibles
            for rw in self.rwords:
                if rw in self:
                    w = reversestr(rw)
                    if w != rw and w not in reversibles:
                        reversibles[w] = rw
            self.reversibles = reversibles
        return self.reversibles

    def _k_startingwith(self, words, prefix):
        start = bisect.bisect_left(words, prefix)
        end = bisect.bisect(words, prefix + 'zzzz')
        n = end - start
        if self.k >= n: # get all the words that start with prefix
            results = words[start:end]
        else: # sample from words starting with prefix 
            indexes = random.sample(xrange(start, end), self.k)
            results = [words[i] for i in indexes]
        random.shuffle(results)
        ## Consider words that are prefixes of the prefix.
        ## This is very slow, so don't use it until late in the game.
        if self.tryharder:
            for i in range(3, len(prefix)):
                w = prefix[0:i]
                if ((words == self.words and w in self.truename) or
                    (words == self.rwords and reversestr(w) in self.truename)):
                    results.append(w)
        return results

paldict = PalDict() 

def anpdictshort():
    "Find the words that are valid when every phrase must start with 'a'"
    def segment(word):  return [s for s in word.split('a') if s]
    def valid(word): return all(reversestr(s) in segments for s in segment(word))
    words = map(canonical, file('anpdict.txt'))
    segments = set(s for w in words for s in segment(canonical(w)))
    valid_words = [paldict.truename[w] for w in words if valid(w)]
    file('anpdict-short.txt', 'w').write('\n'.join(valid_words))

################ Search for a palindrome

class Panama:
    def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):
        ## .left and .right hold lists of canonical words
        ## .diff holds the number of characters that are not matched,
        ##  positive for words on left, negative for right.
        ## .stack holds (action, side, arg) tuples
        update(self, left=[], right=[], best=0, seen={}, diff=0, stack=[],
               used_reversibles=False, starttime=time.clock(), dict=dict)
        for word in L.split(','):
            self.add('left', canonical(word))
        for rword in reversestr(R).split(','):
            self.add('right', canonical(reversestr(rword)))
        self.consider_candidates()
        
    def search(self, steps=50000000):
        "Search for palindromes."
        for _ in xrange(steps):
            if not self.stack:
                return 'done'
            action, dir, substr, arg = self.stack[-1]
            if action == 'added': # undo the last word added
                self.remove(dir, arg)
            elif action == 'trying' and arg: # try the next word if there is one
                self.add(dir, arg.pop()) and self.consider_candidates()
            elif action == 'trying' and not arg: # otherwise backtrack
                self.stack.pop()
            else:
                raise ValueError(action)

    def add(self, dir, word):
        "add a word"
        if word in self.seen:
            return False
        else:
            getattr(self, dir).append(word)
            self.diff += factor[dir] * len(word)
            self.seen[word] = True
            self.stack.append(('added', dir, '?', word))
            return True

    def remove(self, dir, word):
        "remove a word"
        oldword = getattr(self, dir).pop()
        assert word == oldword
        self.diff -= factor[dir] * len(word)
        del self.seen[word]
        self.stack.pop()
        
    def consider_candidates(self):
        """Push a new state with a set of candidate words onto stack."""
        if self.diff > 0: # Left is longer, consider adding on right
            dir = 'right'
            substr =  self.left[-1][-self.diff:]
            candidates = self.dict.endswith(substr)
        elif self.diff < 0: # Right is longer, consider adding on left
            dir = 'left'
            substr =  reversestr(self.right[-1][0:-self.diff])
            candidates = self.dict.startswith(substr)
        else: # Both sides are same size
            dir = 'left'
            if not self.used_reversibles:
                self.report()
                self.add_reversibles()
            substr = ''
            candidates = self.dict.startswith('')
        if substr == reversestr(substr):
            self.report()
        self.stack.append(('trying', dir, substr, candidates))

    def add_reversibles(self):
        "Add in reversible words."
        print 'using reversibles ...'
        for (word, rword) in self.dict.reversible_words().items():
            if word not in self.seen and rword not in self.seen:
                self.add('left', word)
                self.add('right', rword)
        self.used_reversibles = True
        self.stack = []
        print '...done'
                
    def report(self):
        "Report a new palindrome to log file (if it is sufficiently big)."
        N = len(self)
        if N > 13333:
            self.dict.tryharder = True
        if N > self.best and (N > 12500 or N > self.best+500):
            self.best = len(self)
            self.bestphrase = str(self)
            print '%5d phrases (%5d words) in %3d seconds' % (
                self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime)
            assert is_panama(self.bestphrase)
            f = open('pallog%d.txt' % (id(self) % 10000), 'w')
            f.write(self.bestphrase + '\n')
            f.close()

    def __len__(self):
        return len(self.left) + len(self.right)

    def __str__(self):
        truename = self.dict.truename
        lefts = [truename[w] for w in self.left]
        rights =[truename[w] for w in self.right]
        return ', '.join(lefts + rights[::-1])

factor = {'left': +1, 'right': -1}

# Note that we only allow one truename per canonical name.  Occasionally
# this means we miss a good word (as in "a node" vs. "an ode"), but there
# are only 665 of these truename collisions, and most of them are of the
# form "a mark-up" vs. "a markup" so it seemed better to disallow them.

################ Unit Tests
 
def tests(p=Panama()):
    assert is_panama('A man, a plan, a canal, Panama.')
    assert is_panama('''A (man),     a   plan,,;, a ```canal?'' -- Panama!''')
    assert not is_panama('A man, a plan, a radar, a canal, Panama.')
    assert is_palindrome('A man, a plan, a canal, Panama.')
    assert is_palindrome('radar, radar? radar!')
    assert not is_palindrome('radars')
    assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']
    assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'
    assert reversestr('foo') == 'oof'
    assert is_unique([1, 2, 3])
    assert not is_unique([1, 2, 2])
    d = p.dict
    def sameset(a, b): return set(a) == set(b)
    assert 'panama' in d
    assert d.words[0] in d
    assert d.words[-1] in d
    assert sameset(d.startswith('aword'), ['awording', 'awordbreak',
        'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',
        'awordlessness', 'aword', 'awordsmith'])
    assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',
        'asubglobosity', 'anonverbosity', 'agibbosity'])
    d.tryharder = True
    assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])
    d.tryharder = False
    assert d.startswith('oklahoma') == ['oklahoma']
    assert d.startswith('fsfdsfdsfds') == []
    print 'all tests pass'

if __name__ == '__main__': 
    p = Panama();
    tests(p)
    p.search()