winners = set('''washington adams jefferson jefferson madison madison monroe 
monroe adams jackson jackson vanburen harrison polk taylor pierce buchanan 
lincoln lincoln grant grant hayes garfield cleveland harrison cleveland mckinley
 mckinley roosevelt taft wilson wilson harding coolidge hoover roosevelt 
roosevelt roosevelt roosevelt truman eisenhower eisenhower kennedy johnson nixon 
nixon carter reagan reagan bush clinton clinton bush bush obama obama'''.split())

losers = set('''clinton jefferson adams pinckney pinckney clinton king adams 
jackson adams clay vanburen vanburen clay cass scott fremont breckinridge 
mcclellan seymour greeley tilden hancock blaine cleveland harrison bryan bryan 
parker bryan roosevelt hughes cox davis smith hoover landon wilkie dewey dewey 
stevenson stevenson nixon goldwater humphrey mcgovern ford carter mondale 
dukakis bush dole gore kerry mccain romney'''.split())

winners & losers

losers = losers - winners

import re

def verify(regex, winners, losers):
    "Return true iff the regex matches all winners but no losers."
    missed_winners = {w for w in winners if not re.search(regex, w)}
    matched_losers = {L for L in losers if re.search(regex, L)}
    if missed_winners:
        print "Error: should match but did not:", ', '.join(missed_winners)
    if matched_losers:
        print "Error: should not match but did:", ', '.join(matched_losers)
    return not (missed_winners or matched_losers)

xkcd = "bu|[rn]t|[coy]e|[mtg]a|j|iso|n[hl]|[ae]d|lev|sh|[lnd]i|[po]o|ls"
verify(xkcd, winners, losers)

verify(xkcd, winners, losers - {'fremont'} | {'fillmore'})

def findregex(winners, losers):
    "Find a regex that matches all winners but no losers (sets of strings)."
    # Make a pool of regex components, then pick from them to cover winners.
    # On each iteration, add the 'best' component to 'cover',
    # remove winners covered by best, and keep in 'pool' only components
    # that still match some winner.
    pool = regex_components(winners, losers)
    cover = []
    while winners:
        best = max(pool, key=lambda c: 3 * len(matches(c, winners)) - len(c))
        cover.append(best)
        winners = winners - matches(best, winners)
        pool = {c for c in pool if matches(c, winners)}
    return '|'.join(cover)

def regex_components(winners, losers):
    "Return components that match at least one winner, but no loser."
    wholes = {'^'+winner+'$' for winner in winners}
    parts = {d for w in wholes for p in subparts(w) for d in dotify(p) 
             if not matches(d, losers)}
    return wholes | parts

def subparts(word):
    "Return a set of subparts of word, consecutive characters up to length 4."
    return set(word[i:i+n] for i in range(len(word)) for n in (1, 2, 3, 4)) 

def dotify(part):
    "Return all ways to replace a subset of chars in part with '.'."
    if part == '':
        return {''}  
    else:
        return {c+rest for rest in dotify(part[1:]) for c in replacements(part[0])}
        
def replacements(char):
    "Return possible replacement characters for char (char + '.' unless char is special)."
    return char if (char in '^$') else char + '.'

def matches(regex, strings):
    "Return a set of all the strings that are matched by regex."
    return {s for s in strings if re.search(regex, s)}

def test():
    assert regex_components({'win'}, {'losers', 'bin', 'won'}) == {
      '^win$', '^win', '^wi.', 'wi.',  'wi', '^wi', 'win$', 'win', 'wi.$'}
    assert subparts('^it$') == {'^', 'i', 't', '$', '^i', 'it', 't$', '^it', 'it$', '^it$'}
    assert subparts('this') == {'t', 'h', 'i', 's', 'th', 'hi', 'is', 'thi', 'his', 'this'}
    assert dotify('it') == {'it', 'i.', '.t', '..'}
    assert dotify('^it$') == {'^it$', '^i.$', '^.t$', '^..$'}
    assert dotify('this') == {'this', 'thi.', 'th.s', 'th..', 't.is', 't.i.', 't..s', 't...',
                              '.his', '.hi.', '.h.s', '.h..', '..is', '..i.', '...s', '....'}
    assert replacements('x') == 'x.'
    assert replacements('^') == '^'
    assert replacements('$') == '$'
    assert matches('a|b|c', {'a', 'b', 'c', 'd', 'e'}) == {'a', 'b', 'c'}
    assert matches('a|b|c', {'any', 'bee', 'succeed', 'dee', 'eee!'}) == {'any', 'bee', 'succeed'}
    assert verify('a+b+', {'ab', 'aaabb'}, {'a', 'bee', 'a b'})
    assert findregex({"ahahah", "ciao"},  {"ahaha", "bye"}) == 'a.$' 
    return 'test passes'

test()

findregex(winners, losers)

solution = findregex(winners, losers)
verify(solution, winners, losers)

def findboth(list1, list2):
    "Find, in both directions, a regex to separate one list from the other.  Print summary."
    def find1(list1, list2, legend):
        solution = findregex(list1, list2)
        assert verify(solution, list1, list2)
        trivial = '|'.join(list1)
        print '%s: %2d chars, %4.1f ratio: %r' % (
            legend, len(solution), len(trivial) / float(len(solution)), solution)
    find1(list1, list2, '1 vs. 2')
    find1(list2, list1, '2 vs. 1')

findboth(winners, losers)

boys  = set('jacob mason ethan noah william liam jayden michael alexander aiden'.split())
girls = set('sophia emma isabella olivia ava emily abigail mia madison elizabeth'.split())

findboth(boys, girls)

nfl_in = set('''colts saints chargers 49ers seahawks patriots panthers broncos chiefs eagles bengals packers'''.split())
nfl_out = set('''jets dolphins bills steelers ravens browns titans jaguars texans raiders cowboys giants 
                redskins bears lions vikings falcons buccaneers cardinals rams'''.split())

findboth(nfl_in, nfl_out)

drugs = set('lipitor nexium plavix advair ablify seroquel singulair crestor actos epogen'.split())
cities = set('paris trinidad capetown riga zurich shanghai vancouver chicago adelaide auckland'.split())

findboth(drugs, cities)

starwars = set('''The Phantom Menace 
Attack of the Clones 
Revenge of the Sith 
A New Hope
The Empire Strikes Back 
Return of the Jedi'''.upper().splitlines())

startrek = set('''The Wrath of Khan
The Search for Spock
The Voyage Home
The Final Frontier 
The Undiscovered Country 
Generations 
First Contact 
Insurrection 
Nemesis'''.upper().splitlines())

findboth(starwars, startrek)

verify('M | [TN]|B', starwars, startrek)

foo = set('''afoot catfoot dogfoot fanfoot foody foolery foolish fooster footage foothot footle footpad footway 
          hotfoot jawfoot mafoo nonfood padfoot prefool sfoot unfool'''.split())

bar = set('''Atlas Aymoro Iberic Mahran Ormazd Silipan altared chandoo crenel crooked fardo folksy forest 
          hebamic idgah manlike marly palazzi sixfold tarrock unfold'''.split())

findboth(foo, bar)