import re
from __future__ import division
from __builtin__ import any, all, sum # (because ipython imports the numpy versions)

def verify(regex, winners, losers):
    "Return true iff the regex matches all winners but no losers."
    missed_winners = {W for W in winners if not re.search(regex, W)}
    matched_losers = {L for L in losers if re.search(regex, L)}
    if missed_winners:
        print "Error: should match but did not:", ', '.join(missed_winners)
    if matched_losers:
        print "Error: should not match but did:", ', '.join(matched_losers)
    return not (missed_winners or matched_losers)

def findregex(winners, losers):
    "Find a regex that matches all winners but no losers (sets of strings)."
    # Make a pool of regex components, then pick from them to cover winners.
    # On each iteration, add the 'best' component to 'solution',
    # remove winners covered by best, and keep in 'pool' only components
    # that still match some winner.
    pool = regex_components(winners, losers)
    solution = []
    def score(r): return 4 * len(matches(r, winners)) - len(r)
    while winners:
        best = max(pool, key=score)
        solution.append(best)
        winners = winners - matches(best, winners)
        pool = {r for r in pool if matches(r, winners)}
    return OR(solution)

def matches(regex, strings):
    "Return a set of all the strings that are matched by regex."
    return {s for s in strings if re.search(regex, s)}

OR  = '|'.join # Join a sequence of strings with '|' between them

def regex_components(winners, losers):
    "Return components that match at least one winner, but no loser."
    wholes = {'^'+winner+'$' for winner in winners}
    parts = {d for w in wholes for s in subparts(w) for d in dotify(s)}
    return wholes | {p for p in parts if not matches(p, losers)}

def subparts(word):
    "Return a set of subparts of word, consecutive characters up to length 4."
    return set(word[i:i+n] for i in range(len(word)) for n in (1, 2, 3, 4)) 

def dotify(part):
    "Return all ways to replace a subset of chars in part with '.'."
    if part == '':
        return {''}  
    else:
        return {c+rest for rest in dotify(part[1:]) 
                for c in replacements(part[0])}
       
def replacements(char):
    "Return replacement characters for char (char + '.' unless char is special)."
    if (char == '^' or char == '$'):
        return char
    else:
        return char + '.'
    
def words(text): 
    "Return a set of all words in the text, lowercased."
    return frozenset(text.lower().split())

def phrases(text, sep='/'): 
    "Return a set of all 'sep'-separated phrases in text, uppercased."
    return frozenset(line.strip() for line in text.upper().split(sep))

winners = words('''washington adams jefferson jefferson madison madison monroe 
    monroe adams jackson jackson van-buren harrison polk taylor pierce buchanan 
    lincoln lincoln grant grant hayes garfield cleveland harrison cleveland mckinley
    mckinley roosevelt taft wilson wilson harding coolidge hoover roosevelt 
    roosevelt roosevelt roosevelt truman eisenhower eisenhower kennedy johnson nixon 
    nixon carter reagan reagan bush clinton clinton bush bush obama obama''')

losers = words('''clinton jefferson adams pinckney pinckney clinton king adams 
    jackson adams clay van-buren van-buren clay cass scott fremont breckinridge 
    mcclellan seymour greeley tilden hancock blaine cleveland harrison bryan bryan 
    parker bryan roosevelt hughes cox davis smith hoover landon wilkie dewey dewey 
    stevenson stevenson nixon goldwater humphrey mcgovern ford carter mondale 
    dukakis bush dole gore kerry mccain romney''') - winners

boys = words('jacob mason ethan noah william liam jayden michael alexander aiden')
girls = words('sophia emma isabella olivia ava emily abigail mia madison elizabeth')

nfl_in = words('colts saints chargers 49ers seahawks patriots panthers broncos chiefs eagles bengals packers')
nfl_out = words('''jets dolphins bills steelers ravens browns titans jaguars texans raiders cowboys giants 
    redskins bears lions vikings falcons buccaneers cardinals rams''')

pharma = words('lipitor nexium plavix advair ablify seroquel singulair crestor actos epogen')
cities = words('paris trinidad capetown riga zurich shanghai vancouver chicago adelaide auckland')

foo = words('''afoot catfoot dogfoot fanfoot foody foolery foolish fooster footage foothot footle footpad footway 
    hotfoot jawfoot mafoo nonfood padfoot prefool sfoot unfool''')
bar = words('''Atlas Aymoro Iberic Mahran Ormazd Silipan altared chandoo crenel crooked fardo folksy forest 
    hebamic idgah manlike marly palazzi sixfold tarrock unfold''')

nouns = words('''time year people way day man thing woman life child world school 
    state family student group country problem hand part place case week company 
    system program question work government number night point home water room 
    mother area money story fact month lot right study book eye job word business 
    issue side kind head house service friend father power hour game line end member 
    law car city community name president team minute idea kid body information 
    back parent face others level office door health person art war history party result 
    change morning reason research girl guy moment air teacher force education''')
adverbs = words('''all particularly just less indeed over soon course still yet before 
    certainly how actually better to finally pretty then around very early nearly now 
    always either where right often hard back home best out even away enough probably 
    ever recently never however here quite alone both about ok ahead of usually already 
    suddenly down simply long directly little fast there only least quickly much forward 
    today more on exactly else up sometimes eventually almost thus tonight as in close 
    clearly again no perhaps that when also instead really most why ago off 
    especially maybe later well together rather so far once''') - nouns  
verbs = words('''ask believe borrow break bring buy can be able cancel change clean
    comb complain cough count cut dance draw drink drive eat explain fall
    fill find finish fit fix fly forget give go have hear hurt know learn
    leave listen live look lose make do need open close shut organise pay
    play put rain read reply run say see sell send sign sing sit sleep
    smoke speak spell spend stand start begin study succeed swim take talk
    teach tell think translate travel try turn off turn on type understand
    use wait wake up want watch work worry write''') - nouns

randoms = frozenset(vars(random))
builtins = frozenset(vars(__builtin__)) - randoms

starwars = phrases('''The Phantom Menace / Attack of the Clones / Revenge of the Sith /
    A New Hope / The Empire Strikes Back / Return of the Jedi''')

startrek = phrases('''The Wrath of Khan / The Search for Spock / The Voyage Home /
    The Final Frontier / The Undiscovered Country / Generations / First Contact /
    Insurrection / Nemesis''')

dogs = phrases(''''Labrador Retrievers / German Shepherd Dogs / Golden Retrievers / Beagles / Bulldogs / 
    Yorkshire Terriers / Boxers / Poodles / Rottweilers / Dachshunds / Shih Tzu / Doberman Pinschers / 
    Miniature Schnauzers / French Bulldogs / German Shorthaired Pointers / Siberian Huskies / Great Danes / 
    Chihuahuas / Pomeranians / Cavalier King Charles Spaniels / Shetland Sheepdogs / Australian Shepherds / 
    Boston Terriers / Pembroke Welsh Corgis / Maltese / Mastiffs / Cocker Spaniels / Havanese / 
    English Springer Spaniels / Pugs / Brittanys / Weimaraners / Bernese Mountain Dogs / Vizslas / Collies / 
    West Highland White Terriers / Papillons / Bichons Frises / Bullmastiffs / Basset Hounds / 
    Rhodesian Ridgebacks / Newfoundlands / Russell Terriers / Border Collies / Akitas / 
    Chesapeake Bay Retrievers / Miniature Pinschers / Bloodhounds / St. Bernards / Shiba Inu / Bull Terriers / 
    Chinese Shar-Pei / Soft Coated Wheaten Terriers / Airedale Terriers / Portuguese Water Dogs / Whippets / 
    Alaskan Malamutes / Scottish Terriers / Australian Cattle Dogs / Cane Corso / Lhasa Apsos / 
    Chinese Crested / Cairn Terriers / English Cocker Spaniels / Dalmatians / Italian Greyhounds / 
    Dogues de Bordeaux / Samoyeds / Chow Chows / German Wirehaired Pointers / Belgian Malinois / 
    Great Pyrenees / Pekingese / Irish Setters / Cardigan Welsh Corgis / Staffordshire Bull Terriers / 
    Irish Wolfhounds / Old English Sheepdogs / American Staffordshire Terriers / Bouviers des Flandres / 
    Greater Swiss Mountain Dogs / Japanese Chin / Tibetan Terriers / Brussels Griffons / 
    Wirehaired Pointing Griffons / Border Terriers / English Setters / Basenjis / Standard Schnauzers / 
    Silky Terriers / Flat-Coated Retrievers / Norwich Terriers / Afghan Hounds / Giant Schnauzers / Borzois / 
    Wire Fox Terriers / Parson Russell Terriers / Schipperkes / Gordon Setters / Treeing Walker Coonhounds''')
cats = phrases('''Abyssinian / Aegean cat / Australian Mist / American Curl / American Bobtail / 
    American Polydactyl / American Shorthair / American Wirehair / Arabian Mau / Asian / Asian Semi-longhair / 
    Balinese / Bambino / Bengal / Birman / Bombay / Brazilian Shorthair / British Shorthair / British Longhair / 
    Burmese / Burmilla / California Spangled Cat / Chantilly/Tiffany / Chartreux / Chausie / Cheetoh / 
    Colorpoint Shorthair / Cornish Rex / Cymric / Cyprus cat / Devon Rex / Donskoy or Don Sphynx / Dragon Li / 
    Dwelf / Egyptian Mau / European Shorthair / Exotic Shorthair / German Rex / Havana Brown / Highlander / 
    Himalayan-Colorpoint Persian / Japanese Bobtail / Javanese / Khao Manee / Korat / Korn Ja / 
    Kurilian Bobtail / LaPerm / Maine Coon / Manx / Mekong bobtail / Minskin / Munchkin / Nebelung / Napoleon / 
    Norwegian Forest Cat / Ocicat / Ojos Azules / Oregon Rex / Oriental Bicolor / Oriental Shorthair / 
    Oriental Longhair / Persian / Peterbald / Pixie-bob / Ragamuffin / Ragdoll / Russian Blue / Russian Black / 
    Sam Sawet / Savannah / Scottish Fold / Selkirk Rex / Serengeti cat / Serrade petit / Siamese / Siberian / 
    Singapura / Snowshoe / Sokoke / Somali / Sphynx / Swedish forest cat / Thai / Tonkinese / Toyger / 
    Turkish Angora / Turkish Van / Ukrainian Levkoy / York Chocolate Cat''')

findregex(starwars, startrek)

verify(' T|E.P| N', starwars, startrek)

%time findregex(adverbs, nouns)

import cProfile

cProfile.run('findregex(adverbs, nouns)', sort='cumulative')

def matches(regex, strings):
    "Return a set of all the strings that are matched by regex."
    searcher = re.compile(regex).search
    return {s for s in strings if searcher(s)}

re.purge();
%time findregex(adverbs, nouns)

searcher = re.compile('^a.o').search
strings = adverbs

%timeit {s for s in strings if searcher(s)}
%timeit set(filter(searcher, strings))

def matches(regex, strings):
    "Return a set of all the strings that are matched by regex."
    searcher = re.compile(regex).search
    return set(filter(searcher, strings))

re.purge();
%time findregex(adverbs, nouns)

%timeit re.purge(); regex_components(winners, losers)  

def regex_components(winners, losers):
    "Return components that match at least one winner, but no loser."
    losers_str = '\n'.join(losers)
    def no_losers(part): return not re.compile(part, re.MULTILINE).search(losers_str)
    wholes = {'^'+winner+'$' for winner in winners}
    parts = {d for w in wholes for p in subparts(w) for d in dotify(p)}
    return wholes | set(filter(no_losers, parts))

%timeit re.purge(); regex_components(winners, losers)

def findregex(winners, losers):
    "Find a regex that matches all winners but no losers (sets of strings)."
    # Make a pool of regex components, then pick from them to cover winners.
    # On each iteration, add the 'best' component to 'solution',
    # remove winners covered by best, and keep in 'pool' only components
    # that still match some winner.
    pool = regex_components(winners, losers)
    solution = []
    def score(r): return 4 * len(matches(r, winners)) - len(r)
    while winners:
        best = max(pool, key=score)
        solution.append(best)
        winners = winners - matches(best, winners)
        pool = {r for r in pool if matches(r, winners)}
    return OR(solution)

def regex_covers(winners, losers):
    """Generate regex components and return a dict of {regex: {winner...}}.
    Each regex matches at least one winner and no loser."""
    losers_str = '\n'.join(losers)
    wholes = {'^'+winner+'$' for winner in winners}
    parts = {d for w in wholes for p in subparts(w) for d in dotify(p)}
    pool = wholes | parts
    searchers = [re.compile(c, re.MULTILINE).search for c in pool]
    return {r: set(filter(searcher, winners)) 
            for (r, searcher) in zip(pool, searchers)
            if not searcher(losers_str)}

def findregex(winners, losers):
    "Find a regex that matches all winners but no losers (sets of strings)."
    # Initialize covers = {regex: {winner,...}} for a large set of regex components.
    # On each iteration, add the 'best' component to 'solution',
    # remove winners covered by best, and keep in 'pool' only components
    # that still match some winner.
    covers = regex_covers(winners, losers)
    pool = set(covers)
    solution = []
        
    def matches(r, winners): return {w for w in covers[r] if w in winners}
    
    def score(r): return 4 * len(matches(r, winners)) - len(r)
    
    while winners:
        best = max(pool, key=score)
        solution.append(best)
        winners = winners - covers[best]
        pool = {r for r in pool if matches(r, winners)}
    return OR(solution)

re.purge()
%time findregex(adverbs, nouns)

import time

ALL = [ # An example has two sets of strings, each with a one-character initial identifying it.
    (winners, 'W', 'L', losers),
    (boys, 'B', 'G', girls),
    (nfl_in, 'I', 'O', nfl_out),
    (pharma, 'P', 'C', cities),
    (foo, 'F', 'B', bar),
    (starwars, 'W', 'T', startrek),
    (nouns, 'N', 'A', adverbs),
    (nouns, 'N', 'V', verbs),
    (randoms, 'R', 'B', builtins),
    (dogs, 'D', 'C', cats)]

ALL = [d for datum in ALL for d in (datum, datum[::-1])] # Add in the reverse versions

SOLUTION = {} # Remember solutions; SOLUTION[W, L] will hold a regex
               
def benchmark(examples=ALL): 
    "Run examples; print summaries; return total of solution lengths."
    total = 0
    for (W, Wname, Lname, L) in examples:
        re.purge()
        t0 = time.time()
        solution = SOLUTION[W, L] = findregex(W, L)
        t1 = time.time()
        assert verify(solution, W, L)
        print '%3d ch %3d win %5.2f s %s-%s: %r' % (
            len(solution), len(W), t1-t0, Wname, Lname, solution)
        total += len(solution)
    return total

%time benchmark()

from collections import Counter, defaultdict

components = [r for solution in SOLUTION.values() for r in solution.split('|')]

Counter(map(len, components)).most_common()

hist(map(len, components));

max(components, key=len)

cat = ''.join

Counter(cat(components)).most_common(20)

Counter(components).most_common(20)

import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = (8.0, 4.0)

def show(W, L, N=10):
    covers = regex_covers(W, L)
    inv = invert_multimap(covers)
    for ((n1, w), (n2, r)) in zip(top(N, covers), top(N, inv)):
        print "%8s %2d | %3d %s" % (w, n1, n2, r)
    print "TOTAL %5d | %3d TOTAL" % (len(covers), len(W))
    subplot(1, 2, 1); histmm(covers, "regexes", "winners")
    subplot(1, 2, 2); histmm(inv, "winners", "regexes")
    
def top(N, multimap): 
    "The top N longest items in a dict of {key: {val,...})"
    return sorted([(len(vals), key) for (key, vals) in multimap.items()], reverse=True)[:N]

def histmm(multimap, key, val): 
    "Display a histogram of how many values each key has."
    hist(map(len, multimap.values()))
    xlabel('set size of {' + val + ',...}')
    ylabel('number of ' + key + ' mapping to that size')

def invert_multimap(multimap):
    "Covert {key: {val,...}} to {val: [key,...]}."
    result = defaultdict(set)
    for key in multimap:
        for val in multimap[key]:
            result[val].add(key)
    return result

show(winners, losers)

show(dogs, cats)

show(adverbs, nouns)
# Pseudocode 
def greedy_search(components, partial_solution=None):
    if is_complete(partial_solution):
        return partial_solution
    else:
        best = select_best(components)
        return greedy_search(components - best, partial_solution + best)# Pseudocode
def exhaustive_search(components, partial_solution=None):
    if is_complete(partial_solution):
        return partial_solution
    else:
        best = select_best(components)
        return min(exhaustive_search(components - best, partial_solution + best),
                   exhaustive_search(components - best, partial_solution),
                   key=cost)# Pseudocode
def branch_and_bound_search(components, partial_solution=None):
    if is_complete(partial_solution):
        SOLUTION = min(SOLUTION, partial_solution, key=cost)
    elif cost(partial_solution) < cost(SOLUTION):
        best = select_best(components)
        branch_and_bound_search(components - best, partial_solution + best)
        branch_and_bound_search(components - best, partial_solution)
    else:
        pass 
    return SOLUTION
def findregex(winners, losers, calls=100000):
    "Find the shortest disjunction of regex components that covers winners but not losers."
    solution = '^(' + OR(winners) + ')$'
    covers = regex_covers(winners, losers)
    return BranchBound(solution, calls).search(covers)

class BranchBound(object):
    "Hold state information for a branch and bound search."
    def __init__(self, solution, calls):
        self.solution, self.calls = solution, calls
    
    def search(self, covers, partial=None):
        """Recursively extend partial regex until it matches all winners in covers.
        Try all reasonable combinations until we run out of calls."""
        if self.calls <= 0: 
            return self.solution
        self.calls -= 1
        covers, partial = simplify_covers(covers, partial)
        if not covers: # Nothing left to cover; solution is complete
            self.solution = min(partial, self.solution, key=len)
        elif len(OR(partial, min(covers, key=len))) < len(self.solution):
            # Try with and without the greedy-best component
            def score(c): return 4 * len(covers[c]) - len(c)
            best = max(covers, key=score) # Best component
            covered = covers[best] # Set of winners covered by best
            covers.pop(best)
            self.search({c:covers[c]-covered for c in covers}, OR(partial, best))
            self.search(covers, partial)
        return self.solution

def simplify_covers(covers, partial=None):
    "Eliminate dominated regexes, and select ones that uniquely cover a winner."
    previous = None
    while covers != previous:
        previous = covers
        covers = eliminate_dominated(covers)
        covers, necessary = select_necessary(covers)
        partial = OR(partial, necessary)
    return covers, partial

def OR(*regexes):
    """OR together component regexes. Ignore 'None' components.
    Allows both OR(a, b, c) and OR([a, b, c]), similar to max."""
    if len(regexes) == 1: 
        regexes = regexes[0]
    return '|'.join(r for r in regexes if r)

def eliminate_dominated(covers):
    """Given a dict of {regex: {winner...}}, make a new dict with only the regexes
    that are not dominated by any others. A regex r is dominated by r2 if r2 covers 
    a superset of the matches covered by r, and r2 is shorter."""
    newcovers = {}
    def signature(r): return (-len(covers[r]), len(r))
    for r in sorted(covers, key=signature):
        if not covers[r]: break # All remaining r must not cover anything
        # r goes in newcache if it is not dominated by any other regex
        if not any(covers[r2] >= covers[r] and len(r2) <= len(r) 
                   for r2 in newcovers):
            newcovers[r] = covers[r]
    return newcovers

def select_necessary(covers):
    """Select winners covered by only one component; remove from covers.
    Return a pair of (covers, necessary)."""
    counts = Counter(w for r in covers for w in covers[r])
    necessary = {r for r in covers if any(counts[w] == 1 for w in covers[r])}
    if necessary:
        covered = {w for r in necessary for w in covers[r]}
        covers = {r:covers[r]-covered for r in covers if r not in necessary}
        return covers, OR(necessary)
    else:
        return covers, None

def test_bb():
    assert OR(['a', 'b', 'c']) == OR('a', 'b', 'c') == 'a|b|c'
    assert OR(['a|b', 'c|d']) == OR('a|b', 'c|d') == 'a|b|c|d'
    assert OR(None, 'c') == 'c'
    covers1 = {'a': {'ann', 'abe'}, 'ab': {'abe'}}
    assert eliminate_dominated(covers1) == {'a': {'ann', 'abe'}}
    assert simplify_covers(covers1) == ({}, 'a')
    covers2 = {'a': {'abe', 'cab'}, 'b': {'abe', 'cab', 'bee'}, 
               'c': {'cab', 'cee'}, 'c.': {'cab', 'cee'}, 'abe': {'abe'},
               'ab': {'abe', 'cab'}, '.*b': {'abe', 'cab', 'bee'}, 
               'e': {'abe', 'bee', 'cee'}, 'f': {}, 'g': {}}
    assert eliminate_dominated(covers2) == simplify_covers(covers2)[0] == {
        'c': {'cab', 'cee'}, 'b': {'cab', 'abe', 'bee'}, 'e': {'cee', 'abe', 'bee'}}
    covers3 = {'1': {'w1'}, '.1': {'w1'}, '2': {'w2'}}
    assert eliminate_dominated(covers3) == {'1': {'w1'}, '2': {'w2'}}
    assert simplify_covers(covers3) == ({}, '1|2')
    assert select_necessary({'a': {'abe'}, 'c': {'cee'}}) == ({}, 'a|c')
    assert {0, 1, 2} >= {1, 2}
    assert {1, 2} >= {1, 2}
    assert not ({1, 2, 4} >= {1, 3})
    return 'test_bb passes'

test_bb()

covers = regex_covers(winners, losers)
len(covers)

covers2, partial = simplify_covers(covers)
len(covers2), partial

1 - len(covers2)/len(covers)

def benchmark(data=ALL, calls=10000): 
    "Run these data sets; print summaries; return total of solution lengths."
    total = 0
    for (W, Wname, Lname, L) in data:
        re.purge()
        t0 = time.time()
        bb = bb_findregex(W, L, calls)
        t1 = time.time()
        SOLUTION[W, L] = bb.solution
        assert verify(bb.solution, W, L)
        print '{:3d} ch {:3d} win {:6.2f} s {:7,d} calls {}-{}: {!r}'.format(
            len(bb.solution), len(W), t1-t0, (calls - bb.calls), Wname, Lname, bb.solution)
        total += len(bb.solution)
    return total

def bb_findregex(winners, losers, calls=10000):
    "Find the shortest disjunction of regex components that covers winners but not losers."
    solution = '^(' + OR(winners) + ')$'
    bb = BranchBound(solution, calls)
    bb.search(regex_covers(winners, losers))
    return bb

%time benchmark(calls=1000)

%time benchmark(calls=10000)

%time benchmark(calls=100000)

def regex_covers(winners, losers):
    """Generate regex components and return a dict of {regex: {winner...}}.
    Each regex matches at least one winner and no loser."""
    losers_str = '\n'.join(losers)
    wholes = {'^'+winner+'$' for winner in winners}
    parts = {d for w in wholes for p in subparts(w) for d in dotify(p)}
    chars = set(cat(winners))
    pairs = {A+'.'+q+B for A in chars for B in chars for q in rep_chars}
    reps = {r for p in parts for r in repetitions(p)}
    pool = wholes | parts | pairs | reps                         
    searchers = [re.compile(c, re.MULTILINE).search for c in pool]
    return {r: set(filter(searcher, winners)) 
            for (r, searcher) in zip(pool, searchers)
            if not searcher(losers_str)}

def repetitions(part):
    """Return a set of strings derived by inserting a single repetition character ('+' or '*' or '?') 
    after each non-special character.  Avoid redundant repetition of dots."""
    splits = [(part[:i], part[i:]) for i in range(1, len(part)+1)]
    return {A + q + B 
            for (A, B) in splits
            # Don't allow '^*' nor '$*' nor '..*' nor '.*.'
            if not (A[-1] in '^$')
            if not A.endswith('..')
            if not (A.endswith('.') and B.startswith('.'))
            for q in rep_chars}

rep_chars = ('*', '+', '?')

def test_rep():
    assert repetitions('a') == {'a+', 'a*', 'a?'}
    assert repetitions('ab') == {'a+b', 'a*b', 'a?b', 
                                 'ab+', 'ab*', 'ab?'}
    assert repetitions('a.c') == {'a+.c', 'a*.c', 'a?.c',
                                  'a.c+', 'a.*c', 'a.?c',
                                  'a.+c', 'a.c*', 'a.c?'}
    assert repetitions('^a..d$') == {'^a+..d$', '^a*..d$', '^a?..d$', 
                                     '^a..d+$', '^a..d*$', '^a..d?$'}
    assert (eliminate_dominated(regex_covers(
           {'one', 'on'}, {'won', 'wuan', 'juan'}))
           == {'e': {'one'}, '^o': {'on', 'one'}})
    return 'test_rep passes'

test_rep()

findregex(starwars, startrek)

covers = regex_covers(winners, losers)
len(covers)

covers, partial = simplify_covers(covers)
len(covers)

subpart_size = 5

def subparts(word):
    "Return a set of subparts of word, consecutive characters up to length 5."
    return set(word[i:i+1+s] for i in range(len(word)) for s in range(subpart_size)) 

covers = regex_covers(winners, losers)
len(covers)

covers, partial = simplify_covers(covers)
len(covers)

%time benchmark(calls=1000)

print matches('H.+N.S', dogs)

%time benchmark(calls=10000)

%time benchmark(calls=100000)

import random

def bb_findregex(winners, losers, calls=10000, restarts=10):
    "Find the shortest disjunction of regex components that covers winners but not losers."
    solution = '^(' + OR(winners) + ')$'
    bb = BranchBoundRandom(solution, calls)
    covers = eliminate_dominated(regex_covers(winners, losers))
    for restart in range(restarts):
        bb.calls = calls
        bb.search(covers.copy())
        if bb.calls > 0: 
            return bb # If search was not cut off, then stop
    return bb

class BranchBoundRandom(BranchBound):
    def search(self, covers, partial=None):
        """Recursively extend partial regex until it matches all winners in covers.
        Try all reasonable combinations until we run out of calls."""
        if self.calls <= 0: 
            return partial, covers
        self.calls -= 1
        covers, partial = simplify_covers(covers, partial)
        if not covers: # Nothing left to cover; solution is complete
            self.solution = min(partial, self.solution, key=len)
        elif len(OR(partial, min(covers, key=len))) < len(self.solution):
            # Try with and without the greedy-best component
            K = random.choice((2, 3, 4, 4, 5, 6))
            F = random.choice((1., 1., 2.))
            def score(c): return K * len(covers[c]) - len(c) + random.uniform(0., F)
            best = max(covers, key=score) # Best component
            covered = covers[best] # Set of winners covered by r
            covers.pop(best)
            self.search({c:covers[c]-covered for c in covers}, OR(partial, best))
            self.search(covers, partial)
        return self.solution

%time benchmark(calls=100)

%time benchmark(calls=1000)

%time benchmark(calls=10000)

solution = '^(?!.*foo)'
verify(solution, bar, foo)

def negative_lookahead_solution(W, L): 
    "Find a solution for W but not L by negative lookahead on SOLUTION[L, W]."
    solution = '^(?!.*(' + SOLUTION[L, W] + '))'
    assert verify(solution, W, L)
    return solution

def better_solution(W, L):
    "Return the better of the 'regular' or negative lookahead solutions."
    return min(SOLUTION[W, L], negative_lookahead_solution(W, L),
               key=len)
    
%time sum(len(better_solution(W, L)) for (W, _, _, L) in ALL)

m = 60.
plot([4/m], [1355], 'gD-', label='Greedy')
plot([170/m, 184/m, 348/m], [1203, 1196, 1191], 'bo-', label='BB')
plot([178/m, 193/m, 345/m], [1177, 1170, 1165], 'rs-', label='BBR')
plot([345/m], [1079], 'k*-', markersize=12, label='BBR + Neg Look')
xlabel('Time (minutes)'); ylabel('Solution size (chars)')
legend(loc='upper right');