%matplotlib inline
import math
import re
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np

# Words shorter than this (after cleaning) are ignored by parse_word.
MIN_WORD_LENGTH = 2
# Number of histogram bins used by the plotting functions.
NUM_BINS = 15
# Raw strings keep the backslashes for the regex engine; in a plain string
# literal "\W" and "\d" are invalid escape sequences that newer Python
# versions warn about.
nonalpha_pattern = re.compile(r'[\W_]+')  # runs of non-word chars or underscores
digit_pattern = re.compile(r'[\d]+')      # runs of digits

def parse_word(freqs,l):
    """Record the normalized position of every letter of one word.

    The raw text ``l`` is stripped of surrounding whitespace, then of all
    non-word characters/underscores and of all digits.  If what remains is
    shorter than MIN_WORD_LENGTH nothing is recorded.  Otherwise, for each
    remaining character, its position scaled to the range 0.0 (first
    character) .. 1.0 (last character) is appended to ``freqs[<lowercased
    character>]``.

    ``freqs`` is mutated in place and also returned.
    """
    cleaned = digit_pattern.sub('', nonalpha_pattern.sub('', l.strip()))
    length = len(cleaned)
    if length < MIN_WORD_LENGTH:
        return freqs
    last_index = float(length) - 1
    for index, char in enumerate(cleaned):
        if last_index:
            position = index / last_index
        else:
            # a one-character word has no extent; pin its letter to 0.0
            position = 0.0
        freqs[char.lower()].append(position)
    return freqs

def parse_dictionary(path="/usr/share/dict/words"):
    """Collect letter positions from a word-list file with one word per line.

    Args:
        path: file to read; defaults to the system word list at
            /usr/share/dict/words, so existing no-argument calls behave as
            before.

    Returns:
        A defaultdict(list) filled by calling parse_word on every line of
        the file.

    Raises:
        Whatever open() raises when ``path`` cannot be read (for example
        when the system has no word list installed).
    """
    freqs = defaultdict(list)
    with open(path, "r") as f:
        for line in f:
            parse_word(freqs, line)
    return freqs
            
def parse_book(filename):
    """Collect letter positions from every whitespace-separated token of a text file.

    Each line of ``filename`` is split on whitespace and every resulting
    token is handed to parse_word.  Returns the defaultdict(list) that
    parse_word filled in.
    """
    positions = defaultdict(list)
    with open(filename,'r') as book:
        tokens = (token for line in book for token in line.split())
        for token in tokens:
            parse_word(positions, token)
    return positions

def get_counts(freqs):
    """Return a list of (count, letter) tuples, largest count first.

    ``count`` is ``len(freqs[letter])``.  Tuples with equal counts are
    ordered by letter, descending.  The result is a real list, so it can be
    iterated more than once, indexed and measured with len().
    """
    return sorted(
        ((len(positions), letter) for letter, positions in freqs.items()),
        reverse=True,
    )

def plot_histogram_grid(freqs,letters,maxy=None):
    """Show one histogram per letter, laid out on a roughly square grid.

    Args:
        freqs: mapping from letter to the data passed to ``Axes.hist`` for
            that letter's subplot.
        letters: sequence of keys of ``freqs`` (a string or a list); one
            subplot is drawn per entry, in the given order.
        maxy: optional upper y-limit applied to every subplot.  When it is
            None (or otherwise falsy) each subplot keeps its own y-range.

    Nothing is drawn when ``letters`` is empty.
    """
    cnt = len(letters)
    if cnt == 0:
        # nothing to plot; also avoids dividing by zero columns below
        return
    # add_subplot needs integer grid dimensions
    cols = int(math.floor(math.sqrt(cnt)))
    rows = int(math.ceil(float(cnt) / cols))
    fig = plt.figure(figsize=(4*cols,3*rows))
    for i, letter in enumerate(letters):
        ax = fig.add_subplot(rows,cols,i+1)
        ax.hist(freqs[letter],NUM_BINS,histtype="bar")
        ax.set_xticks([])
        if i == cnt - 1:
            # only the last subplot carries the axis labels
            ax.set_xlabel("Normalized Position")
            ax.set_ylabel("Counts")
        if maxy:
            ax.set_ylim([0,maxy])
        ax.set_title("%s"%letter)
    plt.show()

# letter positions gathered from the dictionary word list
freqs = parse_dictionary()

# let's just look at the vowels
plot_histogram_grid(freqs, 'aeiou', 50000)

# now create an ordered grid of usage.  E is used most often, then I, then A, etc.
plot_histogram_grid(freqs, [letter for _, letter in get_counts(freqs)], 50000)

# the same analysis for the words of a book
moby = parse_book("/Users/rallen/Downloads/moby_dick.txt")

plot_histogram_grid(moby, 'aeiou', 45000)

plot_histogram_grid(moby, [letter for _, letter in get_counts(moby)], 45000)

plot_histogram_grid(moby, string.ascii_lowercase, 45000)

# pair up the dictionary and book data per letter so that each subplot
# receives both datasets
mfreqs = defaultdict(list)
mfreqs.update(
    (letter, [freqs[letter], moby[letter]]) for letter in string.ascii_lowercase
)


plot_histogram_grid(mfreqs, string.ascii_lowercase, 50000)

# Fill colors used by plot_histogram_pretty, as [threshold, hex color] pairs
# listed in ascending threshold order.  Each threshold is compared against a
# letter's share of all recorded letters, in percent; the last pair whose
# threshold the share reaches supplies the fill color, so more frequent
# letters get the entries further down the list.
colors = [[0, '#ffffcc'],
          [0.1, '#ffeda0'],
          [0.5, '#fed976'],
          [1, '#feb24c'],
          [2, '#fd8d3c'],
          [3, '#fc4e2a'],
          [5, '#e31a1c'],
          [9, '#b10026']]

def plot_histogram_pretty(freqs,letters,in_maxy=None):
    """Show one filled position curve per letter, shaded by how common the letter is.

    For every entry of ``letters`` the positions in ``freqs[letter]`` are
    binned into NUM_BINS bins over [0.0, 1.0] and drawn as a black line with
    the area underneath filled.  The fill color is looked up in the
    module-level ``colors`` table from the letter's share (in percent) of
    all positions recorded in ``freqs``.

    Args:
        freqs: mapping from letter to a list of normalized positions.
        letters: sequence of keys of ``freqs`` (a string or a list); one
            subplot per entry, in the given order.
        in_maxy: optional upper y-limit shared by all subplots.  When None,
            each subplot is scaled to its own tallest bin.

    Nothing is drawn when ``letters`` is empty.
    """
    cnt = len(letters)
    if cnt == 0:
        # nothing to plot; also avoids dividing by zero columns below
        return
    total_letters = sum(count for count, _letter in get_counts(freqs))
    cols = int(math.floor(math.sqrt(cnt)))
    rows = int(math.ceil(float(cnt)/cols))
    # squeeze=False keeps `ax` two-dimensional even when the grid has a
    # single row or column, so ax[ir][ic] is always valid
    fig, ax = plt.subplots(rows, cols, figsize=(6*cols,rows*3), squeeze=False)
    xs = np.arange(NUM_BINS)
    for i in range(rows*cols):
        # integer row/column of grid cell i (plain `/` yields a float index)
        ir, ic = divmod(i, cols)
        cell = ax[ir][ic]
        if i >= cnt:
            # surplus cell of the grid: remove it
            fig.delaxes(cell)
            continue
        letter = letters[i]
        n, bins = np.histogram(freqs[letter],NUM_BINS,(0.0,1.0))
        cell.plot(xs, n, color='k', linewidth = 3)
        maxy = np.max(n) if in_maxy is None else in_maxy
        # the letter's share of all recorded letters, in percent
        if total_letters:
            share = 100.0*len(freqs[letter])/total_letters
        else:
            share = 0.0
        color = ''
        # thresholds ascend, so the last one reached wins
        for threshold, shade in colors:
            if share >= threshold:
                color = shade
        cell.fill_between(xs, n, color=color, interpolate=True)
        cell.set_ylim(0,maxy)
        cell.set_xlim(0,NUM_BINS-1)
        cell.set_xticks([])
        cell.set_yticks([])
        # trailing spaces push the horizontal label away from the axis
        cell.set_ylabel(letter+'       ', size=18, rotation='horizontal')
    plt.show()

# vowels only, each subplot scaled to its own tallest bin
plot_histogram_pretty(moby, "aeiou")

# the whole alphabet, in alphabetical order
plot_histogram_pretty(moby, string.ascii_lowercase)

# does the shading work, or should we keep the y-limit the same for all graphs?  What do you think?
plot_histogram_pretty(moby, [letter for _, letter in get_counts(moby)], 45000)