# Letter-position histograms: for every letter, record where inside each word
# it occurs (0.0 = first character, 1.0 = last character) and plot the result.

# Enable inline figures when running under IPython/Jupyter. In a plain Python
# interpreter get_ipython is undefined, so the magic is simply skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import math
import re
import string
from collections import defaultdict

import matplotlib.pyplot as plt

MIN_WORD_LENGTH = 2  # words shorter than this (after cleaning) are ignored
NUM_BINS = 15        # number of histogram bins

# Raw strings: '\W' and '\d' are regex escapes, not Python string escapes.
nonalpha_pattern = re.compile(r'[\W_]+')
digit_pattern = re.compile(r'[\d]+')


def parse_word(freqs, l):
    """Record the normalized position of every letter of one word.

    ``l`` is stripped, then every non-alphanumeric character, underscore and
    digit is removed. Words shorter than ``MIN_WORD_LENGTH`` after cleaning
    are skipped. For each remaining character the value ``index / (len - 1)``
    is appended to ``freqs[lowercased character]``.

    freqs -- mapping of letter -> list of positions (e.g. defaultdict(list));
             it is modified in place.
    l     -- raw word or line of text.

    Returns the same ``freqs`` object.
    """
    word = nonalpha_pattern.sub('', l.strip())  # remove non-alphanum chars
    word = digit_pattern.sub('', word)          # remove numerals
    if len(word) < MIN_WORD_LENGTH:
        return freqs

    word_max_index = float(len(word) - 1)
    for i, c in enumerate(word):
        # A single-letter word (only reachable when MIN_WORD_LENGTH <= 1)
        # has no span to normalize over; its only letter sits at 0.0.
        normalized_position = i / word_max_index if word_max_index else 0.0
        freqs[c.lower()].append(normalized_position)
    return freqs


def parse_dictionary(path="/usr/share/dict/words"):
    """Build letter-position data from a word list with one word per line.

    path -- word-list file to read; defaults to the system dictionary.

    Returns a defaultdict(list) mapping letter -> list of positions.
    """
    freqs = defaultdict(list)
    with open(path, "r") as f:
        for l in f:
            parse_word(freqs, l)
    return freqs


def parse_book(filename):
    """Build letter-position data from every whitespace-separated word of a text file.

    filename -- path of the text file to read.

    Returns a defaultdict(list) mapping letter -> list of positions.
    """
    freqs = defaultdict(list)
    with open(filename, 'r') as f:
        for l in f:
            for w in l.split():
                parse_word(freqs, w)
    return freqs


def get_counts(freqs):
    """Return a reverse-sorted list of (count, letter) tuples.

    ``count`` is the number of positions recorded for ``letter``. A real list
    is returned (not a one-shot iterator) so the result can be indexed and
    iterated more than once.
    """
    return sorted(
        ((len(positions), letter) for letter, positions in freqs.items()),
        reverse=True,
    )


def plot_histogram_grid(freqs, letters, maxy=None):
    """Draw one position histogram per letter on a roughly square grid.

    freqs   -- mapping of letter -> positions, as built by parse_word.
    letters -- sequence (string or list) of letters to plot, in grid order.
    maxy    -- optional shared upper y-limit; when falsy each subplot keeps
               its automatic limits.
    """
    cnt = len(letters)
    if cnt == 0:
        return  # nothing to draw; also avoids dividing by a zero column count

    # add_subplot needs integer grid dimensions; float(cnt) keeps the
    # division exact even where '/' is integer division.
    cols = int(math.floor(math.sqrt(cnt)))
    rows = int(math.ceil(float(cnt) / cols))
    fig = plt.figure(figsize=(4*cols, 3*rows))
    for i, letter in enumerate(letters):
        ax = fig.add_subplot(rows, cols, i+1)
        ax.hist(freqs[letter], NUM_BINS, histtype="bar")
        ax.set_xticks([])
        # Only the last subplot carries the axis labels.
        if i == cnt - 1:
            ax.set_xlabel("Normalized Position")
            ax.set_ylabel("Counts")
        if maxy:
            ax.set_ylim([0, maxy])
        ax.set_title("%s" % letter)
    plt.show()


freqs = parse_dictionary()

# let's just look at the vowels
plot_histogram_grid(freqs, 'aeiou', 50000)

# now create an ordered grid of usage. E is used most often, then I, then A, etc.
plot_histogram_grid(freqs, [l[1] for l in get_counts(freqs)], 50000)

# NOTE(review): machine-specific absolute path -- point it at your own copy of the text.
moby = parse_book("/Users/rallen/Downloads/moby_dick.txt")
plot_histogram_grid(moby, 'aeiou', 45000)
plot_histogram_grid(moby, [l[1] for l in get_counts(moby)], 45000)
plot_histogram_grid(moby, string.ascii_lowercase, 45000)

# Pair the dictionary and the book positions for each letter so that every
# subplot is handed both datasets at once.
mfreqs = defaultdict(list)
for k in string.ascii_lowercase:
    mfreqs[k] = [freqs[k], moby[k]]
plot_histogram_grid(mfreqs, string.ascii_lowercase, 50000)

# [minimum share of all letters in percent, fill colour] pairs in ascending
# order; the last pair whose threshold is reached wins.
colors = [
    [0, '#ffffcc'],
    [0.1, '#ffeda0'],
    [0.5, '#fed976'],
    [1, '#feb24c'],
    [2, '#fd8d3c'],
    [3, '#fc4e2a'],
    [5, '#e31a1c'],
    [9, '#b10026'],
]


def plot_histogram_pretty(freqs, letters, in_maxy=None):
    """Draw a filled position curve per letter, shaded by overall letter share.

    freqs   -- mapping of letter -> list of normalized positions.
    letters -- sequence (string or list) of letters to plot, in grid order.
    in_maxy -- optional shared upper y-limit; when None each subplot is scaled
               to its own largest bin.

    Each subplot's fill colour is chosen from ``colors`` according to the
    percentage of all recorded letters that the subplot's letter accounts for.
    """
    # Imported here so the function does not depend on names that only a
    # pylab-style interactive session would inject.
    import math
    import numpy as np

    cnt = len(letters)
    if cnt == 0:
        return  # nothing to draw; also avoids dividing by a zero column count

    total_letters = sum(len(positions) for positions in freqs.values())

    cols = int(math.floor(math.sqrt(cnt)))
    rows = int(math.ceil(float(cnt) / cols))
    # squeeze=False keeps the axes array 2-D even for a single row or column,
    # so the [row][col] indexing below always works.
    fig, ax = plt.subplots(rows, cols, figsize=(6*cols, rows*3), squeeze=False)
    x = np.arange(NUM_BINS)

    for i in range(rows * cols):
        # Floor division: the row index must be an integer.
        cell = ax[i // cols][i % cols]
        if i >= cnt:
            # The grid has more cells than letters; drop the unused axes.
            fig.delaxes(cell)
            continue

        letter = letters[i]
        n, _ = np.histogram(freqs[letter], NUM_BINS, (0.0, 1.0))
        cell.plot(x, n, color='k', linewidth=3)

        maxy = np.max(n) if in_maxy is None else in_maxy

        # Percentage of all recorded letters that this letter accounts for.
        if total_letters:
            share = 100.0 * len(freqs[letter]) / total_letters
        else:
            share = 0.0
        color = ''
        for threshold, shade in colors:
            if share >= threshold:
                color = shade

        cell.fill_between(x, n, color=color, interpolate=True)
        cell.set_ylim(0, maxy)
        cell.set_xlim(0, NUM_BINS-1)
        cell.set_xticks([])
        cell.set_yticks([])
        cell.set_ylabel(letter+' ', size=18, rotation='horizontal')
    plt.show()


plot_histogram_pretty(moby, "aeiou")
plot_histogram_pretty(moby, string.ascii_lowercase)

# does the shading work, or should we keep the y-limit the same for all graphs? What do you think?
plot_histogram_pretty(moby, [l[1] for l in get_counts(moby)], 45000)