by David Taylor, www.prooffreader.com, prooffreader@gmail.com
a collection of tools to create and analyze lists of words using Python with pandas and matplotlib
determine letter distributions within the word based on a word list with frequencies
word list is pandas dataframe with columns 'word' and 'freq'. Any other columns will be ignored.
** initial_data_munge must be run first to create pickled dataframes of word lists **
# --- user configuration ---
dataframe_base = 'brown_words' # change as needed to point to pickle
dataframe_description = 'Brown Corpus from Natural Language Toolkit'
data_path = 'data'  # directory containing the pickled word-list dataframes
nb_path = 'letter_distributions'  # output directory for pickles and plots
save_filename = '' #used for .pickle and .png, leave as '' to use a default filename
import pandas as pd
import os
# Load the word-list dataframe (columns 'word' and 'freq') produced by
# initial_data_munge.
words = pd.read_pickle(data_path + "/" + dataframe_base + ".pickle")
b_len = 15  # number of bins, decided by user
# Pick the pickle path for the per-bin letter counts: either the
# user-supplied name or a default encoding the source corpus and bin count.
# BUGFIX: the original used assignment (=) instead of comparison (==),
# which is a syntax error.
if save_filename == '':
    letters_pickle = nb_path + '/' + 'letters_' + dataframe_base + '_' + str(b_len) + '.pickle'
else:
    letters_pickle = nb_path + '/' + save_filename + '.pickle'
# Build the letters dataframe (rows = bins, columns = letters) if it has
# not been pickled yet; otherwise load it from the pickle.
if not os.path.isfile(letters_pickle):
    print 'Calculating letters dataframe.'
    p_step = b_len # to facilitate readability; cross product
    # dataframe for results; z is just a temporary list to facilitate dataframe initialization
    z = [0] * b_len
    letters = pd.DataFrame({'a': z, 'b': z, 'c': z, 'd': z, 'e': z, 'f': z, 'g': z,
                            'h': z, 'i': z, 'j': z, 'k': z, 'l': z, 'm': z, 'n': z,
                            'o': z, 'p': z, 'q': z, 'r': z, 's': z, 't': z, 'u': z,
                            'v': z, 'w': z, 'x': z, 'y': z, 'z': z})
    # Each word's frequency is spread across the b_len bins in proportion
    # to how much of the word's length falls in each bin.  Both scales are
    # multiplied onto a common integer scale of 0..b_len*p_len so segment
    # boundaries are exact integers.
    for i in range(len(words)):
        freq = words.freq.iloc[i]
        wd = words.word.iloc[i]
        p_len = len(wd)  # number of letter positions in this word
        b_step = p_len
        bp_mult = b_len * p_len #use multiple instead of range of 0 to 1 (or 0 to 100) to avoid floats not adding together exactly
        b_curnum = 0 # current bin
        p_curnum = 0 # current letter
        curmult = 0 # current position of algorithm from 0 to bp_mult
        temp = 0  # iteration counter; guards against an infinite loop
        # NOTE(review): one-letter words are skipped entirely -- confirm intended
        if p_len > 1:
            while curmult < bp_mult:
                temp += 1
                # overlap = length of the intersection of the current bin
                # segment and the current letter segment on the common scale
                overlap = min((b_curnum + 1) * b_step, (p_curnum + 1) * p_step) - curmult
                # NOTE(review): raises KeyError for non a-z characters (a
                # try block was once here and is commented out); Python 2
                # integer division truncates the contribution
                letters[wd[p_curnum]][b_curnum] += freq * overlap / bp_mult
                curmult += overlap
                # advance whichever segment boundary we just reached
                if (b_curnum + 1) * b_step == curmult:
                    b_curnum += 1
                if (p_curnum + 1) * p_step == curmult:
                    p_curnum += 1
                if temp >= 100:
                    print "Error; more than 100 iterations on word " + wd
                    break
    letters.to_pickle(letters_pickle)
else:
    print 'Reading from pickle.'
    letters = pd.read_pickle(letters_pickle)
Reading from pickle.
Dataframes with bins as rows and letters as columns. Bin number equals row number, so iloc can be used to look up values.
* letters (raw frequencies)
* letters_norm (frequencies normalized so that each letter's maximum has a value of 100; integers, not floats)
* letters_equal_area (letters_norm adjusted so that the area under the slope is the same for all graphs)
Dataframe 'letters_stats' has statistics for each letter, the row indexes are the letters. Columns are:
* max_freq: maximum raw frequency of each letter
* max_bin: bin where max_freq occurs
* total_freq: the total frequency of each letter
* pct_freq: the total frequency as a percent of all letters; for representative English word lists, e is the top letter at about 12 percent
* norm_area: area under the normalized lines
Dict 'letters_overall' has statistics for the entire dataset;
* max_freq: the maximum raw frequency of any letter
* max_letter: the letter with the maximum raw frequency
* total_freq: the sum of the total frequencies of all letters
* max_pct: the highest pct_freq of any letter
List 'colors' is assigned by user, with nested lists of lower boundary (the first should normally be zero) and hex color string. The bins follow the usual [low, high) python boundaries. The maximum value is calculated for the user.
# Colour scale for the plots: (lower boundary of pct_freq, hex colour).
# The bins follow the usual [low, high) python boundaries.
_color_scale = (
    (0, '#ffffcc'),
    (0.1, '#ffeda0'),
    (0.5, '#fed976'),
    (1, '#feb24c'),
    (2, '#fd8d3c'),
    (3, '#fc4e2a'),
    (5, '#e31a1c'),
    (9, '#b10026'),
)
colors = [[lower, hex_color] for lower, hex_color in _color_scale]
alphabet = 'abcdefghijklmnopqrstuvwxyz'
# Per-letter statistics plus a normalized copy of the raw letters
# dataframe (each letter scaled so its maximum is 100).  Values are kept
# as integers; the graphs are narrow enough that truncation does not matter.
letters_norm = letters.copy()
letters_equal_area = letters.copy()
letters_overall = {}
letters_stats = pd.DataFrame({'max_freq': [0] * 26}, index=list(alphabet))
letters_stats['max_bin'] = 0
letters_stats['total_freq'] = 0
letters_stats['pct_freq'] = 0.0
letters_stats['norm_area'] = 0
letters_stats['color'] = ''
# .at replaces the deprecated .ix (removed in pandas >= 1.0) for
# scalar, label-based access
for ltr in alphabet:
    letters_stats.at[ltr, 'max_freq'] = letters[ltr].max()
    letters_stats.at[ltr, 'max_bin'] = letters[letters[ltr] == letters_stats.at[ltr, 'max_freq']].index[0]
    letters_stats.at[ltr, 'total_freq'] = letters[ltr].sum()
letters_overall['max_freq'] = letters_stats.max_freq.max()
letters_overall['total_freq'] = letters_stats.total_freq.sum()
letters_overall['max_letter'] = letters_stats[letters_stats.max_freq == letters_overall['max_freq']].iloc[0].name
for ltr in alphabet:
    letters_stats.at[ltr, 'pct_freq'] = (letters_stats.at[ltr, 'total_freq'] * 100.0
                                         / letters_overall['total_freq'])
    # Vectorized floor division replaces the per-element chained-.iloc
    # assignment; identical to the original Python 2 integer arithmetic
    # for these non-negative integer counts.
    letters_norm[ltr] = letters_norm[ltr] * 100 // letters_stats.at[ltr, 'max_freq']
letters_overall['max_pct'] = letters_stats.pct_freq.max()
# Assign each letter the colour of the highest threshold its pct_freq
# reaches, and compute the area under its normalized curve.
# (.at replaces the deprecated .ix / chained column indexing.)
for ltr in alphabet:
    color = ''
    # thresholds are in ascending order, so the last one satisfied wins
    for lower_bound, hex_color in colors:
        if letters_stats.at[ltr, 'pct_freq'] >= lower_bound:
            color = hex_color
    letters_stats.at[ltr, 'color'] = color
    # area under the normalized line: trapezoid rule on adjacent bins
    area = 0
    for rw in range(len(letters_norm) - 1):
        height0 = letters_norm[ltr].iloc[rw]
        height1 = letters_norm[ltr].iloc[rw + 1]
        area += min(height0, height1)
        area += 0.5 * abs(height1 - height0)
    # NOTE(review): norm_area is an int64 column, so the float area is
    # truncated on assignment, matching the original behaviour
    letters_stats.at[ltr, 'norm_area'] = area
letters_overall['max_area'] = letters_stats.norm_area.max()
# Rescale the normalized curves so every letter encloses the same area as
# the largest one, then rescale the result back to a 0-100 range.
# (Python 2 integer division truncates each per-element assignment into
# the int64 columns; preserved as-is.)
letters_overall['max_equal_area'] = 0
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_equal_area[ltr].iloc[rw] = (letters_norm[ltr].iloc[rw] *
            letters_overall['max_area'] / letters_stats.norm_area[ltr])
    letters_overall['max_equal_area'] = max(letters_overall['max_equal_area'], letters_equal_area[ltr].max())
#rescale to 100
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_equal_area[ltr].iloc[rw] *= 100
        letters_equal_area[ltr].iloc[rw] /= letters_overall['max_equal_area']
import math
# smallest integer >= max_pct, used for the plot legend
letters_overall['max_pct_for_legend'] = int(math.ceil(letters_overall['max_pct']))
letters_overall['max_compromise'] = 0
# 'compromise' curves: per-bin average of the normalized and equal-area
# curves, then rescaled to a 0-100 range.
letters_compromise = letters_norm.copy()
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_compromise[ltr].iloc[rw] = (letters_norm[ltr].iloc[rw] + letters_equal_area[ltr].iloc[rw]) / 2
    letters_overall['max_compromise'] = max(letters_overall['max_compromise'], letters_compromise[ltr].max())
#rescale to 100
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_compromise[ltr].iloc[rw] *= 100
        letters_compromise[ltr].iloc[rw] /= letters_overall['max_compromise']
Print parts of dataframes for sanity check:
print "letters['a']:"
print letters['a']
print "\nletters_norm['a']:"
print letters_norm['a']
print "\nletters_equal_area['a']:"
print letters_equal_area['a']
print "\nletters_compromise['a']:"
print letters_compromise['a']
print "\nletters_stats:"
print letters_stats
print "\nLetters_overall:"
print letters_overall
letters['a']: 0 5555 1 5379 2 5436 3 6069 4 7241 5 6180 6 6195 7 5307 8 4705 9 4417 10 2505 11 1460 12 816 13 268 14 178 Name: a, dtype: int64 letters_norm['a']: 0 76 1 74 2 75 3 83 4 100 5 85 6 85 7 73 8 64 9 60 10 34 11 20 12 11 13 3 14 2 Name: a, dtype: int64 letters_equal_area['a']: 0 34 1 33 2 33 3 37 4 45 5 38 6 38 7 33 8 29 9 27 10 15 11 8 12 4 13 1 14 0 Name: a, dtype: int64 letters_compromise['a']: 0 55 1 53 2 54 3 60 4 72 5 61 6 61 7 53 8 46 9 43 10 24 11 14 12 7 13 2 14 1 Name: a, dtype: int64 letters_stats: max_freq max_bin total_freq pct_freq norm_area color a 7241 4 61711 7.763015 806 #e31a1c b 2510 0 13683 1.721271 489 #feb24c c 2204 0 14703 1.849583 607 #feb24c d 5739 14 29344 3.691366 444 #fc4e2a e 12911 12 96941 12.194818 692 #b10026 f 2883 9 31145 3.917925 986 #fc4e2a g 1324 14 9848 1.238842 655 #feb24c h 9057 5 61327 7.714709 642 #e31a1c i 6988 6 53692 6.754254 736 #e31a1c j 244 0 1044 0.131331 367 #ffeda0 k 489 14 4313 0.542559 799 #fed976 l 2410 10 23446 2.949420 906 #fd8d3c m 1999 0 16735 2.105201 757 #fd8d3c n 6193 9 57395 7.220078 869 #e31a1c o 8328 5 79843 10.043953 910 #b10026 p 1804 0 9730 1.223998 474 #feb24c q 91 0 415 0.052205 401 #ffffcc r 3563 10 37179 4.676980 973 #fc4e2a s 6166 14 46107 5.800090 663 #e31a1c t 10132 0 88497 11.132594 790 #b10026 u 2279 6 17318 2.178540 736 #fd8d3c v 697 8 5615 0.706346 777 #fed976 w 3744 0 19512 2.454537 457 #fd8d3c x 198 2 898 0.112965 433 #ffeda0 y 3037 14 14292 1.797881 407 #feb24c z 39 8 203 0.025537 504 #ffffcc Letters_overall: {'total_freq': 794936, 'max_area': 986, 'max_equal_area': 268, 'max_letter': 'e', 'max_freq': 12911, 'max_compromise': 100, 'max_pct_for_legend': 13, 'max_pct': 12.194818199200942}
# Plot the 26 compromise curves as stacked filled line charts, one axis
# per letter, coloured by each letter's overall frequency.
save_plot = True
column_list = list('abcdefghijklmnopqrstuvwxyz')
x_length = b_len  # one x position per bin
import matplotlib.pyplot as plt
fig, axes = plt.subplots(26, 1, figsize=(12, 90))
for pos, ltr in enumerate(column_list):
    axes[pos].plot(range(x_length), letters_compromise[ltr], color='k', linewidth=3, label=ltr)
    axes[pos].set_ylim(0, 100)
    # .loc replaces the deprecated .ix (removed in pandas >= 1.0)
    fill_color = letters_stats['color'].loc[ltr]
    axes[pos].fill_between(range(x_length), letters_compromise[ltr], color=fill_color, interpolate=True)
    # strip all ticks/labels so only the filled curves and ylabels remain
    axes[pos].set_xticks([])
    axes[pos].set_yticks([])
    axes[pos].set_xticklabels([], size=0)
    axes[pos].set_yticklabels([])
    axes[pos].get_xaxis().set_visible(False)
    axes[pos].set_ylabel(ltr + ' ', size=24, rotation='horizontal')
plt.subplots_adjust(hspace=0.1)
# Output file: user-supplied name, or a default matching the pickle name.
if save_filename != '':
    plot_name = nb_path + '/' + save_filename + '.png'
else:
    plot_name = nb_path + '/letters_' + dataframe_base + '_' + str(b_len) + '.png'
if save_plot:
    plt.savefig(plot_name)