import pandas as pd
import time
from math import ceil
import pickle
import matplotlib.pyplot as plt
from math import floor, log10
%matplotlib inline
class progress_bar:
def __init__(self, loop_length):
import time
self.start = time.time()
self.increment_size = 100.0/loop_length
self.curr_count = 0
self.curr_pct = 0
self.overflow = False
print '% complete:',
def increment(self):
self.curr_count += self.increment_size
if int(self.curr_count) > self.curr_pct:
self.curr_pct = int(self.curr_count)
if self.curr_pct <= 100:
print self.curr_pct,
elif self.overflow == False:
print "\n*!* Count has gone over 100%; likely either due to:\n*!* - an error in the loop_length specified when " + \
"progress_bar was instantiated\n*!* - an error in the placement of the increment() function"
print '*!* Elapsed time when progress bar full: %0.1f seconds.' % (time.time() - self.start)
self.overflow = True
def finish(self):
if self.curr_pct == 99:
print "100", # this is a cheat, because rounding sometimes makes the maximum count 99. One day I'll fix this bug.
if self.overflow == True:
print '*!* Elapsed time after end of loop: %0.1f seconds.\n' % (time.time() - self.start)
else:
print '\nElapsed time: %0.1f seconds.\n' % (time.time() - self.start)
# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):
    """Round x up to the next step of its second-most-significant digit."""
    exponent = int(floor(log10(x))) - 1
    unit = 10 ** exponent
    return (floor(x / unit) + 1) * unit
# Load one pre-pickled COHA frequency table and reduce it to the columns
# needed here: one row per (word, decade-midpoint) with percent-of-corpus.
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df = df[df.nonalpha == False]  # keep purely alphabetic tokens only
df['year'] = df.decade + 5 # middle of decade
df = df[['word', 'year', 'pct']]
# NOTE(review): DataFrame.sort is the pre-0.20 pandas API (sort_values today)
df.sort(['word', 'year'], ascending=True, inplace=True)
print df.head()
words = df.word.unique()
print len(words)
word year pct 0 a 1815 1.763519 1 a 1825 1.901462 2 a 1835 2.061233 3 a 1845 2.073233 4 a 1855 2.069824 337085
# Drop words present in all 20 decades, i.e. words that never have a zero
# value (they cannot have a "trendy" rise-and-fall shape).
decade_counts = pd.DataFrame(df.groupby('word').pct.count())
always_present = list(decade_counts[decade_counts.pct == 20].index)
df = df[~df.word.isin(always_present)]
# make a set of top 1000 words for each year, both by max and by total
topwords = set()
for i in range(1815, 2015, 10):
    dftemp = df[df.year == i]
    dftempmax = dftemp.groupby('word')['pct'].max()
    # legacy pandas Series.sort() sorts ascending in place, so the last
    # 1000 entries are the 1000 largest values
    dftempmax.sort()
    dftemptotal = dftemp.groupby('word')['pct'].sum()
    dftemptotal.sort()
    # NOTE(review): if each word has only one row per year, max and sum
    # coincide and the two updates are redundant — confirm duplicates exist
    topwords.update(dftempmax[-1000:].index)
    topwords.update(dftemptotal[-1000:].index)
print len(topwords)
df = df[df.word.isin(topwords)]
8044
# Add missing years as pct 0, so every remaining word ends up with a row
# for every decade midpoint 1815..2005.
pbar = progress_bar(len(df))
# process the words 1000 at a time to keep the isin() slices manageable
bin_size = 1000
# NOTE: len(words)/bin_size is Python 2 integer division, so ceil() is a
# no-op here; the +1 is what guarantees the last partial bin is covered
bins = int(ceil(len(words)/bin_size)) + 1
new_word = []
new_year = []
new_pct = []
for i in range(bins):
    loopwords = words[i*bin_size:(i+1)*bin_size]
    loopdf = df[df.word.isin(loopwords)]
    for j in range(len(loopdf)):
        word = loopdf.word.iloc[j]
        year = loopdf.year.iloc[j]
        pbar.increment()
        # cur_yr is the next decade midpoint expected for this word; rows
        # are sorted word-then-year, so a word change restarts at 1815
        if j == 0 or word != loopdf.word.iloc[j-1]:
            cur_yr = 1815
        else:
            cur_yr += 10
        # pad absent decades before this row with pct 0 (leaves cur_yr == year)
        while cur_yr < year:
            new_word.append(word)
            new_year.append(cur_yr)
            new_pct.append(0)
            cur_yr += 10
        # On the word's last row, pad the remaining decades through 2005.
        # BUGFIX: the old code used `while cur_yr <= 2005 and cur_yr != year`,
        # but the gap-fill above always leaves cur_yr == year, so the tail
        # fill never ran: words whose last recorded decade was before 2005
        # got no trailing zeros (and were later silently skipped by the
        # `year == 2005` check in the trendiness loop).  The inequality now
        # only skips the existing row's own year instead of ending the loop.
        if j == len(loopdf) - 1 or word != loopdf.word.iloc[j+1]:
            while cur_yr <= 2005:
                if cur_yr != year:
                    new_word.append(word)
                    new_year.append(cur_yr)
                    new_pct.append(0)
                cur_yr += 10
pbar.finish()
% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 Elapsed time: 17.7 seconds.
print len(new_word)
print len(df)
# merge the new zero rows back in; legacy pandas APIs (DataFrame.append was
# removed in pandas 2.0, DataFrame.sort is the pre-0.20 sort_values)
df = df.append(pd.DataFrame({'word':new_word, 'year':new_year, 'pct':new_pct}), ignore_index = True)
df.sort(['word', 'year'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness_checkpoint.pickle')
38407 106758
# sanity check: a late-appearing word should now carry zero rows back to 1815
df_orig = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
print df_orig[df_orig.word=="dukakis"]
print df[df.word=="dukakis"]
word freq decade nonalpha length pct 650008 dukakis 10 1970 False 7 0.000043 650009 dukakis 570 1980 False 7 0.002344 650010 dukakis 70 1990 False 7 0.000260 650011 dukakis 28 2000 False 7 0.000098 pct word year 40608 0.000000 dukakis 1815 40609 0.000000 dukakis 1825 40610 0.000000 dukakis 1835 40611 0.000000 dukakis 1845 40612 0.000000 dukakis 1855 40613 0.000000 dukakis 1865 40614 0.000000 dukakis 1875 40615 0.000000 dukakis 1885 40616 0.000000 dukakis 1895 40617 0.000000 dukakis 1905 40618 0.000000 dukakis 1915 40619 0.000000 dukakis 1925 40620 0.000000 dukakis 1935 40621 0.000000 dukakis 1945 40622 0.000000 dukakis 1955 40623 0.000000 dukakis 1965 40624 0.000043 dukakis 1975 40625 0.002344 dukakis 1985 40626 0.000260 dukakis 1995 40627 0.000098 dukakis 2005
# add interpolated values for years ending in 0
# so that peaks can be calculated for single-decade words
pbar = progress_bar(len(df))
# 10,000 rows at a time
bin_size = 10000
# NOTE: len(df)/bin_size is Python 2 integer division, so ceil() is a
# no-op here; the +1 covers the last partial bin
bins = int(ceil(len(df)/bin_size)) + 1
new_word = []
new_year = []
new_pct = []
for i in range(bins):
    loopdf = df[i*bin_size:(i+1)*bin_size]
    for j in range(len(loopdf)):
        word = loopdf.word.iloc[j]
        year = loopdf.year.iloc[j]
        pbar.increment()
        # the first row of a word has no predecessor to average with
        # NOTE(review): positional binning can split one word across two
        # bins; the first row of a bin is then treated like a new word and
        # that boundary midpoint is skipped — verify bin edges
        if j == 0 or word != loopdf.word.iloc[j-1]:
            pass
        else:
            # insert the decade-boundary year (ending in 0) as the mean of
            # the two adjacent decade-midpoint values
            new_word.append(word)
            new_year.append(year - 5)
            avgpct = loopdf.pct.iloc[j]
            avgpct += loopdf.pct.iloc[j-1]
            avgpct /= 2
            new_pct.append(avgpct)
pbar.finish()
% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 Elapsed time: 24.9 seconds.
print len(new_word)
print len(df)
# merge the interpolated rows back in (legacy pandas append/sort APIs)
df = df.append(pd.DataFrame({'word':new_word, 'year':new_year, 'pct':new_pct}), ignore_index = True)
df.sort(['word', 'year'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness.pickle')
137108 145165
# sanity check: reload the saved pickle and compare against the original
df = pd.read_pickle("coha_1_trendiness.pickle")
df_orig = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
print df_orig[df_orig.word == 'dukakis']
print df[df.word == 'dukakis']
word freq decade nonalpha length pct 650008 dukakis 10 1970 False 7 0.000043 650009 dukakis 570 1980 False 7 0.002344 650010 dukakis 70 1990 False 7 0.000260 650011 dukakis 28 2000 False 7 0.000098 pct word year 78975 0.000000 dukakis 1815 78976 0.000000 dukakis 1820 78977 0.000000 dukakis 1825 78978 0.000000 dukakis 1830 78979 0.000000 dukakis 1835 78980 0.000000 dukakis 1840 78981 0.000000 dukakis 1845 78982 0.000000 dukakis 1850 78983 0.000000 dukakis 1855 78984 0.000000 dukakis 1860 78985 0.000000 dukakis 1865 78986 0.000000 dukakis 1870 78987 0.000000 dukakis 1875 78988 0.000000 dukakis 1880 78989 0.000000 dukakis 1885 78990 0.000000 dukakis 1890 78991 0.000000 dukakis 1895 78992 0.000000 dukakis 1900 78993 0.000000 dukakis 1905 78994 0.000000 dukakis 1910 78995 0.000000 dukakis 1915 78996 0.000000 dukakis 1920 78997 0.000000 dukakis 1925 78998 0.000000 dukakis 1930 78999 0.000000 dukakis 1935 79000 0.000000 dukakis 1940 79001 0.000000 dukakis 1945 79002 0.000000 dukakis 1950 79003 0.000000 dukakis 1955 79004 0.000000 dukakis 1960 79005 0.000000 dukakis 1965 79006 0.000022 dukakis 1970 79007 0.000043 dukakis 1975 79008 0.001194 dukakis 1980 79009 0.002344 dukakis 1985 79010 0.001302 dukakis 1990 79011 0.000260 dukakis 1995 79012 0.000179 dukakis 2000 79013 0.000098 dukakis 2005
# Identify "trendy" words: words that start (1815) and end (2005) below
# half their peak value, and measure how sharply they spike.
wordmax = df.groupby('word').pct.max()  # peak pct per word, for the cutoff
peak_height_cutoff = 0.5  # a word is "in its peak" at >= 50% of its max
words = []
years_start = []
years_max = []
years_end = []
trendiness = []
pbar = progress_bar(len(df))
# df is sorted word-then-year, so each word's rows run 1815..2005 in order
for i in range(len(df)):
    pbar.increment()
    year = df.year.iloc[i]
    pct = df.pct.iloc[i]
    if year == 1815:
        # first row of a new word: reset the per-word state
        # NOTE(review): dips_below_cutoff is NOT reset here — it carries
        # over from the previous word, and is unbound if the very first
        # word's 1815 value is already >= the cutoff — verify
        word = df.word.iloc[i]
        cur_max = wordmax[word]
        year_start = 0
        year_max = 0
        year_end = 0
        if pct < peak_height_cutoff * cur_max:
            starts_below_cutoff = True
        else:
            starts_below_cutoff = False
    if pct >= peak_height_cutoff * cur_max:
        # first crossing (after having been below) marks the peak's start;
        # year_end tracks the last year still at/above the cutoff
        if year_start == 0 and dips_below_cutoff == True:
            year_start = year
        year_end = year
    else:
        dips_below_cutoff = True
    if pct == cur_max:
        year_max = year
    if (year == 2005 and
        starts_below_cutoff == True and
        pct < peak_height_cutoff * cur_max): # equivalent of ends_below_cutoff
        # record the word; trendiness = peak height / peak width in years
        # NOTE(review): year_end == year_start would divide by zero; the
        # 5-year interpolation pass appears to guarantee width >= 5 — confirm
        words.append(word)
        years_start.append(year_start)
        years_max.append(year_max)
        years_end.append(year_end)
        trendiness.append(cur_max / (year_end - year_start))
pbar.finish()
% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 Elapsed time: 20.8 seconds.
# assemble results, ranked by trendiness (peak height / peak width)
trends = pd.DataFrame({'word':words, 'year_start':years_start, 'year_max':years_max, 'year_end':years_end,
                       'trendiness':trendiness})
trends = trends[['word', 'trendiness', 'year_start', 'year_max', 'year_end']]
# legacy pandas DataFrame.sort (sort_values today)
trends.sort('trendiness', ascending=False, inplace=True)
trends.to_csv('coha_trendiness.csv')
trends.to_pickle('coha_trendiness.pickle')
print len(trends)
print trends.head(50)
3941 word trendiness year_start year_max year_end 2955 reagan 0.003379 1980 1985 1990 2572 nixon 0.002831 1970 1975 1980 3687 uv 0.002767 1860 1865 1870 1931 kennedy 0.002747 1960 1965 1970 1099 eisenhower 0.002241 1950 1955 1960 3525 ter 0.001686 1880 1885 1890 734 communist 0.001660 1950 1955 1965 2793 planes 0.001219 1940 1945 1950 1870 jimmie 0.001180 1910 1915 1920 770 coolidge 0.001105 1920 1925 1930 1128 elsie 0.001072 1870 1875 1880 438 bradshaw 0.001070 1830 1835 1840 1965 korea 0.001056 1950 1955 1960 3064 rollo 0.001039 1850 1855 1860 3736 vietnam 0.001029 1960 1965 1975 3071 roosevelt 0.001027 1930 1935 1950 1917 katy 0.000983 1860 1865 1870 1458 graeme 0.000935 1860 1865 1870 1102 eleanor 0.000932 1920 1925 1930 3871 winthrop 0.000931 1850 1855 1860 1843 jeff 0.000906 1950 1955 1960 2183 madeleine 0.000891 1860 1865 1870 898 dave 0.000881 1910 1915 1920 735 communists 0.000877 1950 1955 1965 2006 lanny 0.000862 1940 1945 1950 1058 dulles 0.000836 1950 1955 1960 2686 pa 0.000820 1880 1885 1890 107 amy 0.000808 1860 1865 1870 1869 jimbo 0.000797 1970 1975 1980 1800 isabella 0.000783 1830 1835 1840 1952 kissinger 0.000781 1970 1975 1980 3340 soviet 0.000767 1950 1955 1990 2971 redwood 0.000761 1820 1825 1830 964 dewey 0.000757 1940 1945 1950 3407 stitch 0.000749 1870 1875 1880 1528 gypsy 0.000737 1860 1865 1870 1639 hev 0.000733 1860 1865 1870 1661 hitler 0.000718 1935 1945 1950 1131 elvira 0.000714 1820 1825 1830 2319 mcs 0.000714 1950 1955 1960 201 atomic 0.000709 1945 1955 1960 841 cuba 0.000687 1960 1965 1970 76 alessandro 0.000685 1880 1885 1890 3846 wilford 0.000683 1860 1865 1870 3632 truman 0.000681 1945 1955 1960 2211 malone 0.000672 1960 1965 1970 2193 magdalen 0.000665 1870 1875 1880 1966 korean 0.000663 1950 1955 1960 3092 rowland 0.000662 1870 1875 1880 3403 stevenson 0.000652 1950 1955 1960
# inspect the full time series for the top-ranked word
print df[df.word == 'reagan']
pct word year 211087 0.000000 reagan 1815 211088 0.000000 reagan 1820 211089 0.000000 reagan 1825 211090 0.000000 reagan 1830 211091 0.000000 reagan 1835 211092 0.000000 reagan 1840 211093 0.000000 reagan 1845 211094 0.000000 reagan 1850 211095 0.000000 reagan 1855 211096 0.000003 reagan 1860 211097 0.000006 reagan 1865 211098 0.000025 reagan 1870 211099 0.000044 reagan 1875 211100 0.000065 reagan 1880 211101 0.000086 reagan 1885 211102 0.000083 reagan 1890 211103 0.000079 reagan 1895 211104 0.000049 reagan 1900 211105 0.000019 reagan 1905 211106 0.000014 reagan 1910 211107 0.000009 reagan 1915 211108 0.000007 reagan 1920 211109 0.000004 reagan 1925 211110 0.000006 reagan 1930 211111 0.000008 reagan 1935 211112 0.000023 reagan 1940 211113 0.000038 reagan 1945 211114 0.000038 reagan 1950 211115 0.000038 reagan 1955 211116 0.001042 reagan 1960 211117 0.002045 reagan 1965 211118 0.002479 reagan 1970 211119 0.002912 reagan 1975 211120 0.018351 reagan 1980 211121 0.033789 reagan 1985 211122 0.018466 reagan 1990 211123 0.003143 reagan 1995 211124 0.002649 reagan 2000 211125 0.002156 reagan 2005
# top for each decade
# (trends is sorted by trendiness descending, so iloc[0] is the trendiest
# word whose peak falls in that decade midpoint; raises IndexError if a
# decade has no peaking word)
for i in range(1825,2005,10):
    print i, trends[trends.year_max == i].word.iloc[0]
1825 redwood 1835 bradshaw 1845 puffer 1855 rollo 1865 uv 1875 elsie 1885 ter 1895 madonna 1905 ivan 1915 jimmie 1925 coolidge 1935 roosevelt 1945 planes 1955 eisenhower 1965 kennedy 1975 nixon 1985 reagan 1995 epa
# used to round limit of y axis up to second-most-significant digit
# (re-declared here, identical to the earlier definition, so the charting
# cells can be run on their own)
def determine_y_limit(x):
    """Round x up in its second-most-significant digit."""
    magnitude = int(floor(log10(x)))
    step = 10 ** (magnitude - 1)
    return step * (floor(x / step) + 1)
def make_chart(df, words, form = 'line', title='', colors= [], smoothing=0,
baseline='sym', png_name='', ymax=None):
dataframe = df[df['word'].isin(words)]
dataframe = pd.DataFrame(pd.pivot_table(dataframe, values='pct', index = 'year', columns=['word']))
dataframe.sort(inplace=True, ascending=True)
startyear = min(list(dataframe.index))
endyear = max(list(dataframe.index))
yearstr = '%d-%d' % (startyear, endyear)
legend_size = 0.01
max_y = 0
for word in dataframe.columns:
max_y = max(max_y, dataframe[word].max())
final_word = word
if smoothing > 0:
newvalues = []
for row in range(len(dataframe)):
start = max(0, row - smoothing)
end = min(len(dataframe) - 1, row + smoothing)
newvalues.append(dataframe[word].iloc[start:end].mean())
for row in range(len(dataframe)):
dataframe[word].iloc[row] = newvalues[row]
y_text = "% of words in corpus"
num_series = len(dataframe.columns)
if colors == []:
colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
"#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
num_colors = len(colors)
if num_series > num_colors:
print "Warning: colors will be repeated."
x_values = list(dataframe.index)
y_zeroes = [0] * len(x_values)
if form == 'line':
fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
counter = 0
for word in words:
color = colors[counter % num_colors]
counter += 1
label = word
ax.plot(x_values, dataframe[word], label=label, color=color, linewidth = 3)
if ymax == None:
ax.set_ylim(0,determine_y_limit(max_y))
else:
ax.set_ylim(0, ymax)
ax.set_title(title, size = 20)
ax.set_xlim(startyear, endyear)
ax.set_ylabel(y_text, size = 20)
ax.set_xticks(range(1810, 2010, 10))
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * legend_size,
box.width, box.height * (1 - legend_size)])
legend_cols = min(5, num_series)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols, fontsize=16)
if form == 'subplots_auto':
counter = 0
fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
print 'Maximum alpha: %d percent' % (determine_y_limit(max_y))
for word in dataframe.columns:
label = word
current_ymax = dataframe[word].max()
tint = 1.0 * current_ymax / determine_y_limit(max_y)
axes[counter].plot(x_values, dataframe[word], color='k')
axes[counter].set_ylim(0,determine_y_limit(current_ymax))
axes[counter].set_xlim(startyear, endyear)
axes[counter].fill_between(x_values, dataframe[word], color=colors[0], alpha=tint, interpolate=True)
axes[counter].set_ylabel(label, size=11)
plt.subplots_adjust(hspace=0.1)
counter += 1
if form == 'subplots_same':
counter = 0
fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
print 'Maximum y axis: %d percent' % (determine_y_limit(max_y))
for word in dataframe.columns:
label = word
axes[counter].plot(x_values, dataframe[word], color='k')
axes[counter].set_ylim(0,determine_y_limit(max_y))
axes[counter].set_xlim(startyear, endyear)
axes[counter].fill_between(x_values, dataframe[word], color=colors[1], alpha=1, interpolate=True)
axes[counter].set_ylabel(label, size=11)
plt.subplots_adjust(hspace=0.1)
counter += 1
if form == 'stream':
figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
plt.title(title, size=17)
plt.xlim(startyear, endyear)
yaxtext = 'Percent of words in corpus'
scale = str(determine_y_limit(max_y)) + ')'
yaxtext += scale
plt.ylabel(yaxtext, size=13)
polys = pyplot.stackplot(x_values, *[dataframe[word] for word in dataframe.columns],
colors=colors, baseline=baseline)
legendProxies = []
for poly in polys:
legendProxies.append(pyplot.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
wordlist = []
for word in dataframe.columns:
wordlist.append(word)
plt.legend(legendProxies, wordlist, loc=3, ncol=2)
plt.tick_params(\
axis='y',
which='both', # major and minor ticks
left='off',
right='off',
labelleft='off')
plt.show()
if png_name != '':
fileword = save_path + "/" + png_name + ".png"
plt.savefig(fileword)
plt.close()
# the top "trendy" word of each decade, in chronological order
perdecade = ['redwood', 'bradshaw', 'puffer', 'rollo', 'uv', 'elsie', 'ter', 'madonna', 'ivan', 'jimmie', 'coolidge', 'roosevelt', 'planes', 'eisenhower', 'kennedy', 'nixon', 'reagan', 'epa']
print len(perdecade)
print perdecade.index('puffer')
18 2
# line chart of all 18 per-decade words; only six colors are supplied, so
# the palette repeats (hence the warning printed below)
make_chart(df=df,
           words = perdecade,
           form = 'line',
           title='\"Trendiest\" words in Corpus of Historical American English',
           colors = ["#1f78b4","#ae4ec9","#33a02c","#e31a1c",
                     "#009b89","#b15928"],
           smoothing=0,
           baseline='sym',
           png_name='',
           ymax = 0.05)
Warning: colors will be repeated.
#repeat plot but with repeating six colors in chronological order
# (this cell inlines make_chart's 'line' branch so the color cycle can
# follow the chronological `words` order instead of column order)
words = perdecade
title=''
smoothing=0
ymax=0.05
dataframe = df[df['word'].isin(words)]
# pivot to one column per word, indexed by year
dataframe = pd.DataFrame(pd.pivot_table(dataframe, values='pct', index = 'year', columns=['word']))
dataframe.sort(inplace=True, ascending=True)  # legacy pandas sort-by-index
startyear = min(list(dataframe.index))
endyear = max(list(dataframe.index))
yearstr = '%d-%d' % (startyear, endyear)
legend_size = 0.01  # fraction of figure height reserved for the legend
max_y = 0
for word in dataframe.columns:
    max_y = max(max_y, dataframe[word].max())
    final_word = word
    if smoothing > 0:
        # moving average over rows [row-smoothing, row+smoothing)
        # (inactive here since smoothing=0)
        newvalues = []
        for row in range(len(dataframe)):
            start = max(0, row - smoothing)
            end = min(len(dataframe) - 1, row + smoothing)
            newvalues.append(dataframe[word].iloc[start:end].mean())
        for row in range(len(dataframe)):
            dataframe[word].iloc[row] = newvalues[row]
y_text = "% of words in corpus"
num_series = len(dataframe.columns)
colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
          "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
num_colors = len(colors)
if num_series > num_colors:
    print "Warning: colors will be repeated."
x_values = list(dataframe.index)
y_zeroes = [0] * len(x_values)
fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
counter = 0
# iterate `words` (chronological) rather than the alphabetical columns so
# the repeating palette follows decade order
for word in words:
    color = colors[counter % num_colors]
    counter += 1
    label = word
    ax.plot(x_values, dataframe[word], label=label, color=color, linewidth = 3)
if ymax == None:
    ax.set_ylim(0,determine_y_limit(max_y))
else:
    ax.set_ylim(0, ymax)
ax.set_title(title, size = 20)
ax.set_xlim(startyear, endyear)
ax.set_ylabel(y_text, size = 20)
ax.set_xticks(range(1810, 2010, 10))
# shrink the axes upward to make room for the legend underneath
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
legend_cols = min(5, num_series)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols, fontsize=16)
Warning: colors will be repeated.
<matplotlib.legend.Legend at 0xcb06358>
# line chart of the ten trendiest words overall (default palette)
top10 = list(trends.word[:10])
make_chart(df=df,
           words = top10,
           form = 'line',
           title='Top 10 \"Trendiest\" words in the Corpus of Historical American English, 1810s-2000s',
           colors= [],
           smoothing=0,
           baseline='sym',
           png_name='',
           ymax = 0.045)
# note: all ten are also in the top-per-decade list; single-word example
make_chart(df=df,
           words = ['atomic'],
           form = 'line',
           title='',
           colors= ['#ee2222', '#4444aa'],
           smoothing=0,
           baseline='sym',
           png_name='',
           ymax = 0.015)