by David Taylor, www.prooffreader.com, [email protected]
From a collection of tools to create and analyze lists of words using Python with pandas and matplotlib.
import pandas as pd
import os
import time
words = pd.read_pickle('gadsby_analysis.pickle')
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def increment_dict(d, key, increment=1):  # written before I found collections.Counter
    if key in d:
        d[key] += increment
    else:
        d[key] = increment
    return d
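# A minimal sketch (not part of the original) of the collections.Counter equivalent the
# comment above alludes to; increment_dict is kept because the code below uses plain dicts.
from collections import Counter
example_counts = Counter()  # 'example_counts' is an illustrative name
example_counts['a'] += 3    # same effect as increment_dict(example_counts, 'a', 3)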
redo_pickle = False  # change to True to overwrite the existing pickle if the underlying data changes
if not os.path.isfile('gadsby_analysis_letfreqs.pickle') or redo_pickle:
    start = time.time()
    # tally letter counts, weighted by each word's frequency in Gadsby
    letfreq_dict = {}
    for ltr in alphabet:
        letfreq_dict[ltr] = 0
    for i in range(len(words)):
        wd = words.word.iloc[i]
        for ltr in wd:
            letfreq_dict[ltr] += words.freq.iloc[i]
    # assemble the counts into a DataFrame, one row per letter
    letfreqs_df = pd.DataFrame()
    for letter in alphabet:
        temp = pd.DataFrame({'letter': [letter],
                             'freq': [letfreq_dict[letter]]})
        letfreqs_df = letfreqs_df.append(temp, ignore_index=True)
    # 'normal' is each letter's percentage of all letters in the corpus
    letfreqsum = letfreqs_df.freq.sum()
    letfreqs_df['normal'] = 100.0 * letfreqs_df.freq / letfreqsum
    letfreqs_df.set_index('letter', drop=False, inplace=True)
    print "%d seconds elapsed." % (time.time() - start)
    letfreqs_df.to_pickle('gadsby_analysis_letfreqs.pickle')
else:
    print 'Reading from pickle.'
    letfreqs_df = pd.read_pickle('gadsby_analysis_letfreqs.pickle')
df = letfreqs_df.copy()
# rename by column name rather than by position so the labels cannot be mismatched
df = df.rename(columns={'freq': 'gadsby_freq', 'normal': 'gadsby_pct'})
df = df[['letter', 'gadsby_freq', 'gadsby_pct']]
print df.head()
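# A hedged alternative sketch (not in the original): the same weighted letter totals can be
# computed without the nested Python loop, assuming the same 'word' and 'freq' columns.
vectorized_counts = {ltr: int((words.word.str.count(ltr) * words.freq).sum())
                     for ltr in alphabet}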
redo_pickle = False  # change to True to overwrite the existing pickle if the underlying data changes
brown = pd.read_pickle('brown_non_e.pickle')
if not os.path.isfile('brown_non_e_letfreqs.pickle') or redo_pickle:
    start = time.time()
    # tally letter counts, weighted by each word's frequency in the e-less Brown corpus
    letfreq_dict = {}
    for ltr in alphabet:
        letfreq_dict[ltr] = 0
    for i in range(len(brown)):
        wd = brown.word.iloc[i]
        for ltr in wd:
            letfreq_dict[ltr] += brown.freq.iloc[i]
    # assemble the counts into a DataFrame, one row per letter
    letfreqs_df = pd.DataFrame()
    for letter in alphabet:
        temp = pd.DataFrame({'letter': [letter],
                             'freq': [letfreq_dict[letter]]})
        letfreqs_df = letfreqs_df.append(temp, ignore_index=True)
    # 'normal' is each letter's percentage of all letters in the corpus
    letfreqsum = letfreqs_df.freq.sum()
    letfreqs_df['normal'] = 100.0 * letfreqs_df.freq / letfreqsum
    letfreqs_df.set_index('letter', drop=False, inplace=True)
    print "%d seconds elapsed." % (time.time() - start)
    letfreqs_df.to_pickle('brown_non_e_letfreqs.pickle')
else:
    print 'Reading from pickle.'
    letfreqs_df = pd.read_pickle('brown_non_e_letfreqs.pickle')
brown = letfreqs_df.copy()
brown = brown.rename(columns={'freq': 'brown_freq', 'normal': 'brown_pct'})
brown = brown[['letter', 'brown_freq', 'brown_pct']]
print brown.head()
# merge the Gadsby and Brown letter tables on the 'letter' column
df = pd.merge(df, brown, how='inner', on='letter')
# Brown counts rescaled so both corpora have the same total number of letters
df['brown_normalized'] = (df.brown_freq * df.gadsby_freq.sum() / df.brown_freq.sum()).astype(int)
df['diff'] = df.gadsby_pct - df.brown_pct
df['ratio'] = df.gadsby_pct / df.brown_pct
from numpy import log

def loglike(n1, t1, n2, t2):
    """Calculates the Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If the result is positive, the item is more
    likely to occur in corpus 1, otherwise in corpus 2."""
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)  # expected frequencies
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    # by convention a term is 0 when its count is 0, which avoids NaN for absent letters
    term1 = n1 * log(n1 / e1) if n1 > 0 else 0.0
    term2 = n2 * log(n2 / e2) if n2 > 0 else 0.0
    LL = 2 * (term1 + term2)
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL
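# A quick sanity check on made-up counts (hypothetical numbers, not from either corpus):
# a letter seen 50 times in 1,000 letters vs. 20 times in 2,000 letters is overrepresented
# in the first corpus, so the log likelihood should come out positive (roughly 42).
print loglike(50, 1000, 20, 2000)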
t1 = df.gadsby_freq.sum()
t2 = df.brown_freq.sum()
# Dunning log likelihood ('keyness') for each letter
df['log_likelihood'] = [loglike(n1, t1, n2, t2)
                        for n1, n2 in zip(df.gadsby_freq, df.brown_freq)]
df
df.to_csv('gadsby_letter_analysis.csv')
print df.sort('log_likelihood', ascending=False).head(10)  # letters most overrepresented in Gadsby
print df.sort('log_likelihood', ascending=True).head(20)   # letters most underrepresented in Gadsby
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn  # imported only for its default plot styling
x = list(df.brown_pct)
y = list(df.gadsby_pct)
s = list(df.gadsby_freq / 10)  # marker sizes, proportional to each letter's count in Gadsby
fig, ax = plt.subplots(figsize=(12,9))
#plt.style.use('bmh')
ax.scatter(x, y, s=s, alpha=0.3, marker='o', color='steelblue')
plt.plot([0, 13], [0, 13], linestyle='-', marker='None', color='r', markersize=0)  # y = x reference line
ax.set_xlim(0, 13)
ax.set_ylim(0, 13)
ax.set_ylabel("Percent frequency in Gadsby", fontsize=14)
ax.set_xlabel("Percent frequency in Brown without 'e'-containing words", fontsize=14)
ax.set_title("Comparison of letter frequencies in Gadsby, a novel without\n"
             "the letter 'e', and the Brown corpus with 'e'-containing words removed", fontsize=18)
# label each point with its letter (skipping 'e')
txt = []
for i, ltr in enumerate(alphabet):
    if ltr != 'e':
        txt.append(ax.text(x[i], y[i], ltr, ha="center", va="center", rotation=0,
                           size=s[i] / 40))
plt.show()
import plotly.plotly as py
from plotly.graph_objs import *
# sign in with your own plotly username and API key; here they are read from local text files
py.sign_in(open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_emanresu.txt', 'r').read().strip(),
           open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_yekipa.txt', 'r').read().strip())
data = Data([
    Bar(
        x=list(df.letter),
        y=list(df.log_likelihood)
    )
])
layout = Layout(
    title="Comparison of letter frequencies in Gadsby, a novel without the letter 'e', "
          "and the Brown corpus with 'e'-containing words removed",
    yaxis=YAxis(
        title='Log Likelihood keyness'
    )
)
#plot_url = py.plot(data, layout=layout, filename='gadsby_letter_keyness')
py.iplot(data, layout=layout, filename='gadsby_letter_keyness')