import pandas as pd
import os
import time

# Gadsby word/frequency table, pre-built by an earlier analysis step.
words = pd.read_pickle('gadsby_analysis.pickle')
alphabet = 'abcdefghijklmnopqrstuvwxyz'


def increment_dict(d, key, increment=1):
    """Add *increment* to d[key], creating the key if absent; return d.

    Kept for backward compatibility with earlier cells — from before I
    found collections.Counter, which does the same job.
    """
    d[key] = d.get(key, 0) + increment
    return d


def letter_freq_table(word_df):
    """Build a per-letter frequency table from *word_df*.

    word_df must have a 'word' column (lowercase words) and a 'freq'
    column (corpus count for that word); each letter of each word is
    weighted by the word's frequency.

    Returns a DataFrame indexed by letter with columns, in this order:
      freq   - weighted letter count
      letter - the letter (kept as a column as well as the index)
      normal - freq as a percentage of the total letter count
    The column order matters: downstream code renames columns positionally.
    """
    counts = dict.fromkeys(alphabet, 0)
    # zip over the columns instead of .iloc-ing each row: identical result,
    # but avoids one positional lookup per row.
    for wd, f in zip(word_df.word, word_df.freq):
        for ltr in wd:
            counts[ltr] += f
    table = pd.DataFrame({
        'freq': [counts[ltr] for ltr in alphabet],
        'letter': list(alphabet),
    })
    # Vectorized percentage; replaces the chained .iloc assignment loop,
    # whose writes pandas does not guarantee to reach the underlying frame.
    table['normal'] = 100.0 * table.freq / table.freq.sum()
    table.set_index('letter', drop=False, inplace=True)
    return table


redo_pickle = False  # change to True to overwrite existing pickle if the underlying data changes
if not os.path.isfile('gadsby_analysis_letfreqs.pickle') or redo_pickle:
    start = time.time()
    letfreqs_df = letter_freq_table(words)
    print("%d seconds elapsed." % (time.time() - start))
    letfreqs_df.to_pickle('gadsby_analysis_letfreqs.pickle')
else:
    print('Reading from pickle.')
letfreqs_df = pd.read_pickle('gadsby_analysis_letfreqs.pickle')

# Gadsby table: rename by NAME rather than assigning df.columns positionally —
# the positional form silently mislabels columns if their order ever changes
# (dict-built frames were alphabetically ordered only in old pandas).
df = letfreqs_df.copy()
df = df.rename(columns={'freq': 'gadsby_freq', 'normal': 'gadsby_pct'})
df = df[['letter', 'gadsby_freq', 'gadsby_pct']]
print(df.head())

redo_pickle = False  # change to True to overwrite existing pickle if the underlying data changes
brown = pd.read_pickle('brown_non_e.pickle')


def brown_letter_freq_table(word_df):
    """Per-letter weighted frequency table, same layout as the Gadsby one:
    columns [freq, letter, normal], indexed by letter."""
    counts = dict.fromkeys(alphabet, 0)
    # BUG FIX: the original looped over range(len(words)) — the *Gadsby*
    # word list — while indexing into the Brown frame, so the tally covered
    # the wrong number of rows. Iterate the frame actually being tallied.
    for wd, f in zip(word_df.word, word_df.freq):
        for ltr in wd:
            counts[ltr] += f
    table = pd.DataFrame({
        'freq': [counts[ltr] for ltr in alphabet],
        'letter': list(alphabet),
    })
    table['normal'] = 100.0 * table.freq / table.freq.sum()
    table.set_index('letter', drop=False, inplace=True)
    return table


# BUG FIX: the cache check tested the misspelled name
# 'brown_non_e_leftfreq.pickle' while results are written to
# 'brown_non_e_letfreqs.pickle', so the cached table was rebuilt every run.
if not os.path.isfile('brown_non_e_letfreqs.pickle') or redo_pickle:
    start = time.time()
    letfreqs_df = brown_letter_freq_table(brown)
    print("%d seconds elapsed." % (time.time() - start))
    letfreqs_df.to_pickle('brown_non_e_letfreqs.pickle')
else:
    print('Reading from pickle.')

letfreqs_df = pd.read_pickle('brown_non_e_letfreqs.pickle')
brown = letfreqs_df.copy()
brown = brown.rename(columns={'freq': 'brown_freq', 'normal': 'brown_pct'})
brown = brown[['letter', 'brown_freq', 'brown_pct']]
print(brown.head())

# 'on' and 'left_index' are mutually exclusive in pandas.merge; join on the
# shared 'letter' column only.
df = pd.merge(df, brown, how='inner', on='letter')
# Brown counts rescaled so the two corpora have the same total letter count.
df['brown_normalized'] = (df.brown_freq * df.gadsby_freq.sum() / df.brown_freq.sum()).astype(int)
df['diff'] = df.gadsby_pct - df.brown_pct
df['ratio'] = df.gadsby_pct / df.brown_pct

from numpy import log


def loglike(n1, t1, n2, t2):
    """Dunning log-likelihood ('keyness') of an observation of frequency n1
    in a corpus of size t1, compared to a frequency n2 in a corpus of size t2.

    If the result is positive, the item is relatively more frequent in
    corpus 1, otherwise in corpus 2.

    BUG FIX: a zero count now contributes 0 (lim x->0 of x*log(x) = 0);
    the original produced NaN for the letter 'e', whose Gadsby count is 0 —
    the one row this whole analysis is about.
    """
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)  # expected values
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    term1 = n1 * log(n1 / e1) if n1 > 0 else 0.0
    term2 = n2 * log(n2 / e2) if n2 > 0 else 0.0
    ll = 2.0 * (term1 + term2)
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        ll = -ll
    return ll


t1 = df.gadsby_freq.sum()
t2 = df.brown_freq.sum()
# Build the column in one pass instead of chained .iloc writes, which pandas
# does not guarantee to reach the underlying frame.
df['log_likelihood'] = [loglike(n1, t1, n2, t2)
                        for n1, n2 in zip(df.gadsby_freq, df.brown_freq)]

print(df)
df.to_csv('gadsby_letter_analysis.csv')
# DataFrame.sort was removed in pandas 0.20; sort_values is the replacement.
print(df.sort_values('log_likelihood', ascending=False).head(10))
print(df.sort_values('log_likelihood', ascending=True).head(20))

import matplotlib.pyplot as plt
# %matplotlib inline   # IPython-only magic from the notebook; a syntax error
#                      # in a plain .py file, and plt.show() is called anyway.
import seaborn  # imported for its side effect of restyling matplotlib

x = list(df.brown_pct)
y = list(df.gadsby_pct)
s = list(df.gadsby_freq / 10)  # marker area scaled by Gadsby letter count

fig, ax = plt.subplots(figsize=(12, 9))
# plt.style.use('bmh')
# Passing both c= and color= is an error in current matplotlib; keep one.
ax.scatter(x, y, s=s, alpha=0.3, marker='o', color='steelblue')
# y = x reference line: points above it are over-represented in Gadsby.
plt.plot([0, 13], [0, 13], linestyle='-', marker='None', color='r', markersize=0)
ax.set_xlim(0, 13)
ax.set_ylim(0, 13)
ax.set_ylabel("Percent frequency in Gadsby", fontsize=14)
ax.set_xlabel("Percent frequency in Brown without 'e'-containing words", fontsize=14)
ax.set_title("Comparison of letter frequencies in Gadsby, a novel without\n"
             "the letter 'e', and the Brown corpus with 'e'-containing words removed",
             fontsize=18)
txt = []
for i, ltr in enumerate(alphabet):
    # 'e' has no point worth labeling (zero Gadsby frequency).
    if ltr != 'e':
        txt.append(ax.text(x[i], y[i], ltr, ha="center", va="center",
                           rotation=0, size=s[i] / 40))
plt.show()

# NOTE(review): plotly.plotly was split into the 'chart_studio' package;
# with plotly >= 4 these lines need `import chart_studio.plotly as py` —
# confirm which plotly version this environment pins.
import plotly.plotly as py
from plotly.graph_objs import *

# fill this in with your username and api key
# (read from local files so credentials stay out of the source)
py.sign_in(
    open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_emanresu.txt', 'r').read(),
    open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_yekipa.txt', 'r').read())

data = Data([
    Bar(
        x=list(df.letter),
        y=list(df.log_likelihood),
    )])
layout = Layout(
    title="Comparison of letter frequencies in Gadsby, a novel without "
          "the letter 'e', and the Brown corpus with 'e'-containing words removed",
    yaxis=YAxis(
        title='Log Likelihood keyness',
    ),
)
# plot_url = py.plot(data, layout=layout, filename='gadsby_letter_keyness')
py.iplot(data, layout=layout, filename='gadsby_letter_keyness')