import re
import os
import nltk
raw = open('gadsby_full_lower.txt', 'r').read()  # lower-cased full text of the novel
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
# summary statistics
print ' '
print "Characters: {}".format(len(raw))
print "Tokens: {}".format(len(tokens))
print "Unique tokens: {}".format(len(set(tokens)))
print "Lexical diversity: {:.3f}".format(len(set(tokens))*1.0/len(tokens))
# create frequency distribution dataframe
fdist = nltk.FreqDist(text)
import pandas as pd
df = pd.DataFrame({'word': fdist.keys(), 'freq': fdist.values()})
df = df[~df.word.str.contains('[^A-Za-z]')]  # drop tokens with non-alphabetic characters
print ' '
print df.head()
Characters: 277418
Tokens: 61759
Unique tokens: 5233
Lexical diversity: 0.085

   freq  word
1  2437     a
2  1669   and
4  1225  that
6  1162    of
8   934    in
#sanity check
print df[df.word.str.contains('e')]
Empty DataFrame
Columns: [freq, word]
Index: []
if not os.path.isfile('brown_df.pickle'):
    print "Processing Brown corpus from NLTK"
    from nltk.corpus import brown
    categories = []
    words = []
    frequencies = []
    for category in brown.categories():
        wordlist = brown.words(categories=category)
        freqs = nltk.FreqDist([w.lower() for w in wordlist])
        for key in freqs.keys():
            categories.append(category)
            words.append(key)
            frequencies.append(freqs[key])
    brown_df = pd.DataFrame({'word': words, 'freq': frequencies, 'category': categories})
    brown_df['nonalpha'] = brown_df.word.str.contains('[^A-Za-z]')  # flag non-alphabetic tokens
    brown_df.to_pickle('brown_df.pickle')
else:
    print "Reading brown_df.pickle"
    brown_df = pd.read_pickle('brown_df.pickle')
Reading brown_df.pickle
if not os.path.isfile('brown_non_e.pickle'):
    print 'Creating dataframe.'
    total_freq = 0
    weighted_length = 0
    # keep alphabetic Brown words containing no 'e', summed across categories
    brown_words = brown_df[brown_df.nonalpha == False]
    brown_words = brown_words[~brown_words.word.str.contains('e')]
    brown_words = brown_words.groupby('word').sum().reset_index()
    brown_words['length'] = 0
    brown_words.sort('freq', ascending=False, inplace=True)
    total_freq = brown_words.freq.sum()
    median_counter = total_freq / 2
    median_found = False
    for idx, row in brown_words.iterrows():
        curr_len = len(brown_words.word[idx])
        brown_words.loc[idx, 'length'] = curr_len
        weighted_length += brown_words.freq[idx] * curr_len
        median_counter -= curr_len * brown_words.freq[idx]
        if median_counter < 0 and not median_found:
            wt_median_len = curr_len  # frequency-weighted median word length
            median_found = True
    brown_words = brown_words[['word', 'freq', 'length']]
    brown_words.to_pickle('brown_non_e.pickle')
else:
    print 'Reading pickle.'
    brown_words = pd.read_pickle('brown_non_e.pickle')
Reading pickle.
def loglike(n1, t1, n2, t2):
    """Calculates the Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If the result is positive, the word is more
    likely to occur in corpus 1; if negative, in corpus 2."""
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)  # expected frequencies
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    # if n1 or n2 is 0, a 0 * log(0) term makes this nan, which is
    # why words absent from the Brown vocabulary get NaN scores below
    LL = 2 * (n1 * log(n1 / e1) + n2 * log(n2 / e2))
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL
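As a quick sanity check with made-up counts (not drawn from either corpus): a word that is proportionally more common in corpus 1 should score positive, and the mirrored call should score negative.

from numpy import log

print loglike(10, 1000, 5, 10000)   # ~29.8: overrepresented in corpus 1
print loglike(5, 10000, 10, 1000)   # ~-29.8: overrepresented in corpus 2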
from numpy import log
t1 = df.freq.sum()
t2 = brown_words.freq.sum()
df['log_likelihood'] = 0.0
for i in range(len(df)):
    word = df.word.iloc[i]
    n1 = df.freq.iloc[i]
    fnd = brown_words[brown_words.word == word]
    if len(fnd) > 0:
        n2 = fnd.freq.iloc[0]
    else:
        n2 = 0  # absent from the Brown non-e vocabulary; loglike returns NaN
    df.loc[df.index[i], 'log_likelihood'] = loglike(n1, t1, n2, t2)
print df.sort('log_likelihood', ascending=False).head(20)
     freq        word  log_likelihood
54    200       hills      749.660964
32    297         big      630.151174
79    122       nancy      563.296979
71    138        lady      411.288896
68    140        bill      326.549776
108    80       folks      306.118227
95     97       honor      272.219017
59    187       young      271.193850
85    108       happy      267.283070
112    76       sarah      265.372978
44    237         old      253.839785
136    60    simpkins      253.691283
19    442          so      234.694974
47    229        know      226.664917
152    52  councilman      224.677398
176    46        nina      218.448458
87    106       girls      211.621005
126    66        tiny      177.231911
103    89          oh      175.102218
131    63       grand      168.765580
print df.sort('log_likelihood', ascending=True).head(20)
      freq     word  log_likelihood
6     1162       of    -1776.113782
9      921       to    -1112.090217
8      934       in     -616.702852
2     1669      and     -383.521998
78     124       by     -362.344996
18     471       is     -254.363544
22     403     with     -113.644883
25     383      his     -112.694084
96      96      him     -104.650941
104     86      has     -103.254890
13     585      for      -97.381118
35     284       at      -96.568517
24     384       on      -94.773170
228     34      may      -92.570425
30     301      had      -64.783953
4378     1  program      -60.080679
2736     1    among      -55.998265
247     30     must      -54.226455
41     259     from      -52.902273
33     291      not      -42.458054
Note: the next cell takes a long time to run.
import time
import numpy as np
from nltk.corpus import brown
br_trig = nltk.trigrams(brown.words())
if 2 > 1:  # force a rebuild; swap in: not os.path.isfile('gadsby_analysis.pickle')
    start = time.time()
    print "Building dataframe."
    freq_g = df.freq.sum()
    freq_b = brown_words.freq.sum()
    # Brown frequency rescaled to the size of the Gadsby corpus
    df['brown_freq_normalized'] = 0.0
    for i in range(len(df)):
        word = df.word.iloc[i]
        try:
            brown_freq = brown_words[brown_words.word == word].freq.iloc[0]
        except IndexError:
            brown_freq = 0
        try:
            df.loc[df.index[i], 'brown_freq_normalized'] = brown_freq * 1.0 * freq_g / freq_b
        except ZeroDivisionError:
            df.loc[df.index[i], 'brown_freq_normalized'] = np.nan
    df['diff_absolute'] = df.freq - df.brown_freq_normalized
    df['diff_relative'] = df.freq * 1.0 / df.brown_freq_normalized
    # collect all Brown trigrams whose three words are purely alphabetic
    wprev = []
    word = []
    wnext = []
    for item in br_trig:
        a = item[0].lower()
        b = item[1].lower()
        c = item[2].lower()
        if (re.search('[a-z]', a) and
                re.search('[a-z]', b) and
                re.search('[a-z]', c)):
            wprev.append(a)
            word.append(b)
            wnext.append(c)
    tri = pd.DataFrame({'wprev': wprev,
                        'word': word,
                        'wnext': wnext})

    def calc_pcte(row):
        # percent of a word's Brown contexts where a neighbor contains 'e'
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempe = dftemp[dftemp.wprev.str.contains('e') | dftemp.wnext.str.contains('e')]
        try:
            return len(dftempe) * 100.0 / total
        except ZeroDivisionError:
            return np.nan

    def calc_pctthe(row):
        # percent of a word's Brown contexts where a neighbor is 'the'
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempthe = dftemp[(dftemp.wprev == 'the') | (dftemp.wnext == 'the')]
        try:
            return len(dftempthe) * 100.0 / total
        except ZeroDivisionError:
            return np.nan

    df['pct_e'] = df.apply(calc_pcte, axis=1)
    df['pct_the'] = df.apply(calc_pctthe, axis=1)
    df.to_pickle('gadsby_analysis.pickle')
    df.to_csv('gadsby_analysis.csv')
    print "Done. {} minutes elapsed.".format(round((time.time() - start) / 60, 1))
else:
    print "Reading pickle."
    df = pd.read_pickle('gadsby_analysis.pickle')
Building dataframe.
Done. 17.2 minutes elapsed.
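Most of that time goes to calc_pcte and calc_pctthe rescanning the trigram DataFrame once per word. A single pass over the Brown trigrams with collections.Counter would compute the same pct_e numbers in seconds; a minimal sketch (pct_e_fast is a hypothetical name, not used elsewhere in this notebook):

import re
from collections import Counter
import nltk
from nltk.corpus import brown

totals = Counter()   # trigram contexts seen per middle word
with_e = Counter()   # contexts where either neighbor contains 'e'
for a, b, c in nltk.trigrams([w.lower() for w in brown.words()]):
    if re.search('[a-z]', a) and re.search('[a-z]', b) and re.search('[a-z]', c):
        totals[b] += 1
        if 'e' in a or 'e' in c:
            with_e[b] += 1

pct_e_fast = {w: 100.0 * with_e[w] / totals[w] for w in totals}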
Graph a 'volcano plot': each word's log-likelihood of overrepresentation in Gadsby against how often its Brown-corpus neighbors contain an 'e'.
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import seaborn
plt.figure(figsize=(9,6))
dftemp = df[['word', 'log_likelihood', 'pct_e', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood > -50) & (dftemp.log_likelihood < 50)]
dftemp = dftemp[(dftemp.freq > 10)]
plt.scatter(dftemp.log_likelihood, dftemp.pct_e, s=dftemp.freq*1.5, marker='o',
color='blue', alpha=0.15, label='data')
plt.title(' ')
#plt.xlim(-10,10)
plt.xlabel('Log-Likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighbouring 'e' in Brown corpus")
plt.show()
df
 | freq | word | log_likelihood | brown_freq_normalized | diff_absolute | diff_relative |
---|---|---|---|---|---|---|
1 | 2437 | a | 39.837277 | 2125 | 312 | 1.146824 |
2 | 1669 | and | -383.521998 | 2643 | -974 | 0.631479 |
4 | 1225 | that | 55.933699 | 970 | 255 | 1.262887 |
6 | 1162 | of | -1776.113782 | 3336 | -2174 | 0.348321 |
8 | 934 | in | -616.702852 | 1954 | -1020 | 0.477994 |
9 | 921 | to | -1112.090217 | 2396 | -1475 | 0.384391 |
10 | 710 | was | -39.608135 | 899 | -189 | 0.789766 |
11 | 694 | as | 1.177540 | 664 | 30 | 1.045181 |
12 | 667 | it | -22.394679 | 802 | -135 | 0.831671 |
13 | 585 | for | -97.381118 | 869 | -284 | 0.673188 |
14 | 566 | you | 165.609370 | 301 | 265 | 1.880399 |
15 | 518 | but | 28.177335 | 401 | 117 | 1.291771 |
17 | 471 | i | -0.008889 | 473 | -2 | 0.995772 |
18 | 471 | is | -254.363544 | 926 | -455 | 0.508639 |
19 | 442 | so | 234.694974 | 181 | 261 | 2.441989 |
21 | 421 | this | -5.137938 | 471 | -50 | 0.893843 |
22 | 403 | with | -113.644883 | 667 | -264 | 0.604198 |
23 | 393 | all | 40.463187 | 274 | 119 | 1.434307 |
24 | 384 | on | -94.773170 | 617 | -233 | 0.622366 |
25 | 383 | his | -112.694084 | 640 | -257 | 0.598437 |
26 | 364 | gadsby | NaN | 0 | 364 | inf |
28 | 327 | up | 96.888501 | 173 | 154 | 1.890173 |
30 | 301 | had | -64.783953 | 470 | -169 | 0.640426 |
31 | 298 | or | -19.827346 | 385 | -87 | 0.774026 |
32 | 297 | big | 630.151174 | 32 | 265 | 9.281250 |
33 | 291 | not | -42.458054 | 422 | -131 | 0.689573 |
34 | 289 | an | -8.174090 | 342 | -53 | 0.845029 |
35 | 284 | at | -96.568517 | 492 | -208 | 0.577236 |
36 | 280 | out | 31.832340 | 192 | 88 | 1.458333 |
37 | 270 | our | 134.651348 | 114 | 156 | 2.368421 |
... | ... | ... | ... | ... | ... | ... |
5122 | 1 | wistaria | NaN | 0 | 1 | inf |
5124 | 1 | wit | -0.421448 | 1 | 0 | 1.000000 |
5126 | 1 | withhold | 1.487055 | 0 | 1 | inf |
5128 | 1 | wobbly | 1.487055 | 0 | 1 | inf |
5129 | 1 | woild | NaN | 0 | 1 | inf |
5130 | 1 | wolf | 0.265836 | 0 | 1 | inf |
5133 | 1 | wondrous | 2.358220 | 0 | 1 | inf |
5134 | 1 | woo | 0.982790 | 0 | 1 | inf |
5137 | 1 | woodlands | NaN | 0 | 1 | inf |
5138 | 1 | woodwork | 0.425399 | 0 | 1 | inf |
5141 | 1 | wording | 0.652778 | 0 | 1 | inf |
5143 | 1 | worka | NaN | 0 | 1 | inf |
5145 | 1 | workmanship | 0.265836 | 0 | 1 | inf |
5146 | 1 | worldly | 0.031798 | 0 | 1 | inf |
5147 | 1 | worm | 0.652778 | 0 | 1 | inf |
5150 | 1 | worthington | NaN | 0 | 1 | inf |
5153 | 1 | wounds | 0.079149 | 0 | 1 | inf |
5154 | 1 | wracking | 2.358220 | 0 | 1 | inf |
5155 | 1 | wrapping | 0.265836 | 0 | 1 | inf |
5157 | 1 | wrath | 0.031798 | 0 | 1 | inf |
5158 | 1 | wriggly | NaN | 0 | 1 | inf |
5161 | 1 | wrought | 0.982790 | 0 | 1 | inf |
5162 | 1 | yak | NaN | 0 | 1 | inf |
5163 | 1 | yaks | 2.358220 | 0 | 1 | inf |
5164 | 1 | yank | 0.154473 | 0 | 1 | inf |
5165 | 1 | yanks | 0.652778 | 0 | 1 | inf |
5172 | 1 | yucatan | 2.358220 | 0 | 1 | inf |
5173 | 1 | zigzagging | 0.652778 | 0 | 1 | inf |
5177 | 1 | zoological | NaN | 0 | 1 | inf |
5178 | 1 | zooming | 2.358220 | 0 | 1 | inf |
3934 rows × 6 columns
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
plt.figure(figsize=(8,6))
dftemp = df[['word', 'log_likelihood', 'pct_the', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood < -100) | (dftemp.log_likelihood > 100)]
plt.scatter(dftemp.log_likelihood, dftemp.pct_the, s=dftemp.freq, marker='o',
color='blue', alpha=0.4, label='data')
plt.title(' ')
plt.xlabel('Log-Likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighboring 'the' in Brown corpus")
<matplotlib.text.Text at 0x1fc0a748>