by David Taylor, www.prooffreader.com, prooffreader@gmail.com
A collection of tools to create and analyze lists of words using Python with pandas and matplotlib.
Creates complete and summary dataframes.
Three corpus databases/word lists are implemented:
1. COHA (not public domain; single-word frequencies require a license to use)
2. Brown corpus (public domain, part of the Python Natural Language Toolkit, NLTK)
3. Europarl: A Parallel Corpus for Statistical Machine Translation, Philipp Koehn, MT Summit 2005 (http://statmt.org/europarl/)
The scripts should be easily adaptable to other word lists.
Uses the Corpus of Historical American English (COHA) 1-gram corpus from Brigham Young University, available for a fee or with an academic license from http://corpus.byu.edu/coha/
Only small snippets of up to five records at a time are shown as examples in this repo.
import pandas as pd
import os
import pickle
import json
init_path = 'data_initial/'
data_path = 'data_user_pickle_csv/'
coha_filename = 'coha_1_pos_n_cs_n.txt' # Without part of speech tagging or upper-/lowercase differentiation
Create "coha" initial dataframe
To do: add a column for the decade range, e.g. 1910-1940 if a word was used only within those decades; a sketch of one approach follows.
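A minimal sketch of that to-do, assuming the munged coha dataframe created below (one row per word per decade, integer decade column); the helper name is hypothetical:

def decade_range(df):
    # first and last decade in which each word appears, e.g. '1910-1940'
    grouped = df.groupby('word')['decade']
    return grouped.min().astype(str) + '-' + grouped.max().astype(str)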
redo_coha_initial = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "coha_1.pickle"
if not os.path.isfile(pickle_full_path) or redo_coha_initial == True: # if the pickle already exists, coha is read rather than re-munged
print "Processing " + coha_filename
coha = pd.read_table(init_path + coha_filename)
print "\nTail of initial dataframe:"
print coha.tail()
coha.columns = ['freq', 'word', 'decade']
coha = coha[['word', 'freq', 'decade']]
coha = coha.fillna('nan')
coha['nonalpha'] = False
    coha.loc[coha.word.str.contains('[^A-Za-z]'), 'nonalpha'] = True
coha['length'] = 0
decades = [1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890, 1900,
1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000]
    for idx, row in coha.iterrows(): ## this step takes 2-3 minutes on a medium-quality desktop computer ca 2013
        coha.loc[idx, 'decade'] = decades[coha['decade'][idx] - 1]
        coha.loc[idx, 'length'] = len(str(coha.word[idx]))
def coha_pct(group):
freq = group.freq.astype(float)
group['pct'] = (freq / freq.sum() * 100)
return group
coha = coha.groupby(['decade']).apply(coha_pct)
coha.to_pickle(pickle_full_path)
else:
print "Reading " + pickle_full_path
coha = pd.read_pickle(pickle_full_path)
print "\nTail of dataframe:"
print coha.tail()
# note: 'nonalpha' is True if the word contains any characters other than a-z or A-Z
Processing coha_1_pos_n_cs_n.txt

Tail of initial dataframe:
         freq    word-cs  decade
2539723     3       zzzz      20
2539724     1     zzzzzz      18
2539725     1     zzzzzz      20
2539726     3   zzzzzzzz      20
2539727     3  zzzzzzzzz      20

Tail of dataframe:
              word  freq  decade nonalpha  length       pct
2539723       zzzz     3    2000    False       4  0.000011
2539724     zzzzzz     1    1980    False       6  0.000004
2539725     zzzzzz     1    2000    False       6  0.000004
2539726   zzzzzzzz     3    2000    False       8  0.000011
2539727  zzzzzzzzz     3    2000    False       9  0.000011
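For reference, the row-by-row munge above can be vectorized, avoiding the 2-3 minute iterrows pass; a minimal sketch that assumes the same raw columns (decade codes 1-20) and replaces both the loop and the groupby/apply:

decades = range(1810, 2010, 10)  # 1810, 1820, ..., 2000
coha['decade'] = coha['decade'].map(lambda d: decades[d - 1])
coha['length'] = coha['word'].astype(str).str.len()
# per-decade percentage without a custom apply function
coha['pct'] = coha.groupby('decade')['freq'].transform(
    lambda s: s.astype(float) / s.sum() * 100)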
redo_coha_words = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "coha_words.pickle"
mean_median_full_path = data_path + "coha_words_mean_median.pickle"
if not os.path.isfile(pickle_full_path) or redo_coha_words == True:
print 'Creating dataframe.'
total_freq = 0
weighted_length = 0
coha_words = coha[coha.nonalpha == 0]
coha_words = pd.DataFrame(coha_words.groupby(['word']).sum()).reset_index(drop=False)
    # summed columns after groupby: freq; decade (erase1); nonalpha (erase2);
    # length summed across decades (decadesxlen = length x number of decades); pct (erase3)
    coha_words.columns = ['word', 'freq', 'erase1', 'erase2', 'decadesxlen', 'erase3']
coha_words['length'] = 0
coha_words['decades'] = 0
coha_words.sort('freq', ascending = False, inplace=True)
total_freq = coha_words.freq.sum()
median_counter = total_freq / 2
median_found = False
    for idx, row in coha_words.iterrows():
        curr_len = len(coha_words.word[idx])
        coha_words.loc[idx, 'length'] = curr_len
        coha_words.loc[idx, 'decades'] = coha_words.decadesxlen[idx] / curr_len
        weighted_length += coha_words.freq[idx] * curr_len
        median_counter -= curr_len * coha_words.freq[idx]
        if median_counter < 0 and median_found == False:
            wt_median_len = curr_len
            median_found = True
coha_words = coha_words[['word', 'freq', 'length', 'decades']]
coha_words.to_pickle(pickle_full_path)
wt_mean_len = weighted_length * 1.0 / total_freq
    with open(mean_median_full_path, "wb") as f:
        pickle.dump((wt_mean_len, wt_median_len), f)
else:
print 'Reading pickle.'
coha_words = pd.read_pickle(pickle_full_path)
    with open(mean_median_full_path, "rb") as f:
        wt_mean_len, wt_median_len = pickle.load(f)
print "Weighted mean word length: " + str(wt_mean_len)
print "Weighted median word length: " + str(wt_median_len)
print "Tail of coha_words dataframe:"
print coha_words.tail()
Reading pickle.
Weighted mean word length: 4.45220167708
Weighted median word length: 4
Tail of coha_words dataframe:
            word  freq  length  decades
336193      zieg     1       4        1
87532   eckerson     1       8        1
125546   haldore     1       7        1
303578    tranio     1       6        1
40616       burz     1       4        1
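For comparison, a conventional frequency-weighted median of word lengths sorts by length rather than by frequency and walks the cumulative frequency to the halfway point; a minimal sketch against coha_words (helper name hypothetical):

def weighted_median_length(df):
    # sort by length, then accumulate frequencies to the halfway point
    by_len = df.sort('length')[['length', 'freq']]
    half = by_len.freq.sum() / 2.0
    cumulative = 0
    for idx, row in by_len.iterrows():
        cumulative += row['freq']
        if cumulative >= half:
            return row['length']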
# create a list of coha_words that also appear in the Moby crossword dictionary
xword = set()
with open(init_path + 'word_list_moby_crossword.flat.txt', 'r') as f:
    for line in f:
        xword.add(line.strip())
cohawords = set(coha_words.word)
coha_moby = xword.intersection(cohawords)
print "{0} words out of intersection of {1} and {2}".format(len(coha_moby), len(cohawords), len(xword))
with open(data_path + 'coha_and_xword.json', 'w') as f:
    f.write(json.dumps(list(coha_moby)))
78280 words in the intersection of 337085 COHA words and 113809 crossword words
import pandas as pd
import os
import pickle
import nltk
from nltk.corpus import brown
#nltk.download() # uncomment to start downloader window to fetch brown or other corpora
data_path = 'data'
redo_brown_initial = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "/brown_df.pickle"
if not os.path.isfile(pickle_full_path) or redo_brown_initial == True:
print "Processing Brown corpus from NLTK"
categories = []
words = []
frequencies = []
for category in brown.categories():
wordlist = brown.words(categories=category)
freqs = nltk.FreqDist([w.lower() for w in wordlist])
for key in freqs.keys():
categories.append(category)
words.append(key)
frequencies.append(freqs[key])
brown_df = pd.DataFrame({'word':words, 'freq':frequencies, 'category':categories})
brown_df['nonalpha'] = False
    brown_df.loc[brown_df.word.str.contains('[^A-Za-z]'), 'nonalpha'] = True
brown_df.to_pickle(pickle_full_path)
else:
print "Reading " + pickle_full_path
brown_df = pd.read_pickle(pickle_full_path)
print "\nTail of dataframe:"
print brown_df.tail()
Processing Brown corpus from NLTK

Tail of dataframe:
               category  freq        word nonalpha
139440  science_fiction     1        yoga    False
139441  science_fiction     1      you'll     True
139442  science_fiction     1    yourself    False
139443  science_fiction     1  zigzagging    False
139444  science_fiction     1        zone    False
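NLTK's ConditionalFreqDist can build the same per-category counts in a single pass; a minimal sketch of an equivalent construction:

cfd = nltk.ConditionalFreqDist(
    (category, word.lower())
    for category in brown.categories()
    for word in brown.words(categories=category))
rows = [(cat, w, cfd[cat][w]) for cat in cfd for w in cfd[cat]]
brown_df = pd.DataFrame(rows, columns=['category', 'word', 'freq'])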
redo_brown_words = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "brown_words.pickle"
mean_median_full_path = data_path + "brown_words_mean_median.pickle"
if not os.path.isfile(pickle_full_path) or redo_brown_words == True:
print 'Creating dataframe.'
total_freq = 0
weighted_length = 0
brown_words = brown_df[brown_df.nonalpha == 0]
brown_words = pd.DataFrame(brown_words.groupby(['word']).sum()).reset_index(drop=False)
brown_words['length'] = 0
brown_words.sort('freq', ascending = False, inplace=True)
total_freq = brown_words.freq.sum()
median_counter = total_freq / 2
median_found = False
    for idx, row in brown_words.iterrows():
        curr_len = len(brown_words.word[idx])
        brown_words.loc[idx, 'length'] = curr_len
        weighted_length += brown_words.freq[idx] * curr_len
        median_counter -= curr_len * brown_words.freq[idx]
        if median_counter < 0 and median_found == False:
            wt_median_len = curr_len
            median_found = True
brown_words = brown_words[['word', 'freq', 'length']]
brown_words.to_pickle(pickle_full_path)
wt_mean_len = weighted_length * 1.0 / total_freq
    with open(mean_median_full_path, "wb") as f:
        pickle.dump((wt_mean_len, wt_median_len), f)
else:
print 'Reading pickle.'
brown_words = pd.read_pickle(pickle_full_path)
    with open(mean_median_full_path, "rb") as f:
        wt_mean_len, wt_median_len = pickle.load(f)
print "Weighted mean word length: " + str(wt_mean_len)
print "Weighted median word length: " + str(wt_median_len)
print "Tail of brown_words dataframe:"
print brown_words.tail()
Creating dataframe.
Weighted mean word length: 4.68324851586
Weighted median word length: 4
Tail of brown_words dataframe:
                   word  freq  length
11805           enchant     1       7
11804         enchained     1       9
11803  encephalographic     1      16
11802      encephalitis     1      12
29339           regalia     1       7
The Europarl parallel-translation corpora are used because they are better curated than raw texts. The English corpus combines the English sides of the parallel translations for the languages with the most (not necessarily native) speakers in the world; a sketch of the concatenation follows.
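A minimal sketch of how the English sides could be concatenated before tokenizing, assuming init_path from above and the europarl-v7.<xx>-en.en naming used by the files below:

import glob
english_text = ''
for path in glob.glob(init_path + 'europarl-v7.*-en.en'):
    english_text += open(path, 'rU').read()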
import pandas as pd
import os
import pickle
import nltk
import re
europarl_filenames = {'Spanish':'europarl-v7.es-en.es', 'French':'europarl-v7.fr-en.fr',
'German':'europarl-v7.de-en.de', 'Portuguese':'europarl-v7.pt-en.pt',
'Italian':'europarl-v7.it-en.it', 'Polish':'europarl-v7.pl-en.pl',
'Finnish':'europarl-v7.fi-en.fi', 'English':'europarl-v7.fr-en.en'}
# this list is not complete; notably missing are Hungarian diacritics
accents = [(r'Ą', r'A'), (r'ą', r'a'), (r'Č', r'C'), (r'č', r'c'), (r'ď', r'd'), (r'Ę', r'E'), (r'ę', r'e'),
           (r'Ě', r'E'), (r'ě', r'e'), (r'Ĺ', r'L'), (r'ĺ', r'l'), (r'Ň', r'N'), (r'ň', r'n'), (r'Ŕ', r'R'),
           (r'ŕ', r'r'), (r'Ř', r'R'), (r'ř', r'r'), (r'ť', r't'), (r'Ů', r'U'), (r'ů', r'u'), (r'Ž', r'Z'),
           (r'ž', r'z'), (r'Á', r'A'), (r'á', r'a'), (r'Â', r'A'), (r'â', r'a'), (r'Ø', r'O'), (r'õ', r'o'),
           (r'À', r'A'), (r'à', r'a'), (r'Ä', r'A'), (r'ä', r'a'), (r'Ç', r'C'), (r'ç', r'c'), (r'É', r'E'),
           (r'é', r'e'), (r'Ê', r'E'), (r'ê', r'e'), (r'È', r'E'), (r'è', r'e'), (r'Ë', r'E'), (r'ë', r'e'),
           (r'Í', r'I'), (r'í', r'i'), (r'Î', r'I'), (r'î', r'i'), (r'Ì', r'I'), (r'ì', r'i'), (r'Ï', r'I'),
           (r'ï', r'i'), (r'Ñ', r'N'), (r'ñ', r'n'), (r'Ó', r'O'), (r'ó', r'o'), (r'Ô', r'O'), (r'ô', r'o'),
           (r'Ò', r'O'), (r'ò', r'o'), (r'Ö', r'O'), (r'ö', r'o'), (r'ø', r'o'), (r'Õ', r'O'),
           (r'Ú', r'U'), (r'ú', r'u'), (r'Û', r'U'), (r'û', r'u'), (r'Ù', r'U'), (r'ù', r'u'), (r'Ü', r'U'),
           (r'ü', r'u'), (r'Ý', r'Y'), (r'ý', r'y'), (r'Š', r'S'), (r'š', r's'), (r'ÿ', r'y'), (r'Ÿ', r'Y'),
           (r'Å', r'A'), (r'å', r'a'), (r'Ã', r'A'), (r'ã', r'a'),
           (r'Æ', r'Ae'), (r'æ', r'ae'), (r'Œ', r'Oe'), (r'œ', r'oe'), (r'ß', r'ss')]
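An alternative to the hand-built table: Unicode decomposition strips most diacritics automatically. A minimal sketch, assuming the text has been decoded to unicode; note that ligatures and stroked letters such as æ, œ, ø, and ß do not decompose, so they still need explicit substitutions like those above:

import unicodedata
def strip_accents(text):
    # decompose accented characters, then drop the combining marks
    decomposed = unicodedata.normalize('NFKD', text)
    return u''.join(c for c in decomposed if not unicodedata.combining(c))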
redo_europarl = True # change to False to read the existing pickle instead of re-munging
import time
start = time.time()
language = 'English' # This takes a long time: Finnish was the longest, at 10.3 hours on a
                     # $500 Windows Acer desktop computer bought in 2013, so the files were
                     # munged one by one instead of iterating over a list of them.
if language == 'English':
file_path = init_path + europarl_filenames[language]
pickle_path = data_path + 'europarl_' + language + ".pickle"
if not os.path.isfile(pickle_path) or redo_europarl == True:
print "\nProcessing", language
txt = open(file_path, 'rU').read()
freqdist = nltk.FreqDist(nltk.word_tokenize(txt))
europarl_df = pd.DataFrame()
        counter_max = len(freqdist.keys())
        # print a countdown from 100 to 5 as each 5% of the tokens is processed
        counter_list = []
        for i in range(20):
            counter_list.append(int(i * counter_max / 20))
        counter = -1
        countdown = 100
for token in freqdist.keys():
counter += 1
if counter in counter_list:
print countdown,
countdown -= 5
token_mod = token.lower()
for accent in accents:
token_mod = re.sub(accent[0], accent[1], token_mod)
            # keep tokens longer than one character that contain a vowel and only a-z
            if len(token_mod) > 1 and re.search('[aeiouy]', token_mod) and not re.search('[^a-z]', token_mod):
df_to_append = pd.DataFrame({'language':[language], 'word':[token_mod], 'freq':[freqdist[token]]})
europarl_df = europarl_df.append(df_to_append)
europarl_df.to_pickle(pickle_path)
print "\n%d minutes elapsed." % (int((time.time() - start) / 60))
print "Done."
Processing English
100 95 90 85 80 75 70 65 60 55 50 45 40 35 30 25 20 15 10 5
18 minutes elapsed.
Done.
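Note that appending a one-row dataframe per token is quadratic in the number of kept tokens; accumulating plain lists and building the frame once is much faster. A minimal sketch with the same filtering logic (variable names as above):

words, freqs = [], []
for token in freqdist.keys():
    token_mod = token.lower()
    for accent in accents:
        token_mod = re.sub(accent[0], accent[1], token_mod)
    if len(token_mod) > 1 and re.search('[aeiouy]', token_mod) and not re.search('[^a-z]', token_mod):
        words.append(token_mod)
        freqs.append(freqdist[token])
europarl_df = pd.DataFrame({'language': language, 'word': words, 'freq': freqs})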