by David Taylor, www.prooffreader.com, prooffreader@gmail.com
A collection of tools to create and analyze lists of words using Python with pandas and matplotlib.
Creates complete and summary dataframes.
Three corpus databases/word lists are implemented:
1. COHA (not public domain; single-word frequencies require a license to use)
2. Brown corpus (public domain, part of the Python Natural Language Toolkit, NLTK)
3. Europarl: A Parallel Corpus for Statistical Machine Translation, Philipp Koehn, MT Summit 2005 (http://statmt.org/europarl/)
The scripts should be easily adaptable to other word lists.
Uses the Corpus of Historical American English (COHA) 1-gram corpus from Brigham Young University, available for a fee or with an academic license from http://corpus.byu.edu/coha/
Only small snippets of up to five records at a time are shown as examples in this repo.
import pandas as pd
import os
import pickle
import json
init_path = 'data_initial/'
data_path = 'data_user_pickle_csv/'
coha_filename = 'coha_1_pos_n_cs_n.txt' # Without part of speech tagging or upper-/lowercase differentiation
Create "coha" initial dataframe
To do: add a column for the decade range, e.g. 1910-1940 if a word was used only within those decades; a sketch of one approach follows.
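A minimal sketch of that to-do, assuming the munged coha dataframe created below (one row per word per decade, integer decade column); the helper name is hypothetical:

def decade_range(df):
    # first and last decade in which each word appears, e.g. '1910-1940'
    grouped = df.groupby('word')['decade']
    return grouped.min().astype(str) + '-' + grouped.max().astype(str)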
redo_coha_initial = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "coha_1.pickle"
if not os.path.isfile(pickle_full_path) or redo_coha_initial == True: # if the pickle already exists, coha is read rather than re-munged
print "Processing " + coha_filename
coha = pd.read_table(init_path + coha_filename)
print "\nTail of initial dataframe:"
print coha.tail()
coha.columns = ['freq', 'word', 'decade']
coha = coha[['word', 'freq', 'decade']]
coha = coha.fillna('nan')
coha['nonalpha'] = False
    coha.loc[coha.word.str.contains('[^A-Za-z]'), 'nonalpha'] = True
coha['length'] = 0
decades = [1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890, 1900,
1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000]
    for idx, row in coha.iterrows(): ## this step takes 2-3 minutes on a medium-quality desktop computer ca 2013
        coha.loc[idx, 'decade'] = decades[coha['decade'][idx] - 1]
        coha.loc[idx, 'length'] = len(str(coha.word[idx]))
def coha_pct(group):
freq = group.freq.astype(float)
group['pct'] = (freq / freq.sum() * 100)
return group
coha = coha.groupby(['decade']).apply(coha_pct)
coha.to_pickle(pickle_full_path)
else:
print "Reading " + pickle_full_path
coha = pd.read_pickle(pickle_full_path)
print "\nTail of dataframe:"
print coha.tail()
# note: 'nonalpha' is True if the word contains any characters other than a-z or A-Z
Processing coha_1_pos_n_cs_n.txt

Tail of initial dataframe:
         freq    word-cs  decade
2539723     3       zzzz      20
2539724     1     zzzzzz      18
2539725     1     zzzzzz      20
2539726     3   zzzzzzzz      20
2539727     3  zzzzzzzzz      20

Tail of dataframe:
              word  freq  decade nonalpha  length       pct
2539723       zzzz     3    2000    False       4  0.000011
2539724     zzzzzz     1    1980    False       6  0.000004
2539725     zzzzzz     1    2000    False       6  0.000004
2539726   zzzzzzzz     3    2000    False       8  0.000011
2539727  zzzzzzzzz     3    2000    False       9  0.000011
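For reference, the row-by-row munge above can be vectorized, avoiding the 2-3 minute iterrows pass; a minimal sketch that assumes the same raw columns (decade codes 1-20) and replaces both the loop and the groupby/apply:

decades = range(1810, 2010, 10)  # 1810, 1820, ..., 2000
coha['decade'] = coha['decade'].map(lambda d: decades[d - 1])
coha['length'] = coha['word'].astype(str).str.len()
# per-decade percentage without a custom apply function
coha['pct'] = coha.groupby('decade')['freq'].transform(
    lambda s: s.astype(float) / s.sum() * 100)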
redo_coha_words = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "coha_words.pickle"
mean_median_full_path = data_path + "coha_words_mean_median.pickle"
if not os.path.isfile(pickle_full_path) or redo_coha_words == True:
print 'Creating dataframe.'
total_freq = 0
weighted_length = 0
coha_words = coha[coha.nonalpha == 0]
coha_words = pd.DataFrame(coha_words.groupby(['word']).sum()).reset_index(drop=False)
    # summed columns after groupby: freq; decade (erase1); nonalpha (erase2);
    # length summed across decades (decadesxlen = length x number of decades); pct (erase3)
    coha_words.columns = ['word', 'freq', 'erase1', 'erase2', 'decadesxlen', 'erase3']
coha_words['length'] = 0
coha_words['decades'] = 0
coha_words.sort('freq', ascending = False, inplace=True)
total_freq = coha_words.freq.sum()
median_counter = total_freq / 2
median_found = False
    for idx, row in coha_words.iterrows():
        curr_len = len(coha_words.word[idx])
        coha_words.loc[idx, 'length'] = curr_len
        coha_words.loc[idx, 'decades'] = coha_words.decadesxlen[idx] / curr_len
        weighted_length += coha_words.freq[idx] * curr_len
        median_counter -= curr_len * coha_words.freq[idx]
        if median_counter < 0 and median_found == False:
            wt_median_len = curr_len
            median_found = True
coha_words = coha_words[['word', 'freq', 'length', 'decades']]
coha_words.to_pickle(pickle_full_path)
wt_mean_len = weighted_length * 1.0 / total_freq
    with open(mean_median_full_path, "wb") as f:
        pickle.dump((wt_mean_len, wt_median_len), f)
else:
print 'Reading pickle.'
coha_words = pd.read_pickle(pickle_full_path)
    with open(mean_median_full_path, "rb") as f:
        wt_mean_len, wt_median_len = pickle.load(f)
print "Weighted mean word length: " + str(wt_mean_len)
print "Weighted median word length: " + str(wt_median_len)
print "Tail of coha_words dataframe:"
print coha_words.tail()
Reading pickle.
Weighted mean word length: 4.45220167708
Weighted median word length: 4
Tail of coha_words dataframe:
            word  freq  length  decades
336193      zieg     1       4        1
87532   eckerson     1       8        1
125546   haldore     1       7        1
303578    tranio     1       6        1
40616       burz     1       4        1
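For comparison, a conventional frequency-weighted median of word lengths sorts by length rather than by frequency and walks the cumulative frequency to the halfway point; a minimal sketch against coha_words (helper name hypothetical):

def weighted_median_length(df):
    # sort by length, then accumulate frequencies to the halfway point
    by_len = df.sort('length')[['length', 'freq']]
    half = by_len.freq.sum() / 2.0
    cumulative = 0
    for idx, row in by_len.iterrows():
        cumulative += row['freq']
        if cumulative >= half:
            return row['length']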
# create a list of coha_words that also appear in the Moby crossword dictionary
xword = set()
with open(init_path + 'word_list_moby_crossword.flat.txt', 'r') as f:
    for line in f:
        xword.add(line.strip())
cohawords = set(coha_words.word)
coha_moby = xword.intersection(cohawords)
print "{0} words out of intersection of {1} and {2}".format(len(coha_moby), len(cohawords), len(xword))
with open(data_path + 'coha_and_xword.json', 'w') as f:
    f.write(json.dumps(list(coha_moby)))
78280 words in the intersection of 337085 COHA words and 113809 crossword words
import pandas as pd
import os
import pickle
import nltk
from nltk.corpus import brown
#nltk.download() # uncomment to start downloader window to fetch brown or other corpora
data_path = 'data'
redo_brown_initial = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "/brown_df.pickle"
if not os.path.isfile(pickle_full_path) or redo_brown_initial == True:
print "Processing Brown corpus from NLTK"
categories = []
words = []
frequencies = []
for category in brown.categories():
wordlist = brown.words(categories=category)
freqs = nltk.FreqDist([w.lower() for w in wordlist])
for key in freqs.keys():
categories.append(category)
words.append(key)
frequencies.append(freqs[key])
brown_df = pd.DataFrame({'word':words, 'freq':frequencies, 'category':categories})
brown_df['nonalpha'] = False
    brown_df.loc[brown_df.word.str.contains('[^A-Za-z]'), 'nonalpha'] = True
brown_df.to_pickle(pickle_full_path)
else:
print "Reading " + pickle_full_path
brown_df = pd.read_pickle(pickle_full_path)
print "\nTail of dataframe:"
print brown_df.tail()
Processing Brown corpus from NLTK

Tail of dataframe:
               category  freq        word nonalpha
139440  science_fiction     1        yoga    False
139441  science_fiction     1      you'll     True
139442  science_fiction     1    yourself    False
139443  science_fiction     1  zigzagging    False
139444  science_fiction     1        zone    False
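NLTK's ConditionalFreqDist can build the same per-category counts in a single pass; a minimal sketch of an equivalent construction:

cfd = nltk.ConditionalFreqDist(
    (category, word.lower())
    for category in brown.categories()
    for word in brown.words(categories=category))
rows = [(cat, w, cfd[cat][w]) for cat in cfd for w in cfd[cat]]
brown_df = pd.DataFrame(rows, columns=['category', 'word', 'freq'])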
redo_brown_words = False # change to True to redo munge and rewrite pickle
pickle_full_path = data_path + "brown_words.pickle"
mean_median_full_path = data_path + "brown_words_mean_median.pickle"
if not os.path.isfile(pickle_full_path) or redo_brown_words == True:
print 'Creating dataframe.'
total_freq = 0
weighted_length = 0
brown_words = brown_df[brown_df.nonalpha == 0]
brown_words = pd.DataFrame(brown_words.groupby(['word']).sum()).reset_index(drop=False)
brown_words['length'] = 0
brown_words.sort('freq', ascending = False, inplace=True)
total_freq = brown_words.freq.sum()
median_counter = total_freq / 2
median_found = False
    for idx, row in brown_words.iterrows():
        curr_len = len(brown_words.word[idx])
        brown_words.loc[idx, 'length'] = curr_len
        weighted_length += brown_words.freq[idx] * curr_len
        median_counter -= curr_len * brown_words.freq[idx]
        if median_counter < 0 and median_found == False:
            wt_median_len = curr_len
            median_found = True
brown_words = brown_words[['word', 'freq', 'length']]
brown_words.to_pickle(pickle_full_path)
wt_mean_len = weighted_length * 1.0 / total_freq
    with open(mean_median_full_path, "wb") as f:
        pickle.dump((wt_mean_len, wt_median_len), f)
else:
print 'Reading pickle.'
brown_words = pd.read_pickle(pickle_full_path)
    with open(mean_median_full_path, "rb") as f:
        wt_mean_len, wt_median_len = pickle.load(f)
print "Weighted mean word length: " + str(wt_mean_len)
print "Weighted median word length: " + str(wt_median_len)
print "Tail of brown_words dataframe:"
print brown_words.tail()
Creating dataframe.
Weighted mean word length: 4.68324851586
Weighted median word length: 4
Tail of brown_words dataframe:
                   word  freq  length
11805           enchant     1       7
11804         enchained     1       9
11803  encephalographic     1      16
11802      encephalitis     1      12
29339           regalia     1       7
The Europarl parallel-translation corpora are used because they are better curated than raw texts. The English corpus combines the English sides of the parallel translations for the languages with the most (not necessarily native) speakers in the world; a sketch of the concatenation follows.
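A minimal sketch of how the English sides could be concatenated before tokenizing, assuming init_path from above and the europarl-v7.<xx>-en.en naming used by the files below:

import glob
english_text = ''
for path in glob.glob(init_path + 'europarl-v7.*-en.en'):
    english_text += open(path, 'rU').read()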
import pandas as pd
import os
import pickle
import nltk
import re
europarl_filenames = {'Spanish':'europarl-v7.es-en.es', 'French':'europarl-v7.fr-en.fr',
'German':'europarl-v7.de-en.de', 'Portuguese':'europarl-v7.pt-en.pt',
'Italian':'europarl-v7.it-en.it', 'Polish':'europarl-v7.pl-en.pl',
'Finnish':'europarl-v7.fi-en.fi', 'English':'europarl-v7.fr-en.en'}
# this list is not complete; notably missing are Hungarian diacritics
accents = [(r'Ą', r'A'), (r'ą', r'a'), (r'Č', r'C'), (r'č', r'c'), (r'ď', r'd'), (r'Ę', r'E'), (r'ę', r'e'),
           (r'Ě', r'E'), (r'ě', r'e'), (r'Ĺ', r'L'), (r'ĺ', r'l'), (r'Ň', r'N'), (r'ň', r'n'), (r'Ŕ', r'R'),
           (r'ŕ', r'r'), (r'Ř', r'R'), (r'ř', r'r'), (r'ť', r't'), (r'Ů', r'U'), (r'ů', r'u'), (r'Ž', r'Z'),
           (r'ž', r'z'), (r'Á', r'A'), (r'á', r'a'), (r'Â', r'A'), (r'â', r'a'), (r'Ø', r'O'), (r'õ', r'o'),
           (r'À', r'A'), (r'à', r'a'), (r'Ä', r'A'), (r'ä', r'a'), (r'Ç', r'C'), (r'ç', r'c'), (r'É', r'E'),
           (r'é', r'e'), (r'Ê', r'E'), (r'ê', r'e'), (r'È', r'E'), (r'è', r'e'), (r'Ë', r'E'), (r'ë', r'e'),
           (r'Í', r'I'), (r'í', r'i'), (r'Î', r'I'), (r'î', r'i'), (r'Ì', r'I'), (r'ì', r'i'), (r'Ï', r'I'),
           (r'ï', r'i'), (r'Ñ', r'N'), (r'ñ', r'n'), (r'Ó', r'O'), (r'ó', r'o'), (r'Ô', r'O'), (r'ô', r'o'),
           (r'Ò', r'O'), (r'ò', r'o'), (r'Ö', r'O'), (r'ö', r'o'), (r'ø', r'o'), (r'Õ', r'O'),
           (r'Ú', r'U'), (r'ú', r'u'), (r'Û', r'U'), (r'û', r'u'), (r'Ù', r'U'), (r'ù', r'u'), (r'Ü', r'U'),
           (r'ü', r'u'), (r'Ý', r'Y'), (r'ý', r'y'), (r'Š', r'S'), (r'š', r's'), (r'ÿ', r'y'), (r'Ÿ', r'Y'),
           (r'Å', r'A'), (r'å', r'a'), (r'Ã', r'A'), (r'ã', r'a'),
           (r'Æ', r'Ae'), (r'æ', r'ae'), (r'Œ', r'Oe'), (r'œ', r'oe'), (r'ß', r'ss')]
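An alternative to the hand-built table: Unicode decomposition strips most diacritics automatically. A minimal sketch, assuming the text has been decoded to unicode; note that ligatures and stroked letters such as æ, œ, ø, and ß do not decompose, so they still need explicit substitutions like those above:

import unicodedata
def strip_accents(text):
    # decompose accented characters, then drop the combining marks
    decomposed = unicodedata.normalize('NFKD', text)
    return u''.join(c for c in decomposed if not unicodedata.combining(c))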
redo_europarl = True # change to False to read the existing pickle instead of re-munging
import time
start = time.time()
language = 'English' # This takes a long time: Finnish was the longest, at 10.3 hours on a
                     # $500 Windows Acer desktop computer bought in 2013, so the files were
                     # munged one by one instead of iterating over a list of them.
if language == 'English':
file_path = init_path + europarl_filenames[language]
pickle_path = data_path + 'europarl_' + language + ".pickle"
if not os.path.isfile(pickle_path) or redo_europarl == True:
print "\nProcessing", language
txt = open(file_path, 'rU').read()
freqdist = nltk.FreqDist(nltk.word_tokenize(txt))
europarl_df = pd.DataFrame()
        counter_max = len(freqdist.keys())
        # print a countdown from 100 to 5 as each 5% of the tokens is processed
        counter_list = []
        for i in range(20):
            counter_list.append(int(i * counter_max / 20))
        counter = -1
        countdown = 100
for token in freqdist.keys():
counter += 1
if counter in counter_list:
print countdown,
countdown -= 5
token_mod = token.lower()
for accent in accents:
token_mod = re.sub(accent[0], accent[1], token_mod)
            # keep tokens longer than one character that contain a vowel and only a-z
            if len(token_mod) > 1 and re.search('[aeiouy]', token_mod) and not re.search('[^a-z]', token_mod):
df_to_append = pd.DataFrame({'language':[language], 'word':[token_mod], 'freq':[freqdist[token]]})
europarl_df = europarl_df.append(df_to_append)
europarl_df.to_pickle(pickle_path)
print "\n%d minutes elapsed." % (int((time.time() - start) / 60))
print "Done."
Processing English
100 95 90 85 80 75 70 65 60 55 50 45 40 35 30 25 20 15 10 5
18 minutes elapsed.
Done.
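Note that appending a one-row dataframe per token is quadratic in the number of kept tokens; accumulating plain lists and building the frame once is much faster. A minimal sketch with the same filtering logic (variable names as above):

words, freqs = [], []
for token in freqdist.keys():
    token_mod = token.lower()
    for accent in accents:
        token_mod = re.sub(accent[0], accent[1], token_mod)
    if len(token_mod) > 1 and re.search('[aeiouy]', token_mod) and not re.search('[^a-z]', token_mod):
        words.append(token_mod)
        freqs.append(freqdist[token])
europarl_df = pd.DataFrame({'language': language, 'word': words, 'freq': freqs})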