import pandas as pd
import time
from math import ceil
import pickle
# load pickle of all words by decade and remove words that appear in more than 15 of the 20 decades
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
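# columns inferred from the usage below: word, decade, pct (the word's frequency measure in that decade), and a nonalpha flag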
origlen = len(df)
origwds = len(df.word.unique())
df = df[df.nonalpha == False] # remove words with nonalphanumeric characters
wordcount = pd.DataFrame(df.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df = df[df.word.isin(wordcount.index)]
df = df[['word', 'decade', 'pct']]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()),
len(df.word.unique())*100.0/origwds)
print df.head(10)
# keep only words in the crossword dictionary, i.e. filter out proper nouns
origlen = len(df)
origwds = len(df.word.unique())
import json
# xwords: assumed to be a JSON array of words found in both COHA and a crossword dictionary (inferred from the filename)
with open('../data_user_pickle_csv/coha_and_xword.json') as f:
    xwords = json.load(f)
df = df[df.word.isin(xwords)]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()),
len(df.word.unique())*100.0/origwds)
# keep words in the top 10000 by either total pct across decades (sum) or peak single-decade pct (max)
origlen = len(df)
origwds = len(df.word.unique())
dfsum = pd.DataFrame(df.groupby('word').pct.sum())
dfsum.sort_values('pct', ascending=False, inplace=True)
dfsum = dfsum[:10000]
dfmax = pd.DataFrame(df.groupby('word').pct.max())
dfmax.sort_values('pct', ascending=False, inplace=True)
dfmax = dfmax[:10000]
df = df[(df.word.isin(dfsum.index)) | (df.word.isin(dfmax.index))]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()),
len(df.word.unique())*100.0/origwds)
# add decade counts, average pct per decade, and a decade-specificity score to dfsum
series_count = df.groupby('word').decade.count()
dfsum['decades'] = series_count[dfsum.index].values
dfsum['pct_per_decade'] = dfsum.pct / dfsum.decades
dfsum['decade_specificity'] = 20 - dfsum.decades  # the 20 decades span 1810-2000
dfsum.sort_values('pct_per_decade', ascending=False, inplace=True)
print(dfsum.head(50))
# for contrast, rebuild the list without the crossword-dictionary filter to see the top omitted words (mostly proper nouns)
df_proper = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df_proper = df_proper[df_proper.nonalpha == False] # remove words with nonalphanumeric characters
wordcount = pd.DataFrame(df_proper.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df_proper = df_proper[df_proper.word.isin(wordcount.index)]
df_proper = df_proper[['word', 'decade', 'pct']]
df_propersum = pd.DataFrame(df_proper.groupby('word').pct.sum())
df_propersum.sort_values('pct', ascending=False, inplace=True)
df_propersum = df_propersum[:10000]
df_propermax = pd.DataFrame(df_proper.groupby('word').pct.max())
df_propermax.sort_values('pct', ascending=False, inplace=True)
df_propermax = df_propermax[:10000]
df_proper = df_proper[(df_proper.word.isin(df_propersum.index)) | (df_proper.word.isin(df_propermax.index))]
proper_series_count = df_proper.groupby('word').decade.count()
df_propersum['pct_per_decade'] = (df_propersum.pct /
                                  proper_series_count[df_propersum.index].values)
df_propersum.sort_values('pct_per_decade', ascending=False, inplace=True)
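# drop words that already made the crossword-filtered list, leaving only the omitted (mostly proper-noun) words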
df_propersum = df_propersum[~df_propersum.index.isin(dfsum.index)]
print(df_propersum.head(50))
df_propersum[:50].to_csv('coha_top_omitted_proper_nouns.csv')
# build a pivot-style table showing each top word's pct in every decade (0 where absent)
decades = list(range(1810, 2010, 10))
dftop = dfsum[:50].copy()
dftoplookup = df.copy()
for decade in decades:
    dftop[decade] = 0.0
for word in dftop.index:
    for decade in decades:
        match = dftoplookup[(dftoplookup.word == word) &
                            (dftoplookup.decade == decade)]
        if len(match) > 0:
            dftop.loc[word, decade] = match.pct.iloc[0]
print(dftop.head())
dftop.to_csv('coha_top_decades.csv')
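# The same table could be built more idiomatically with pivot_table; a sketch,
# assuming df holds at most one row per (word, decade) pair (dftop_alt is a
# hypothetical name, not part of the original analysis):
dftop_alt = dfsum[:50].join(
    df[df.word.isin(dfsum[:50].index)]
      .pivot_table(index='word', columns='decade', values='pct', aggfunc='first')
      .reindex(columns=decades)
      .fillna(0.0))
# dftop_alt should match dftop up to column order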