Notebook

In [16]:

import json
import codecs
import operator
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import glob
from datetime import datetime
from dateutil import tz
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.collocations import *
from time import strftime, localtime
import gc
import sys

#show = sys.argv[1]
out = codecs.open(show.strip('#') + '.trigrams.txt', 'w', encoding='utf-8')

print "start reading file at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
with codecs.open('../twarc/' + show.strip('#') + '.txt', 'r', encoding='utf-8') as content_file:
    content = content_file.read()
print "finished reading file at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())

'''
jsons = glob.glob('../twarc/' + show + '*')
f = codecs.open('../twarc/scandal.txt', 'r', encoding='utf-8')
data = []
words = ""
count = 0
for line in f:
    count += 1
    print str(count)
    words += line
# All attempts to process these massive JSON collections crap out.
# The Python objects that result from parsing the json just don't scale
# 16G Memory locks down machine after a few hundred thousand.
An alternate approach inolves pulling necessary data out via JQ beforehand:
charper@charper-ThinkPad-T530:~/Dropbox/stern/pds/twarc$ touch scandal.txt
charper@charper-ThinkPad-T530:~/Dropbox/stern/pds/twarc$ for j in \#Scandal-201311*
> do
> cat $j | jq '.text' >> scandal.txt
> done

'''

print "starting tokenization at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
tokens = content.split()
print "starting filtering at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
# Stopwords requires download of stopwords corpora -- import nltk, nltk.download()
stops = stopwords.words('english')
#specifics = ['#DoctorWho', '#DoctorWho50th', '#SaveTheDay', '#DayoftheDoctor', 'RT']
specifics = ['#sleepyhollow', '#Sleepyhollow', 'RT']
stops.extend(specifics)
#filtered_words = [w for w in tokens if not w in stops]
filtered_words = [w for w in tokens if not w in stopwords.words('english')]

print "starting finder at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(filtered_words)
finder.ngram_fd.viewitems()
finder.apply_freq_filter(3)
print finder.nbest(bigram_measures.pmi, 100)
out.write(str(finder.nbest(bigram_measures.pmi, 100)) + "\n")

print "trigrams & frequency at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
tgs = nltk.trigrams(filtered_words)

fdist = nltk.FreqDist(tgs)
for k,v in fdist.items():
    if int(v) > 10: 
        out.write(str(k) + "|" + str(v) + "\n")
'''
Part of speech tagging code, that mostly works, but didn't make the final iteration of things
print "part of speech tagging at : " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
pos = nltk.pos_tag(filtered_words)
pos_df = pd.DataFrame.from_records(pos)
pos_df.to_csv('sleepyhollow.pos.csv', encoding='utf8', quoting=1, index=False)
'''

print "finsihed at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())

start reading file at: Wed, 18 Dec 2013 18:55:30 +0000
finished reading file at: Wed, 18 Dec 2013 18:55:30 +0000
starting tokenization at: Wed, 18 Dec 2013 18:55:30 +0000
starting filtering at: Wed, 18 Dec 2013 18:55:30 +0000
starting finder at: Wed, 18 Dec 2013 18:55:49 +0000
[(u'"#Lmfao', u'#Lmao'), (u'"watched', u'yesterday:'), (u'#Apps:', u'#DeadTrigger'), (u'#ArrestedDevelopment', u'#RETWEET"'), (u'#Blog', u'#Geek"'), (u'#Bud', u'http://t.co/fU3lXpg25T"'), (u'#Conan\\n2)', u'#DoratheExplorer\\n3)'), (u'#DoratheExplorer\\n3)', u'#TOMANDJERRY\\n4)'), (u'#EndersGame', u'Review\\"'), (u'#FREE', u'#podcast!'), (u'#FollowAndFollow', u'siga'), (u'#GameFly', u'http://t.co/AagvYH9IrD"'), (u'#HarryStyles', u'before!!'), (u'#HomerSimpson\\n#Nerdvana', u'http://t.co/U9EMeQ7I39"'), (u'#JensenAckles', u'#JaredPadalecki"'), (u'#Jetpack', u'#GameFly'), (u'#MrBergstrom', u'http://t.co/wEQ2gJIhAy"'), (u'#Nonfiction', u'#Books"'), (u'#OldEpisodes', u'#BestFriend"'), (u'#StarCars:', u'http://t.co/KdOJ4uIkuB'), (u'#TOMANDJERRY\\n4)', u'#TheSimpsons\\n5)...'), (u'#TheHobbit\\n#TheSimpsons', u'#HomerSimpson\\n#Nerdvana'), (u'#TheSimpsons)', u'Toronto.'), (u'#YouAreLisaSimpsons', u'w/@louisaevecohen'), (u'#codeine', u'#hustle'), (u'#detention', u'#donthaveacowman'), (u'#donthaveacowman', u'#mattgroening\u2026'), (u'#eatmyshorts', u'#detention'), (u'#ethreal', u'#trippy'), (u'#justgonnareplacetheskinnerpartaccordingly', u'#simpsonslogic'), (u'#kush', u'#trapmusic'), (u'#mattgroening\u2026', u'http://t.co/xSH6mFN1Rq"'), (u'#newlifemotto', u'#justgonnareplacetheskinnerpartaccordingly'), (u'#nintendo', u'#gamersunite'), (u'#nouraakekhonger', u'http://t.co/uvHqurEbYP"'), (u'#ociogay', u'#pikoftheday'), (u'#trippy', u'#wave'), (u'#wave', u'#kush'), (u'#\u0627\u0644\u0623\u0643\u062b\u0631', u'#\u0634\u0639\u0628\u064a\u0629'), (u'#\u0627\u0644\u0639\u0627\u0644\u0645', u'\u061f\\n1)'), (u'#\u0634\u0639\u0628\u064a\u0629', u'\u0641\u064a'), (u'(From', u'#TheSimpsons)'), (u'(cc', u'@camiiilleem'), (u'11x17', u'Signed'), (u'526-529', u'#nintendo'), (u'@2amsnaps:', u'[JOKWON'), (u'@Andreita_Villas', u'@luciasolerapine'), (u'@AzuGadoMart:', u'Haz'), (u'@BeverlyMacca1:', u'OK'), (u'@Brad_Cibane', u'@Julius_S_Malema'), (u'@DACrosse', u'@dliebma'), (u'@FaythAnderton:', u'Omg'), (u'@Fercharmed:', u'.@Alyssa_Milano'), (u'@Joshstrangehill', u'@bbcradiomanc'), (u'@Josi_RF:', u'\\"Hoy'), (u'@LITTLEREGGAEMAN:', u'-Jaa'), (u'@LaurenTom9000', u'#IKnowThatVoice,'), (u'@LetsGoSeeDo', u'@DACrosse'), (u'@LottaBitt:', u'Wtf!'), (u'@Lurdesferizq', u'#osquiero'), (u'@Nathaliajshj:', u'\u2665\\"'), (u'@TELUS', u'Optik'), (u'@VictorGionatan:', u'Ma'), (u'@_ItsEli:', u'Nel'), (u'@andreagandia14', u'http://t.co/bkXfsGIDRG"'), (u'@bbcradiomanc', u'4.50pm'), (u'@camiiilleem', u'@MargotDaval'), (u'@debikayo', u'@kaywyoming'), (u'@kaywyoming', u'@LetsGoSeeDo'), (u'@luciasolerapine', u'@Lurdesferizq'), (u'@malmarri', u'@Salem_Belyouha'), (u'@mariamas05', u'@andreagandia14'), (u'@mark_jubb:', u'OMG,'), (u'@paula200084', u'@Andreita_Villas'), (u'@veronii29082249', u'@paula200084'), (u'Astilla\\"', u'#budumtss'), (u'Benvingut,', u'nano!'), (u'Bradbury,', u'Matheson.'), (u'C\u2019mon!', u'You\u2019ve'), (u'DANG', u'ROOF!!\\n#thesimpsons'), (u'DNS?\\n\xa1NOSOTROS!', u'\xa1NOSOTROS!\\n#sindromeDeDAW'), (u'Database.', u'Please.'), (u'Dias', u':)))'), (u'Doh!\\n\\nhttp://t.co/KmbKePxpK5\\n#LOTR', u'#TheHobbit\\n#TheSimpsons'), (u'Email', u'jen@tvfilmnewsllc.com'), (u'Emmy', u'nom.'), (u'Fansite?', u'Email'), (u'GET', u'OFF'), (u'GeoGantArt', u'http://t.co/m77SOEb82p'), (u'HOLLYWOOD', u"STAR'S"), (u'Hershel', u'Krustovsky'), (u'Hierarchy', u'Quoting'), (u'ICONIC', u'DRESSES'), (u'IP?', u'\\n\xbfQuien'), (u'Interactive', u'Map'), (u'Jamaicans??', u'Lol...wow...'), (u'Jump', u'see:'), (u'Lunar', u'Module'), (u'MA!', u'GET'), (u'MOST', u'ICONIC')]
trigrams & frequency at: Wed, 18 Dec 2013 18:55:52 +0000
finsihed at: Wed, 18 Dec 2013 18:55:54 +0000

In [18]:

print "part of speech tagging at : " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
pos = nltk.pos_tag(filtered_words)
pos_df = pd.DataFrame.from_records(pos)
pos_df.to_csv(show.strip('#') + '.pos.csv', encoding='utf8', quoting=1, index=False)

part of speech tagging at : Wed, 18 Dec 2013 18:57:08 +0000

In [19]:

pos_df.head(10)

Out[19]:

	0	1
0	"#TheSimpsons:	NN
1	FXX	NNP
2	sichert	NN
3	sich	NN
4	Wiederholungsrechte	NNP
5	Mega-Deal	NNP
6	http://t.co/UQxUm3pylV"	NNP
7	"#thesimpsons	NNS
8	@ProSieben"	-NONE-
9	"Yes	VBZ

In [23]:

counts = pos_df.groupby(1).size()
counts.sort(ascending=False)
counts.head(10)

Out[23]:

1
NNP       59609
NN        55267
JJ        26434
NNS       23771
-NONE-    17535
CD         4508
RB         3765
VBP        3564
:          3331
VBZ        3124
dtype: int64

In [31]:

!awk ' BEGIN { FS = "\t" } { print $1, $5 } ' POSMappings.txt

Category PTB
Adjective JJ
Adjective, ordinal number JJ
Adjective, comparative JJR
Adjective, superlative JJS
Adjective, superlative, semantically JJ
Adjective, cardinal number CD
Adjective, cardinal number, one CD
Adjective, past-part of verb VBN, JJ
Adjective, pres-part of verb VBG, JJ
Adverb RB
Adverb, negative RB
Adverb, comparative RBR
Adverb, superlative RBS
Adverb, particle RP
Adverb, question WRB
Adverb, degree & question WRB
Adverb, degree RB
Adverb, degree, postposed RB
Adverb, nominal RB
Adverb, conjunctive RB
Conjunction, coordination CC
Conjunction, subordinating IN
Conjunction, complementizer 'that' IN
Determiner DT
Determiner, pronoun DT
Determiner, pronoun, plural DT
Determiner, prequalifier PDT
Determiner, prequalifier PDT
Determiner, pronoun or double conj. DT (CC)
Determiner, pronoun or double conj. DT (CC)
Determiner, article DT
Determiner, article DT
Determiner, postdeterminer JJ
Determiner, possessive PRP$
Determiner, possessive, second PRP
Determiner, question WDT
Determiner, possessive & question WP$
Noun NN
Noun, singular NN
Noun, plural NNS
Noun, proper, singular NNP
Noun, proper, plural NNPS
Noun, adverbial NN, NNP, RB
Noun, plural from post-determiner NNS
Pronoun, nominal (indefinite) NN
Pronoun, personal, subject PRP
Pronoun, personal, subject, 3SG PRP
Pronoun, personal, object PRP
Pronoun, reflexive PRP
Pronoun, reflexive, plural PRP
Pronoun, question, subject WP
Pronoun, question, object WP
Pronoun, question, existential there EX
Verb. base present form (not infinitive) VBP
Verb, infinitive VB
Verb, past tense VBD
Verb, present participle VBG
Verb, past/passive participle VBN
Verb, present 3SG -s form VBZ
Verb, auxilliary do, base VBP
Verb, auxilliary do, infinitive VB
Verb, auxilliary do, past VBD
Verb, auxilliary do, present part. VBG
Verb, auxilliary do, past part. VBN
Verb, auxilliary do, present 3SG VBZ
Verb, auxilliary have, base VBP
Verb, auxilliary have, infinitive VB
Verb, auxilliary have, past VBD
Verb, auxilliary have, present part. VBG
Verb, auxilliary have, past part. VBN
Verb, auxilliary have, present 3SG VBZ
Verb, auxilliary be, infinitive VB
Verb, auxilliary be, past VBD
Verb, auxilliary be, past, 3SG VBD
Verb, auxilliary be, present part. VBG
Verb, auxilliary be, past part. VBN
Verb, auxilliary be, present, 3SG VBZ
Verb, auxilliary be, present, 1SG VBP
Verb, auxilliary be, present VBP
Verb, modal MD
Infinitive marker TO
Preposition, to TO
Preposition, IN
Preposition, of IN
Possessive POS
Interjection (or other isolate) UH
Punctuation, sentence ender .
Punctuation, semicolon :
Punctuation, colon or ellipsis :
Punctuation, comma ,
Punctuation, dash -
Punctuation, dollar sign $
Punctuation, left bracket (
Punctuation, right bracket )
Punctuation, left quotation ``
Punctuation, right quotation ''
Foreign words (not in English lexicon) FW
Symbol SYM
Symbol, alphabetical 
Symbol, list item LS
URL or email address ??
Emoticon ??
Online discourse marker ??
Possessive nominal ??
Possessive proper noun ??
Nominal combined with verbal ??
Proper noun combined with verbal ??
Miscellaneous function word combined with verbal ??

In [ ]: