import json
import codecs
import operator
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import glob
from datetime import datetime
from dateutil import tz
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
from nltk.collocations import *
from time import strftime, localtime
import gc
import sys
#show = sys.argv[1]
out = codecs.open(show.strip('#') + '.trigrams.txt', 'w', encoding='utf-8')
print "start reading file at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
with codecs.open('../twarc/' + show.strip('#') + '.txt', 'r', encoding='utf-8') as content_file:
content = content_file.read()
print "finished reading file at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
'''
jsons = glob.glob('../twarc/' + show + '*')
f = codecs.open('../twarc/scandal.txt', 'r', encoding='utf-8')
data = []
words = ""
count = 0
for line in f:
count += 1
print str(count)
words += line
# All attempts to process these massive JSON collections crap out.
# The Python objects that result from parsing the json just don't scale
# 16G Memory locks down machine after a few hundred thousand.
An alternate approach inolves pulling necessary data out via JQ beforehand:
charper@charper-ThinkPad-T530:~/Dropbox/stern/pds/twarc$ touch scandal.txt
charper@charper-ThinkPad-T530:~/Dropbox/stern/pds/twarc$ for j in \#Scandal-201311*
> do
> cat $j | jq '.text' >> scandal.txt
> done
'''
print "starting tokenization at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
tokens = content.split()
print "starting filtering at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
# Stopwords requires download of stopwords corpora -- import nltk, nltk.download()
stops = stopwords.words('english')
#specifics = ['#DoctorWho', '#DoctorWho50th', '#SaveTheDay', '#DayoftheDoctor', 'RT']
specifics = ['#sleepyhollow', '#Sleepyhollow', 'RT']
stops.extend(specifics)
#filtered_words = [w for w in tokens if not w in stops]
filtered_words = [w for w in tokens if not w in stopwords.words('english')]
print "starting finder at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(filtered_words)
finder.ngram_fd.viewitems()
finder.apply_freq_filter(3)
print finder.nbest(bigram_measures.pmi, 100)
out.write(str(finder.nbest(bigram_measures.pmi, 100)) + "\n")
print "trigrams & frequency at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
tgs = nltk.trigrams(filtered_words)
fdist = nltk.FreqDist(tgs)
for k,v in fdist.items():
if int(v) > 10:
out.write(str(k) + "|" + str(v) + "\n")
'''
Part of speech tagging code, that mostly works, but didn't make the final iteration of things
print "part of speech tagging at : " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
pos = nltk.pos_tag(filtered_words)
pos_df = pd.DataFrame.from_records(pos)
pos_df.to_csv('sleepyhollow.pos.csv', encoding='utf8', quoting=1, index=False)
'''
print "finsihed at: " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
start reading file at: Wed, 18 Dec 2013 18:55:30 +0000 finished reading file at: Wed, 18 Dec 2013 18:55:30 +0000 starting tokenization at: Wed, 18 Dec 2013 18:55:30 +0000 starting filtering at: Wed, 18 Dec 2013 18:55:30 +0000 starting finder at: Wed, 18 Dec 2013 18:55:49 +0000 [(u'"#Lmfao', u'#Lmao'), (u'"watched', u'yesterday:'), (u'#Apps:', u'#DeadTrigger'), (u'#ArrestedDevelopment', u'#RETWEET"'), (u'#Blog', u'#Geek"'), (u'#Bud', u'http://t.co/fU3lXpg25T"'), (u'#Conan\\n2)', u'#DoratheExplorer\\n3)'), (u'#DoratheExplorer\\n3)', u'#TOMANDJERRY\\n4)'), (u'#EndersGame', u'Review\\"'), (u'#FREE', u'#podcast!'), (u'#FollowAndFollow', u'siga'), (u'#GameFly', u'http://t.co/AagvYH9IrD"'), (u'#HarryStyles', u'before!!'), (u'#HomerSimpson\\n#Nerdvana', u'http://t.co/U9EMeQ7I39"'), (u'#JensenAckles', u'#JaredPadalecki"'), (u'#Jetpack', u'#GameFly'), (u'#MrBergstrom', u'http://t.co/wEQ2gJIhAy"'), (u'#Nonfiction', u'#Books"'), (u'#OldEpisodes', u'#BestFriend"'), (u'#StarCars:', u'http://t.co/KdOJ4uIkuB'), (u'#TOMANDJERRY\\n4)', u'#TheSimpsons\\n5)...'), (u'#TheHobbit\\n#TheSimpsons', u'#HomerSimpson\\n#Nerdvana'), (u'#TheSimpsons)', u'Toronto.'), (u'#YouAreLisaSimpsons', u'w/@louisaevecohen'), (u'#codeine', u'#hustle'), (u'#detention', u'#donthaveacowman'), (u'#donthaveacowman', u'#mattgroening\u2026'), (u'#eatmyshorts', u'#detention'), (u'#ethreal', u'#trippy'), (u'#justgonnareplacetheskinnerpartaccordingly', u'#simpsonslogic'), (u'#kush', u'#trapmusic'), (u'#mattgroening\u2026', u'http://t.co/xSH6mFN1Rq"'), (u'#newlifemotto', u'#justgonnareplacetheskinnerpartaccordingly'), (u'#nintendo', u'#gamersunite'), (u'#nouraakekhonger', u'http://t.co/uvHqurEbYP"'), (u'#ociogay', u'#pikoftheday'), (u'#trippy', u'#wave'), (u'#wave', u'#kush'), (u'#\u0627\u0644\u0623\u0643\u062b\u0631', u'#\u0634\u0639\u0628\u064a\u0629'), (u'#\u0627\u0644\u0639\u0627\u0644\u0645', u'\u061f\\n1)'), (u'#\u0634\u0639\u0628\u064a\u0629', u'\u0641\u064a'), (u'(From', u'#TheSimpsons)'), (u'(cc', u'@camiiilleem'), (u'11x17', u'Signed'), (u'526-529', u'#nintendo'), (u'@2amsnaps:', u'[JOKWON'), (u'@Andreita_Villas', u'@luciasolerapine'), (u'@AzuGadoMart:', u'Haz'), (u'@BeverlyMacca1:', u'OK'), (u'@Brad_Cibane', u'@Julius_S_Malema'), (u'@DACrosse', u'@dliebma'), (u'@FaythAnderton:', u'Omg'), (u'@Fercharmed:', u'.@Alyssa_Milano'), (u'@Joshstrangehill', u'@bbcradiomanc'), (u'@Josi_RF:', u'\\"Hoy'), (u'@LITTLEREGGAEMAN:', u'-Jaa'), (u'@LaurenTom9000', u'#IKnowThatVoice,'), (u'@LetsGoSeeDo', u'@DACrosse'), (u'@LottaBitt:', u'Wtf!'), (u'@Lurdesferizq', u'#osquiero'), (u'@Nathaliajshj:', u'\u2665\\"'), (u'@TELUS', u'Optik'), (u'@VictorGionatan:', u'Ma'), (u'@_ItsEli:', u'Nel'), (u'@andreagandia14', u'http://t.co/bkXfsGIDRG"'), (u'@bbcradiomanc', u'4.50pm'), (u'@camiiilleem', u'@MargotDaval'), (u'@debikayo', u'@kaywyoming'), (u'@kaywyoming', u'@LetsGoSeeDo'), (u'@luciasolerapine', u'@Lurdesferizq'), (u'@malmarri', u'@Salem_Belyouha'), (u'@mariamas05', u'@andreagandia14'), (u'@mark_jubb:', u'OMG,'), (u'@paula200084', u'@Andreita_Villas'), (u'@veronii29082249', u'@paula200084'), (u'Astilla\\"', u'#budumtss'), (u'Benvingut,', u'nano!'), (u'Bradbury,', u'Matheson.'), (u'C\u2019mon!', u'You\u2019ve'), (u'DANG', u'ROOF!!\\n#thesimpsons'), (u'DNS?\\n\xa1NOSOTROS!', u'\xa1NOSOTROS!\\n#sindromeDeDAW'), (u'Database.', u'Please.'), (u'Dias', u':)))'), (u'Doh!\\n\\nhttp://t.co/KmbKePxpK5\\n#LOTR', u'#TheHobbit\\n#TheSimpsons'), (u'Email', u'jen@tvfilmnewsllc.com'), (u'Emmy', u'nom.'), (u'Fansite?', u'Email'), (u'GET', u'OFF'), (u'GeoGantArt', u'http://t.co/m77SOEb82p'), (u'HOLLYWOOD', u"STAR'S"), (u'Hershel', u'Krustovsky'), (u'Hierarchy', u'Quoting'), (u'ICONIC', u'DRESSES'), (u'IP?', u'\\n\xbfQuien'), (u'Interactive', u'Map'), (u'Jamaicans??', u'Lol...wow...'), (u'Jump', u'see:'), (u'Lunar', u'Module'), (u'MA!', u'GET'), (u'MOST', u'ICONIC')] trigrams & frequency at: Wed, 18 Dec 2013 18:55:52 +0000 finsihed at: Wed, 18 Dec 2013 18:55:54 +0000
print "part of speech tagging at : " + time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime())
pos = nltk.pos_tag(filtered_words)
pos_df = pd.DataFrame.from_records(pos)
pos_df.to_csv(show.strip('#') + '.pos.csv', encoding='utf8', quoting=1, index=False)
part of speech tagging at : Wed, 18 Dec 2013 18:57:08 +0000
pos_df.head(10)
0 | 1 | |
---|---|---|
0 | "#TheSimpsons: | NN |
1 | FXX | NNP |
2 | sichert | NN |
3 | sich | NN |
4 | Wiederholungsrechte | NNP |
5 | Mega-Deal | NNP |
6 | http://t.co/UQxUm3pylV" | NNP |
7 | "#thesimpsons | NNS |
8 | @ProSieben" | -NONE- |
9 | "Yes | VBZ |
counts = pos_df.groupby(1).size()
counts.sort(ascending=False)
counts.head(10)
1 NNP 59609 NN 55267 JJ 26434 NNS 23771 -NONE- 17535 CD 4508 RB 3765 VBP 3564 : 3331 VBZ 3124 dtype: int64
!awk ' BEGIN { FS = "\t" } { print $1, $5 } ' POSMappings.txt
Category PTB Adjective JJ Adjective, ordinal number JJ Adjective, comparative JJR Adjective, superlative JJS Adjective, superlative, semantically JJ Adjective, cardinal number CD Adjective, cardinal number, one CD Adjective, past-part of verb VBN, JJ Adjective, pres-part of verb VBG, JJ Adverb RB Adverb, negative RB Adverb, comparative RBR Adverb, superlative RBS Adverb, particle RP Adverb, question WRB Adverb, degree & question WRB Adverb, degree RB Adverb, degree, postposed RB Adverb, nominal RB Adverb, conjunctive RB Conjunction, coordination CC Conjunction, subordinating IN Conjunction, complementizer 'that' IN Determiner DT Determiner, pronoun DT Determiner, pronoun, plural DT Determiner, prequalifier PDT Determiner, prequalifier PDT Determiner, pronoun or double conj. DT (CC) Determiner, pronoun or double conj. DT (CC) Determiner, article DT Determiner, article DT Determiner, postdeterminer JJ Determiner, possessive PRP$ Determiner, possessive, second PRP Determiner, question WDT Determiner, possessive & question WP$ Noun NN Noun, singular NN Noun, plural NNS Noun, proper, singular NNP Noun, proper, plural NNPS Noun, adverbial NN, NNP, RB Noun, plural from post-determiner NNS Pronoun, nominal (indefinite) NN Pronoun, personal, subject PRP Pronoun, personal, subject, 3SG PRP Pronoun, personal, object PRP Pronoun, reflexive PRP Pronoun, reflexive, plural PRP Pronoun, question, subject WP Pronoun, question, object WP Pronoun, question, existential there EX Verb. base present form (not infinitive) VBP Verb, infinitive VB Verb, past tense VBD Verb, present participle VBG Verb, past/passive participle VBN Verb, present 3SG -s form VBZ Verb, auxilliary do, base VBP Verb, auxilliary do, infinitive VB Verb, auxilliary do, past VBD Verb, auxilliary do, present part. VBG Verb, auxilliary do, past part. VBN Verb, auxilliary do, present 3SG VBZ Verb, auxilliary have, base VBP Verb, auxilliary have, infinitive VB Verb, auxilliary have, past VBD Verb, auxilliary have, present part. VBG Verb, auxilliary have, past part. VBN Verb, auxilliary have, present 3SG VBZ Verb, auxilliary be, infinitive VB Verb, auxilliary be, past VBD Verb, auxilliary be, past, 3SG VBD Verb, auxilliary be, present part. VBG Verb, auxilliary be, past part. VBN Verb, auxilliary be, present, 3SG VBZ Verb, auxilliary be, present, 1SG VBP Verb, auxilliary be, present VBP Verb, modal MD Infinitive marker TO Preposition, to TO Preposition, IN Preposition, of IN Possessive POS Interjection (or other isolate) UH Punctuation, sentence ender . Punctuation, semicolon : Punctuation, colon or ellipsis : Punctuation, comma , Punctuation, dash - Punctuation, dollar sign $ Punctuation, left bracket ( Punctuation, right bracket ) Punctuation, left quotation `` Punctuation, right quotation '' Foreign words (not in English lexicon) FW Symbol SYM Symbol, alphabetical Symbol, list item LS URL or email address ?? Emoticon ?? Online discourse marker ?? Possessive nominal ?? Possessive proper noun ?? Nominal combined with verbal ?? Proper noun combined with verbal ?? Miscellaneous function word combined with verbal ??