CAVEAT: Make sure the page owner allows crawling the content for scientific purposes (check the Terms of Service and robots.txt).
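Such a check can even be automated; here is a minimal sketch using Python 2's robotparser module (the URL below is one of the schedule pages crawled later):

import robotparser

# Ask robots.txt whether a generic crawler may fetch the schedule pages
rp = robotparser.RobotFileParser()
rp.set_url("http://strataconf.com/robots.txt")
rp.read()
print rp.can_fetch("*", "http://strataconf.com/strata2014/public/schedule/full/public")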
The Strata Conference is one of the most important conferences for all things Big Data, Hadoop, and Data Science. This notebook collects and analyzes the talk abstracts from the 2011-2014 conferences.
from IPython.display import HTML
HTML('<iframe src="http://strataconf.com/strata2014/public/schedule/grid/2014-02-12?schedule=public" width=100% height=350></iframe>')
from bs4 import BeautifulSoup
import urllib2

# List of conference schedule pages (keys are strings to match the JSON backup used below)
urls = {"2011": "http://strataconf.com/strata2011/public/schedule/full",
        "2012": "http://strataconf.com/strata2012/public/schedule/full/public",
        "2013": "http://strataconf.com/strata2013/public/schedule/full/public",
        "2014": "http://strataconf.com/strata2014/public/schedule/full/public"}

links = {}

# Collect the links to the talk abstracts
for u in urls:
    raw = urllib2.urlopen(urls[u]).read()
    soup = BeautifulSoup(raw)
    yearlinks = set([l.get("href") for l in soup.find_all("a")])
    yearlinks = [l for l in yearlinks if l is not None]
    yearlinks = [l for l in yearlinks if '/detail' in l]
    yearlinks = [l.replace("http://strataconf.com", "") for l in yearlinks]
    links[u] = yearlinks
abstracts = {}

# Fetch each detail page and extract the session description
for year in links:
    for l in links[year]:
        raw = urllib2.urlopen("http://www.strataconf.com" + l).read()
        soup = BeautifulSoup(raw)
        desc = soup.find("div", class_="en_session_description description")
        if desc is None:  # some detail pages carry no abstract
            continue
        if year in abstracts:
            abstracts[year].append(desc.get_text())
        else:
            abstracts[year] = [desc.get_text()]
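If the crawl succeeded, the abstracts can be saved locally so that later runs can skip the crawling step; presumably this is how the strata_abstracts.json backup loaded below was produced:

import json

# Persist the crawled abstracts as a local JSON backup
with open('strata_abstracts.json', 'w') as f:
    json.dump(abstracts, f)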
import json

# Load the data from the local backup (if you don't want to crawl the data)
with open('strata_abstracts.json') as f:
    abstracts = json.load(f)
import nltk

stop = nltk.corpus.stopwords.words('english')

text = {}
words = {}
for year in abstracts:
    raw = " ".join(abstracts[year])
    tokens = nltk.WordPunctTokenizer().tokenize(raw)
    text[year] = nltk.Text(tokens)
    words[year] = [w.lower() for w in text[year]]
    # remove English stopwords
    words[year] = [w for w in words[year] if w not in stop]
    # remove single-character punctuation tokens ("in" on a string tests substring membership)
    words[year] = [w for w in words[year] if w not in u'%,-:()$\\/;?.’–“”']
    # remove tokenizer artifacts left over from contractions, URLs and abbreviations
    words[year] = [w for w in words[year] if w not in ["ll", "II", "http", "://", "e", "g", "2", "0"]]
text["2012"]
for year in text:
    print year
    text[year].collocations()
    print
numwords = {}
uniwords = {}
for year in text:
    numwords[year] = len(text[year])
    uniwords[year] = len(set(text[year]))
print numwords
print uniwords
import pandas as pd

freq_table = pd.DataFrame()
for year in words:
    fd = nltk.FreqDist(words[year])
    if len(freq_table) == 0:
        freq_table = pd.DataFrame(fd.items(), columns=["Word", "Freq_" + year])
    else:
        freq_table = freq_table.merge(pd.DataFrame(fd.items(), columns=["Word", "Freq_" + year]))
print freq_table[:10]
for year in numwords:
    freq_table["Perc_" + year] = 100.0 * freq_table["Freq_" + year] / numwords[year]
for year in ["2012", "2013", "2014"]:
print year
freq_table["Growth_" + year] = 100.0 * freq_table["Perc_" + year] / freq_table["Perc_" + str(int(year)-1)]
tb = freq_table[freq_table['Perc_' + str(year)] >= 0.08].sort(columns="Growth_" + str(year), ascending=False)[["Word", "Freq_" + str(year), "Perc_" + str(year), "Growth_" + str(year)]]
tb.columns = ["Word", "Freq", "Percent", "Index"]
tb.Index = tb['Index'].round(1)
tb.Percent = tb['Percent'].round(4)
print tb[:10]
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

for year in ["2011", "2012", "2013", "2014"]:
    print "Bigrams " + year
    finder = BigramCollocationFinder.from_words(words[year])
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    print pd.DataFrame(scored[:10])
for year in abstracts:
    print "Trigrams " + year
    finder = TrigramCollocationFinder.from_words(text[year])
    scored = finder.score_ngrams(trigram_measures.raw_freq)
    print pd.DataFrame(scored[:10])
import pandas as pd

trending_words = pd.DataFrame()
for year in words:
    fdist = nltk.FreqDist(words[year])
    if len(trending_words) == 0:
        trending_words = pd.DataFrame(fdist.items(), columns=["word", year])
    else:
        trending_words = trending_words.merge(pd.DataFrame(fdist.items(), columns=["word", year]), how="outer")
    # normalize counts to relative frequencies per year
    trending_words[year] = trending_words[year] / float(trending_words[year].sum())
print trending_words[:10]
trending_words["plus12"] = trending_words["2012"] / trending_words["2011"]
trending_words["plus13"] = trending_words["2013"] / trending_words["2012"]
trending_words["plus14"] = trending_words["2014"] / trending_words["2013"]
trending_words = trending_words.fillna(0)
print trending_words[(trending_words["2012"] > 0.001) & (trending_words["2011"] > 0)].sort("plus12", ascending=False)[:10]
print
print trending_words[(trending_words["2013"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus13", ascending=False)[:10]
print
print trending_words[(trending_words["2014"] > 0.0005) & (trending_words["2011"] > 0)].sort("plus14", ascending=False)[:10]
import pandas as pd

result = pd.DataFrame()
ignored_words = nltk.corpus.stopwords.words('english')
for year in words:
    finder = BigramCollocationFinder.from_words(words[year], window_size=2)
    # finder.apply_freq_filter(2)
    # drop bigrams containing stopwords or very short tokens
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    scores = finder.score_ngrams(bigram_measures.raw_freq)
    if len(result) == 0:
        result = pd.DataFrame(scores, columns=["ngram", year])
    else:
        result = result.merge(pd.DataFrame(scores, columns=["ngram", year]))
print result[:10]
result["plus12"] = result["2012"] / result["2011"]
result["plus13"] = result["2013"] / result["2012"]
result["plus14"] = result["2014"] / result["2013"]
print result[result["2014"] > 0.0005].sort("plus14", ascending=False)[:10]
print
print result[result["2013"] > 0.0005].sort("plus13", ascending=False)[:10]
print
print result[result["2012"] > 0.0005].sort("plus12", ascending=False)[:10]
%matplotlib inline
import matplotlib.pyplot as plt
query = [("big", "data"), ("data", "science"), ("real", "time"), ("machine", "learning"), ("social", "media"), ("open", "source")]
query_results = result[result['ngram'].isin(query)][["2011", "2012", "2013", "2014"]].transpose()
query_results.columns = [" ".join(q) for q in query]
print query_results.plot(figsize=(10,5), title="Strata topics")
Latent Dirichlet Allocation (LDA), using the implementation from https://github.com/shuyo/iir/blob/master/lda/lda.py
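lda.py expects a plain-text corpus file rather than the JSON used above; here is a minimal sketch to generate strata_abstracts.txt from the abstracts dictionary, assuming the script treats each line as one document:

# Write one abstract per line (assumption: lda.py reads the corpus line by line)
with open('strata_abstracts.txt', 'w') as f:
    for year in sorted(abstracts):
        for a in abstracts[year]:
            f.write(a.replace("\n", " ").encode("utf-8") + "\n")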
%run lda.py -f strata_abstracts.txt -s --stopwords -k 7
%matplotlib inline
import matplotlib.pyplot as plt
query = ["hadoop", "yarn", "storm"]
query = ["python", "julia", "r", "sas", "stata", "excel"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Programming Languages @ Strata Conferences 2011-2014")
query = ["business", "energy", "advertising", "banking", "health", "politics", "government", "finance", "automotive"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")
query = ["google", "facebook", "yahoo", "linkedin", "microsoft"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Strata topics")
query = ["modern", "machine", "learning"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Topics at Strata Conferences 2011-14")
plt.savefig("Strata_ModernMachineLearning.png")