#!/usr/bin/env python
# coding: utf-8
# # Final Project - James Quacinella
#
# For the final project, I will look at the follower network of a think tank's Twitter account and perform clustering to find groups of associated accounts. Looking at the clusters, I hope to identify what joins them by performing some NLP tasks on the accounts' profile contents.
#
# ## Step 1 - Crawl Twitter for Followers
#
# The next section of code does not run in the notebook, but is a copy of the crawler code created for this project. It takes a single account, gets the first-level followers, and then grabs the 'second-level' followers. Those second-level followers are only added if they were nodes in the first level (so we focus on the main account, not other accounts tangentially related).
# In[ ]:
#import graphlab as gl
import pickle
import twitter
import logging
import time
from collections import defaultdict
### Configure the 'crawler' logger with a console handler and a file handler
logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler('crawler.log')
ch = logging.StreamHandler()

# Both handlers emit INFO and above using the same format; the console
# handler is attached first, then the file handler.
for handler in (ch, fh):
    handler.setLevel(logging.INFO)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
### Setup signals to make sure API calls only take 60s at most
from functools import wraps
import errno
import os
import signal
class TimeoutError(Exception):
    """Raised by the `timeout` decorator's alarm handler when a call overruns."""
def timeout(seconds=60, error_message=os.strerror(errno.ETIME)):
    """Build a decorator that aborts the wrapped call after `seconds` seconds.

    The limit is enforced with SIGALRM: an alarm is armed before the call and
    its handler raises `TimeoutError(error_message)`.

    Args:
        seconds: whole number of seconds passed to `signal.alarm`.
        error_message: message carried by the raised `TimeoutError`.

    Returns:
        A decorator; the decorated function returns whatever the wrapped
        function returns and propagates any exception it raises.
    """
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutError(error_message)

        @wraps(func)
        def wrapper(*args, **kwargs):
            # Remember whatever handler was installed so this decorator does
            # not permanently take over SIGALRM for the whole process.
            previous_handler = signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                return func(*args, **kwargs)
            finally:
                # Always disarm the alarm, then put the old handler back.
                signal.alarm(0)
                if previous_handler is None:
                    # None means the old handler was not installed from
                    # Python and cannot be re-installed; fall back to default.
                    previous_handler = signal.SIG_DFL
                signal.signal(signal.SIGALRM, previous_handler)
        return wrapper
    return decorator
@timeout()
def getFollowers(api, follower):
    ''' Return the ID list that `api.GetFriendIDs` reports for `follower`.
    NOTE: despite this function's name, the call made is GetFriendIDs; the
    GetFollowerIDs variant is kept below for reference. The `timeout`
    decorator (default 60s) bounds how long the call may run. '''
    # Alternative call: api.GetFollowerIDs(follower)
    friend_ids = api.GetFriendIDs(follower)
    return friend_ids
### Twitter API
# SECURITY: OAuth consumer/access secrets must not be committed to source.
# They are read from a JSON file instead; any keys that were previously
# embedded in this file should be treated as exposed and revoked.
def load_api_tokens(path=None):
    """Load the list of Twitter OAuth parameter dicts from a JSON file.

    The file must contain a JSON array of objects, each with the keys
    "consumer_key", "consumer_secret", "access_token_key" and
    "access_token_secret" (a "requests_timeout" entry is optional).

    Args:
        path: file to read. When None, the TWITTER_API_TOKENS_FILE
            environment variable is used, falling back to
            'twitter_api_tokens.json' in the working directory.

    Returns:
        The list of credential dicts, or an empty list (with a logged
        warning) when the file cannot be read.

    Raises:
        ValueError: if an entry lacks one of the required keys.
    """
    import json

    if path is None:
        path = os.environ.get('TWITTER_API_TOKENS_FILE', 'twitter_api_tokens.json')

    try:
        with open(path) as token_file:
            tokens = json.load(token_file)
    except IOError as err:
        logging.getLogger('crawler').warning(
            "Could not read Twitter API tokens from %s: %s", path, err)
        return []

    required_keys = ("consumer_key", "consumer_secret",
                     "access_token_key", "access_token_secret")
    for position, entry in enumerate(tokens):
        missing = [key for key in required_keys if key not in entry]
        if missing:
            raise ValueError("API token entry %d in %s is missing %s"
                             % (position, path, ", ".join(missing)))
    return tokens


# List of api OAuth parameter dicts, one per Twitter application
API_TOKENS = load_api_tokens()
# Build one twitter.Api client per credential set in API_TOKENS
apis = [
    twitter.Api(
        consumer_key=credentials['consumer_key'],
        consumer_secret=credentials['consumer_secret'],
        access_token_key=credentials['access_token_key'],
        access_token_secret=credentials['access_token_secret'],
        requests_timeout=60,
    )
    for credentials in API_TOKENS
]
# The account id / screen name whose network is crawled
account_screen_name = 'fairmediawatch'
account_id = '54679731'

# Keep track of nodes connected to account, and all edges we need in the graph
nodes = set()
edges = defaultdict(set)

# Try to load the first-level ID list from its pickle cache;
# otherwise fetch it with a single API call and cache it via pickle.
try:
    logger.info("Loading followers for %s" % account_screen_name)
    with open("following1", "rb") as following_file:
        following = pickle.load(following_file)
except Exception:
    logger.info("Failed. Generating followers for %s" % account_screen_name)
    # `api` is not assigned until the depth-2 section further down, so use
    # the first client from `apis` directly instead of the undefined name.
    following = apis[0].GetFriendIDs(screen_name=account_screen_name)
    with open("following1", "wb") as following_file:
        pickle.dump(following, following_file)
# Try to load the nodes and first level edges from pickle;
# otherwise generate them from the 'following' list and save
try:
    logger.info("Loading nodes and edges for depth = 1, for %s" % account_screen_name)
    with open("nodes.follow1.set", "rb") as nodes_file:
        loaded_nodes = pickle.load(nodes_file)
    with open("edges.follow1.dict", "rb") as edges_file:
        loaded_edges = pickle.load(edges_file)
    # Rebind only once both caches loaded, so a half-failed load cannot
    # leave `nodes` replaced while `edges` is still the fresh default.
    nodes, edges = loaded_nodes, loaded_edges
except Exception:
    logger.info("Failed. Generating nodes and edges for depth = 1, for %s" % account_screen_name)
    # Every first-level ID becomes a node and an edge entry under the account
    nodes.update(following)
    edges[account_id].update(following)
    with open("nodes.follow1.set", "wb") as nodes_file:
        pickle.dump(nodes, nodes_file)
    with open("edges.follow1.dict", "wb") as edges_file:
        pickle.dump(edges, edges_file)
### Crawling for Depth2
# Index the api list, and start from the first api object
api_idx = 0
api = apis[api_idx]

# Some accounts give us issues (either too many followers or no permissions)
blacklist = [74323323, 43532023, 19608297, 25757924, 240369959, 173634807, 17008482, 142143804]
api_updated = False

# It is nice to start from a point in the list, instead of from the beginning
starting_point = 142143804
if starting_point and starting_point in following:
    starting_point_idx = following.index(starting_point)
    following_iter = range(starting_point_idx, len(following))
else:
    if starting_point:
        # The resume id is not in the list; crawl everything rather than crash
        logger.warning("Starting point %s not found; starting from the beginning" % starting_point)
    following_iter = range(len(following))

# Try loading second layer of followers from pickle, otherwise start from scratch
try:
    with open("edges.follow2.dict", "rb") as edges_file:
        edges = pickle.load(edges_file)
    logger.info("Loaded edges.follow2 into memory!")
except Exception:
    logger.info("Starting from SCRATCH: did not load edges.follow2 into memory!")

# Defined up front so the final log line works even if the loop never runs
follower = None

# For each first-level account ...
for follower_idx in following_iter:
    follower = following[follower_idx]
    success = False

    # ... check if they are on the blacklist; if so, skip
    if follower in blacklist:
        logger.info("Skipping due to blacklist")
        continue

    # Otherwise, attempt to get their ID list, retrying until the call
    # succeeds or times out
    followers_depth2_list = []
    while not success:
        try:
            logger.info("Getting followers for follower %s" % follower)
            followers_depth2_list = getFollowers(api, follower)
            success = True
        except TimeoutError:
            # If api call takes too long, move on
            logger.info("Timeout after 60s for follower %d" % follower)
            success = True  # technically not a success but setting flag so next loop moves on
        except Exception as e:
            # If we get here, the original author assumes we hit API limits
            logger.info("API Exception %s; api-idx = %d" % (str(e), api_idx))

            # Have we wrapped around to the beginning of the api list?
            # If so, dump edges so far via pickle and sleep
            if api_updated and api_idx % len(API_TOKENS) == 0 and api_idx >= len(API_TOKENS):
                logger.info("Save edges to pickle file for follower = %s" % follower)
                with open("edges.follow2.dict", "wb") as edges_file:
                    pickle.dump(edges, edges_file)
                logger.info("Sleeping ...")
                time.sleep(60)
                api_updated = False
            # Otherwise, move on to the next api object and try again
            else:
                api_idx += 1
                api = apis[api_idx % len(API_TOKENS)]
                api_updated = True

    # After getting the IDs, keep only those that are also first-level
    # nodes and add them to the edge dict
    if followers_depth2_list:
        logger.info("Adding followers to the graph")
        edges[follower].update(nodes.intersection(followers_depth2_list))

# Write out final list of edges via pickle
logger.info("Save edges to pickle file for follower = %s" % follower)
with open("edges.follow2.dict", "wb") as edges_file:
    pickle.dump(edges, edges_file)
# Instead of running the above, lets just load everything via pickle:
# In[2]:
import pickle
# Reload the crawl results from their pickle files, closing each file
# as soon as it has been read.
with open("nodes.follow1.set", "rb") as nodes_file:
    nodes = pickle.load(nodes_file)
with open("edges.follow2.dict", "rb") as edges_file:
    edges = pickle.load(edges_file)
with open("following1", "rb") as following_file:
    following = pickle.load(following_file)
# ## Step 2 - Generate Graph from Crawl
#
# First, we generate CSV files so we can load data into GraphLab Create.
# In[3]:
# Hide some silly output: raise these two library loggers to WARNING
import logging
for noisy_logger in ("requests", "urllib3"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)
# Import everything we need
import graphlab as gl
# Generate CSVs from the previous crawl
# TODO
# One vertex id per line under an 'id' header
with open('vertices.csv', 'w') as vertices_file:
    vertices_file.write('id\n')
    for node in nodes:
        vertices_file.write(str(node) + "\n")

# One 'src,dst,relation' row per entry in each node's edge set.
# NOTE(review): the crawler fills edges[node] from GetFriendIDs(node), so
# "follower follows node" may be the reverse of the real direction; confirm.
# (The graph built from this file adds every edge in both directions.)
with open('edges.csv', 'w') as edges_file:
    edges_file.write('src,dst,relation\n')
    for node, followers in edges.items():
        for follower in followers:
            edges_file.write('%s,%s,%s\n' % (follower, node, 'follows'))
# Next, let us use these CSV files and load them into a graph object called g:
# In[4]:
# Read the vertex and edge tables from the CSV files
gvertices = gl.SFrame.read_csv('vertices.csv')
gedges = gl.SFrame.read_csv('edges.csv')

# Build the graph: vertices keyed by 'id', then every edge row added
# twice, once as src->dst and once as dst->src
g = (
    gl.SGraph()
    .add_vertices(vertices=gvertices, vid_field='id')
    .add_edges(edges=gedges, src_field='src', dst_field='dst')
    .add_edges(edges=gedges, src_field='dst', dst_field='src')
)
# Lets try to visualize the graph!
# In[28]:
# Visualize graph?
# Point GraphLab Canvas at the browser, then ask the graph to render itself
# with each vertex labelled by its 'id' field.
gl.canvas.set_target('browser')
g.show(vlabel="id")
# Looks like it's too large of a graph to display.
#
# ## Central / Important Nodes
#
# Lets use pagerank to find important nodes in the network:
# In[5]:
# Run GraphLab's pagerank toolkit on the graph; `pr` is reused later when
# building the per-cluster tables.
pr = gl.pagerank.create(g)
# Bare expression: a notebook cell displays its value, a plain script discards it.
pr.get('pagerank').topk(column_name='pagerank')
# ## Import The Graph to iGraph
#
# Next we will load the graph data into igraph and perform clustering to find communities
# In[140]:
from igraph import *
# Create empty undirected graph
twitter_graph = Graph(directed=False)

# Setup the nodes: one vertex per integer id, named by its string form
for node in nodes:
    if isinstance(node, int):
        twitter_graph.add_vertex(name=str(node))

# In[142]:

# Setup the edges
for user in edges:
    for follower in edges[user]:
        try:
            twitter_graph.add_edge(str(follower), str(user))
        except Exception as e:
            # Report the offending pair and give up on the rest of this
            # user's edges (single-argument print works on Python 2 and 3)
            print("%s %s" % (user, follower))
            print(e)
            break

# In[143]:

# Add the 'ego' edges: connect every first-level id to the main account
with open("following1", "rb") as following_file:
    following = pickle.load(following_file)
for node in following:
    twitter_graph.add_edge(str(node), "54679731")
# In[145]:
# for v in twitter_graph.vs.select(name_eq="54679731"):
# print v
# for e in twitter_graph.es.select(_source=v.index): print e
# for e in twitter_graph.es.select(_target=533): print e
# In[102]:
# for v in twitter_graph.vs:
# if len(twitter_graph.es.select(_source=v.index)) == 0 and len(twitter_graph.es.select(_target=v.index)) == 0:
# print v
# In[ ]:
# Cache the assembled graph on disk
with open("twitter_graph", "wb") as graph_file:
    pickle.dump(twitter_graph, graph_file)

# In[19]:

# Load twitter graph from the cache to avoid re-running the cells above
with open("twitter_graph", "rb") as graph_file:
    twitter_graph = pickle.load(graph_file)
# ## Display the Graph
# In[ ]:
# Compute a layout with layout_drl() and render the full graph to 'graph.drl.png'.
layout = twitter_graph.layout_drl()
plt1 = plot(twitter_graph, 'graph.drl.png', layout = layout)
# In[257]:
from IPython.display import HTML
# The HTML payload is an empty string in this export, so nothing is embedded;
# the trailing bare `h` only has an effect as the last expression of a notebook cell.
s = """"""
h = HTML(s); h
# In[214]:
# Same rendering using the "graphopt" layout; `layout` is overwritten.
layout = twitter_graph.layout("graphopt")
plt2 = plot(twitter_graph, 'graph.graphopt.png', layout = layout)
# In[258]:
s = """"""
h = HTML(s); h
# In[227]:
# Same rendering using the "lgl" layout; note `plt2` is rebound here.
layout = twitter_graph.layout("lgl")
plt2 = plot(twitter_graph, 'graph.lgl.png', layout = layout)
# In[260]:
s = """"""
h = HTML(s); h
# Lets trim down the graph to only large nodes:
# In[237]:
# https://lists.nongnu.org/archive/html/igraph-help/2012-11/msg00047.html
# Work on a copy so the full graph stays available for clustering
twitter_graph2 = twitter_graph.copy()

# Select vertices with degree below 200. A dedicated name is used so the
# global `nodes` set of crawled ids (read by the CSV and igraph cells) is
# not overwritten by an igraph vertex sequence.
low_degree_vertices = twitter_graph2.vs(_degree_lt=200)

# First drop the edges running between those vertices, then drop every
# vertex whose degree is (now) below 200
twitter_graph2.es.select(_within=low_degree_vertices).delete()
twitter_graph2.vs(_degree_lt=200).delete()

layout = twitter_graph2.layout_drl()
plt1 = plot(twitter_graph2, 'graph2.drl.png', layout = layout)

# In[261]:

# Empty HTML payload in this export; nothing is embedded
s = """"""
h = HTML(s); h
# ## Clustering
# First, let's run the walktrap community algorithm, which seems to produce a hierarchical clustering:
# In[243]:
# Walktrap community detection on the full graph, plotted to a 3000x3000 image.
wc = twitter_graph.community_walktrap()
plot(wc, 'cluster.walktrap.png', bbox=(3000,3000))
# In[266]:
# Empty HTML payload in this export; nothing is embedded.
s = """"""
h = HTML(s); h
# In[22]:
# Leading-eigenvector clustering on the full graph; `eigen.membership` is
# what the later bio/bigram cells index into.
eigen = twitter_graph.community_leading_eigenvector()
plot(eigen, 'cluster.eigen.test.png', mark_groups=True, bbox=(5000,5000))
# In[270]:
s = """"""
h = HTML(s); h
# Still messy, lets try doing the same thing but on the smaller graph:
# In[238]:
# Same clustering applied to the trimmed graph `twitter_graph2`.
eigen2 = twitter_graph2.community_leading_eigenvector()
plot(eigen2, 'cluster2.eigen.png', mark_groups=True, bbox=(5000,5000))
# In[269]:
s = """"""
h = HTML(s); h
# ## Bios of Twitter Users
#
# Now that we have the graph communities, lets crawl twitter for the bios on each user. I will not reproduce the code, as it is mostly a copy of the above twitter crawl, using a different API call. Check biocrawl.py in the repo. We'll load the data from pickle:
# In[23]:
# Load the crawled bios; the lookups below use integer twitter ids as keys
with open("bios", "rb") as bios_file:
    bios = pickle.load(bios_file)

from collections import defaultdict

# Build one text document per cluster by appending the bio of every member
# vertex that has one. eigen.membership[i] is the cluster of vertex index i.
documents = defaultdict(str)
for v_idx, cluster in enumerate(eigen.membership):
    twitter_id = int(twitter_graph.vs[v_idx].attributes()['name'])
    if twitter_id in bios:
        documents[cluster] += "\n%s" % bios[twitter_id]
# ### Important Nodes from PageRank
#
# Lets look at the important nodes in the network and their associated bios:
# In[61]:
import prettytable
# One table per distinct cluster id; tables are later indexed by cluster id
pts = []
for _ in set(eigen.membership):
    table = prettytable.PrettyTable(["Rank", "Bio", "Cluster ID"])
    table.align["Bio"] = "l"   # Left align bio
    table.max_width = 60
    table.padding_width = 1    # One space between column edges and contents (default)
    pts.append(table)

# Walk the top 100 page-rank'ed nodes and file each one that has a bio
# under its cluster's table (only the first line of the bio is shown)
x = pr.get('pagerank')
for row in x.topk(column_name='pagerank', k=100):
    node_id = row['__id']
    if node_id not in bios:
        continue
    vidx = twitter_graph.vs.select(name_eq=str(node_id))[0].index
    clusterid = eigen.membership[vidx]
    first_bio_line = bios[node_id].split("\n")[0]
    pts[clusterid].add_row([row['pagerank'], first_bio_line, clusterid])

# Lets see the results!
for pt in pts:
    print(pt)
# Looking at the above, I think I can come up with explanations of the clusters just by looking at the important nodes as found by page rank:
#
# - Cluster 0: Media personalities
# - This is a little tougher to gauge, but cluster 0 is more oriented towards editors and personalities, while the others tend to be organizations or institutional accounts
# - Cluster 1: Independent media, 'far-left' media outlets
# - Amy Goodman (democracy now), muckreads, #Occupy, GritTV, Truthout
# - Notice how this is a smaller group than the others
# - Cluster 2: Mainstream Liberal organizations or media personalities
# - HuffPost, NY Times, CNN, MSNBC, Bill Moyers, New Yorker
# - Cluster 3: people / journalists involved with civil liberties, human rights and whistleblowing
# - FOIA, PGP, Whistleblower, secrecy
# ### Bigram Analysis
#
# Let's do a bigram analysis on the documents that were created from a combination of user bios and their latest tweets, using a custom stop word filter and keeping only tokens of length 3 or more:
# In[93]:
import string
import nltk
from nltk.collocations import *
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# NLTK's English stopwords extended with twitter slang and other internet rubbish
my_stopwords = stopwords.words('english') + [
    'http', 'https', 'bit', 'ly', 'co', 'rt', 'rts', 'com', 'org', 'dot',
    'go', 'via', 'follow', 'us', 'follow', 'retweet', 'also', 'run',
]
def preProcess(text):
    """Normalize `text` into a space-joined string of filtered tokens.

    The text is lower-cased and split into word-character runs; a token is
    kept only if it is longer than two characters, is not all digits, and
    is not in `my_stopwords`.

    Args:
        text: the raw document string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build a set once per call so each membership test is O(1) instead of
    # scanning the stopword list for every token.
    stop_words = set(my_stopwords)
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    return " ".join(
        w for w in tokens
        if len(w) > 2 and not w.isdigit() and w not in stop_words
    )
def getBigrams(content, threshold=5):
    """Return (bigram, score) pairs for `content`, highest score first.

    `content` is cleaned with `preProcess`, tokenized, and fed to a
    `BigramCollocationFinder`; `threshold` is passed to the finder's
    frequency filter and bigrams are scored with the raw-frequency measure.
    """
    words = nltk.wordpunct_tokenize(preProcess(content))
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(threshold)

    raw_freq = nltk.collocations.BigramAssocMeasures().raw_freq
    ranked = [(pair, score) for pair, score in finder.score_ngrams(raw_freq)]
    # Stable sort: pairs with equal scores keep the finder's order
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked
# In[94]:
# Score bigrams per cluster in ascending cluster-id order, so that column i
# always matches its header regardless of dict iteration order
cluster_ids = sorted(documents)
bigrams = []
for clusteridx in cluster_ids:
    bigrams.append(getBigrams(documents[clusteridx], threshold=2))

# One left-aligned column per cluster actually present (not a fixed four)
headers = ["Cluster %d" % clusteridx for clusteridx in cluster_ids]
bigram_pt = prettytable.PrettyTable(headers)
for header in headers:
    bigram_pt.align[header] = "l"
bigram_pt.max_width = 60
bigram_pt.padding_width = 1  # One space between column edges and contents (default)

# Every row needs a value in every column, so stop at the shortest list.
# This avoids an IndexError when a cluster has fewer bigrams than the first.
row_count = min(len(cluster_bigrams) for cluster_bigrams in bigrams) if bigrams else 0
for idx in range(row_count):
    bigram_pt.add_row([" ".join(cluster_bigrams[idx][0]) for cluster_bigrams in bigrams])

# Lets see the results!
print(bigram_pt)
# Looking through the bigrams for each cluster seems to help support the outline above of the different clusters. To expand on the outline below:
#
# - Cluster 0: Media personalities
# - executive director, contributing writer , senior fellow, contributing editor
# - Cluster 1: Independent media, 'far-left' media outlets, senior editor, staff writer
# - social justice, media democracy, independent media , media justice, news analysis , digital media, investigative reporting
# - Cluster 2: Mainstream Liberal organizations or media personalities
# - new york times, new yorker, washington post, daily kos, washington correspondent
# - Cluster 3: people / journalists involved with civil liberties, human rights and whistleblowing
# - Bigrams: human rights , national security , foreign policy, civil liberties, digital rights, pgp(!)
#
# ### Future Considerations
#
# - Using TF-IDF to make sure bigrams or tokens that appear in all documents are weighed less
# In[ ]: