For the final project, I will look at the follower network of one of the think tank Twitter accounts and perform clustering to find groups of associated accounts. Looking at the clusters, I hope to identify what joins them by performing some NLP tasks on the accounts' profile contents.
The next section of code does not run in the notebook, but is a copy of the crawler code created for this project. It will take a single account, get the first-level followers, and then grab the 'second-level' followers. Those second-level followers are only added if they were nodes in the first level (so we focus on the main account, not other accounts tangentially related).
#import graphlab as gl
import pickle
import twitter
import logging
import time
from collections import defaultdict
### Configure a 'crawler' logger that reports to both the console and a file.
# The logger itself accepts everything (DEBUG); each handler filters at INFO.
logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler()        # console output
fh = logging.FileHandler('crawler.log')  # persistent log file
for handler in (ch, fh):
    handler.setLevel(logging.INFO)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
### Setup signals to make sure API calls only take 60s at most
from functools import wraps
import errno
import os
import signal
class TimeoutError(Exception):
    # Raised by the timeout() decorator's SIGALRM handler when a wrapped
    # call exceeds its time budget.
    # NOTE(review): on Python 3 this shadows the builtin TimeoutError;
    # harmless here since both derive from Exception, but worth renaming
    # if the code is ever ported.
    pass
def timeout(seconds=60, error_message=os.strerror(errno.ETIME)):
    """Decorator factory: abort the wrapped call with TimeoutError after `seconds`.

    Relies on SIGALRM, so it only works on Unix and in the main thread.
    The alarm is always cancelled afterwards, even if the call raises.
    """
    def decorator(func):
        def _on_alarm(signum, frame):
            raise TimeoutError(error_message)

        @wraps(func)
        def wrapper(*args, **kwargs):
            signal.signal(signal.SIGALRM, _on_alarm)
            signal.alarm(seconds)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)  # cancel the pending alarm no matter what
            return result

        return wrapper
    return decorator
@timeout()
def getFollowers(api, follower):
    """Fetch the friend IDs of `follower` using the given API client.

    NOTE: despite the name, this returns *friends* (accounts the user
    follows), via api.GetFriendIDs. The @timeout decorator caps the call
    at 60 seconds.
    """
    return api.GetFriendIDs(follower)
### Twitter API
# Lets create our list of api OAuth parameters
# SECURITY NOTE(review): live OAuth credentials are hardcoded below and were
# committed to the repository. They should be revoked immediately and loaded
# from environment variables or an untracked config file instead.
# Multiple credential sets are kept so the crawler can rotate between them
# when one hits Twitter's rate limits.
API_TOKENS = [
{"consumer_key": 'yp4wi4FASXbsRKa6JxYqzhUlH',
"consumer_secret": 'Wkh1d5ygAOp4Bp65syFzHRN4xQsS8O4FvU3zHWosX8NXCqMpcl',
"access_token_key": '16562593-F6lRFe7iyoQEahezhPmaI64oInHZD0LNpcIbbq7Wy',
"access_token_secret": 'weregYL8n6DI7yZy9pkizIJ78rH2GY02Do9jvpTe7rCey',
"requests_timeout": 60},
{"consumer_key": 'NsNYFG9LtZV2XMyigPaCKVyVz',
"consumer_secret": '4J1vlowybipqXnSrKgLBvmzPmwqx71uHN32noljTgDLS2xQNfI',
"access_token_key": '16562593-NCuQWVnpzcnB55w7VLdoCkdobdUQBRDJKjIPXAksP',
"access_token_secret": 'nX9OksrYQxj0jBXYJTkUjlX5mZh4rZljfVRXtSM3Tjc8c',
"requests_timeout": 60},
{"consumer_key": 'ZcAMGe2MUcnTO9ATCIo563SHN',
"consumer_secret": 'dJAB7mBfoYyx27Yccbmzz98GtNigAA67Ish9Y1NjN2wNznciM1',
"access_token_key": '16562593-AmaoKVLEYL3o8rVUS3b6u4PUbVPTI6BPsyaqCdwxY',
"access_token_secret": '8pjYJCFWTErJlb2WSkLwsYNoptVazQQs95JAvIU8JApUA',
"requests_timeout": 60},
{"consumer_key": 'avZpjObqQN9vue2Y4gu9zIF9X',
"consumer_secret": 'Ka6WCj3fyon5yGgf5YJIIl8nVcLcUh5YT99N58qy8qv4kfaMbc',
"access_token_key": '16562593-VNuGD09Cr29ZlzNCWnV5MOujU7PsexSwfTgfKQNqC',
"access_token_secret": '9P3hB3qDb9zPDFCUhWU16N4CMXPwHacl6HJbCc0EuGj7s',
"requests_timeout": 60},
{"consumer_key": 'sQ9H5NKteroNZSWvIrkSWvXR0',
"consumer_secret": 'lC0ttZKdIZhhJAE1I5RxMxdjpSiADQCVUnHS7LbtfVmI2pz2F2',
"access_token_key": '16562593-4LOk7QkXWD0boF01BmZ6NP2oPtHmDZ1OVJ883aANG',
"access_token_secret": 'JJ85qMqzVowN1KdQ6w4YlhJB9YF9eWbw6SGbxQoU6gvne',
"requests_timeout": 60},
{"consumer_key": 'DHppZ2LG3iYj8vEx7ibRRLN35',
"consumer_secret": 'wdTQeyp7ZNDN7ne40IriRw7Ah1J8cAi2OIlw4MVtgpq5MMKjYE',
"access_token_key": '16562593-WN8zvEWAxVfJPrneMwUjDoVQw0geuLckOOJqFimsC',
"access_token_secret": 'ZgVi2onPB3RPGtRmPBs6QXymIMgXwJHUOQycesp64S0Hp',
"requests_timeout": 60},
{"consumer_key": 'lIgtfdkC2WmN7XAcicrGygQBp',
"consumer_secret": '2D9WIJN2MIPwFpMeIGcP6vWjQC8vvy7G5ZlHMSH1F1CsgWGKfz',
"access_token_key": '16562593-7lhPpeZNNAGoQQJnqcnTtBiGq1O52XMZ4CMeVqXiY',
"access_token_secret": 'WKRBQsr36MMB2EpCcZLr89ik0MSJfPoBORCKu9E1hw96I',
"requests_timeout": 60},
{"consumer_key": '1XFu2urZzoMoC5sadXAjA7IoQ',
"consumer_secret": 'FrJOlHfNLp3M7ejJWiO5k74E9ai6L5EzQJ45HmlsUINbh8qUUi',
"access_token_key": '16562593-Texko6g7VyCwhNUfxBDoJKJl4058hpvQkqAYWRKpi',
"access_token_secret": 'ISZCTvN6bYJVaJ3Z2iidQObTzE2pxkINBLi0WWe9Ab2Zv',
"requests_timeout": 60},
{"consumer_key": 'r8Bvdm6I8QrRPuVzP4VtRYpqd',
"consumer_secret": 'CzA8u8M8nDiDCCrSzCsXpR3SyTGCaLppDWbdTxSg78ZKgtKkhh',
"access_token_key": '16562593-I3l0ZSmfZbMxIQ2NbiiM2eDMA4KNzFmFBeUkWxunR',
"access_token_secret": '9HkILP4kSMF0hgvsB126jpoUzsRXETYMlSM0YSKb2yMJH',
"requests_timeout": 60},
{"consumer_key": 'NmMjfP1Zt3n2VDZ15X7SDGM6G',
"consumer_secret": 'j9JBx7HUbMpcDnFteiIAAgHSoA8idlqQ20A1xbvnMrqMrOHQ1n',
"access_token_key": '16562593-zUNyMUdO9JnSIstmTrqdyHHmX2lpv9NqkQxGC8faP',
"access_token_secret": 'DEeHvLjTXlxNGmqDntXOK0cJCX08cnpg0btoRXWATW3X2',
"requests_timeout": 60}
]
# Build one authenticated API client per credential set so the crawler can
# rotate through them when a rate limit is hit.
apis = [
    twitter.Api(consumer_key=token['consumer_key'],
                consumer_secret=token['consumer_secret'],
                access_token_key=token['access_token_key'],
                access_token_secret=token['access_token_secret'],
                requests_timeout=60)
    for token in API_TOKENS
]
# The account id / screen name we want followers from
# (presumably the @fairmediawatch think-tank account named in the intro —
# the numeric id below should correspond to the same account; verify.)
account_screen_name = 'fairmediawatch'
account_id = '54679731'
# Keep track of nodes connected to account, and all edges we need in the graph
nodes = set()                # first-level follower ids (graph vertices)
edges = defaultdict(set)     # adjacency map: user id -> set of follower ids
# Try to load first level followers from pickle;
# otherwise, generate them from a single API call and save via pickle
try:
    logger.info("Loading followers for %s" % account_screen_name)
    with open("following1", "rb") as f:
        following = pickle.load(f)
except Exception as e:
    logger.info("Failed. Generating followers for %s" % account_screen_name)
    # BUG FIX: the original called `api.GetFriendIDs(...)`, but `api` is not
    # bound until the depth-2 section below — use the first client instead.
    following = apis[0].GetFriendIDs(screen_name=account_screen_name)
    with open("following1", "wb") as f:
        pickle.dump(following, f)
# Try to load the nodes and first level edges from pickle;
# otherwise generate them from the 'following' list and save.
# (Files are now opened via `with` so handles are always closed.)
try:
    logger.info("Loading nodes and edges for depth = 1, for %s" % account_screen_name)
    with open("nodes.follow1.set", "rb") as n:
        nodes = pickle.load(n)
    with open("edges.follow1.dict", "rb") as e:
        edges = pickle.load(e)
except Exception as e:
    logger.info("Failed. Generating nodes and edges for depth = 1, for %s" % account_screen_name)
    # Every first-level follower becomes a vertex, with an edge from the
    # main account to it.
    for follower in following:
        nodes.add(follower)
        edges[account_id].add(follower)
    with open("nodes.follow1.set", "wb") as n:
        pickle.dump(nodes, n)
    with open("edges.follow1.dict", "wb") as e:
        pickle.dump(edges, e)
### Crawling for Depth2
# Start with the first API client; api_idx tracks which one is in use.
api_idx = 0
api = apis[api_idx]
# Some accounts give us issues (either too many followers or no permissions)
blacklist = [74323323, 43532023, 19608297, 25757924, 240369959, 173634807, 17008482, 142143804]
api_updated = False
# Resuming: optionally begin partway through the follower list rather than
# at the start (raises ValueError if the id is not in `following`).
starting_point = 142143804
resume_idx = following.index(starting_point) if starting_point else 0
following_iter = range(resume_idx, len(following))
# Try loading second layer of followers from pickle, otherwise start from
# scratch (i.e. keep the depth-1 `edges` already in memory).
# File handle is now closed deterministically via `with`.
try:
    with open("edges.follow2.dict", "rb") as f:
        edges = pickle.load(f)
    logger.info("Loaded edges.follow2 into memory!")
except Exception as e:
    logger.info("Starting from SCRATCH: did not load edges.follow2 into memory!")
# For each follower of the main account ...
for follower_idx in following_iter:
    follower = following[follower_idx]
    success = False
    # ... check if they are on the blacklist; if so, skip
    if follower in blacklist:
        logger.info("Skipping due to blacklist")
        continue
    # Otherwise, attempt to get list of their followers.
    # Retry until either the call succeeds or it times out; API-limit
    # exceptions rotate to the next credential set instead of aborting.
    followers_depth2_list = []
    while not success:
        try:
            logger.info("Getting followers for follower %s" % follower)
            followers_depth2_list = getFollowers(api, follower)
            success = True
        except TimeoutError as e:
            # If api call takes too long, move on
            logger.info("Timeout after 60s for follower %d" % follower)
            success = True  # technically not a success but setting flag so next loop moves on
            continue
        except Exception as e:
            # IF we get here, then we hit API limits
            logger.info("API Exception %s; api-idx = %d" % (str(e), api_idx))
            # Are we at the begining of api list?
            # IF so, dump edges so far via pickle and sleep.
            # (api_idx only ever grows; once it has wrapped past the end of
            # API_TOKENS and lands on a multiple of the list length, every
            # client has been tried in this cycle, so checkpoint and wait.)
            if api_updated and api_idx % len(API_TOKENS) == 0 and api_idx >= len(API_TOKENS):
                logger.info("Save edges to pickle file for follower = %s" % follower)
                pickle.dump(edges, open("edges.follow2.dict", "wb"))
                logger.info("Sleeping ...")
                time.sleep(60)
                api_updated = False
            # Otherwise, move on to the next api object and try again
            else:
                api_idx += 1
                api = apis[api_idx % len(API_TOKENS)]
                api_updated = True
    # After getting the followers, find the intersection of those followers
    # with those of the first-level followers and add to edge dict
    # (only edges back into the depth-1 neighbourhood are kept).
    if followers_depth2_list:
        logger.info("Adding followers to the graph")
        edges[follower].update(nodes.intersection(followers_depth2_list))
# Write out final list of edges via pickle
# NOTE(review): indentation was lost in this export; this dump may have been
# per-iteration (the log message references `follower`) — confirm intent.
logger.info("Save edges to pickle file for follower = %s" % follower)
pickle.dump(edges, open("edges.follow2.dict", "wb"))
Instead of running the above, let's just load everything via pickle:
import pickle

# Reload the crawl results from disk so the analysis below can run without
# re-crawling. Files are opened with `with` so handles are closed promptly
# (the original leaked three open file objects).
with open("nodes.follow1.set", "rb") as fh:
    nodes = pickle.load(fh)
with open("edges.follow2.dict", "rb") as fh:
    edges = pickle.load(fh)
with open("following1", "rb") as fh:
    following = pickle.load(fh)
First, we generate CSV files so we can load data into GraphLab Create.
# Silence the noisy HTTP-client loggers so notebook output stays readable
import logging
for noisy in ("requests", "urllib3"):
    logging.getLogger(noisy).setLevel(logging.WARNING)
# Import everything we need
import graphlab as gl
# Generate CSVs from the previous crawl so GraphLab can ingest them.
# vertices.csv: one id per node; edges.csv: follower -> followee rows.
# Files are opened with `with` (the original left handles to close manually),
# and dict.items() replaces iteritems() for Python 2/3 compatibility —
# the emitted rows are identical.
with open('vertices.csv', 'w') as f:
    f.write('id\n')
    for node in nodes:
        f.write(str(node) + "\n")
with open('edges.csv', 'w') as f:
    f.write('src,dst,relation\n')
    for node, followers in edges.items():
        for follower in followers:
            f.write('%s,%s,%s\n' % (follower, node, 'follows'))
Next, let us use these CSV files and load them into a graph object called g:
# Load Data
gvertices = gl.SFrame.read_csv('vertices.csv')
gedges = gl.SFrame.read_csv('edges.csv')
# Create graph
g = gl.SGraph()
g = g.add_vertices(vertices=gvertices, vid_field='id')
# Each edge is added in both directions, presumably to approximate an
# undirected graph in GraphLab's directed SGraph — confirm this is intended.
g = g.add_edges(edges=gedges, src_field='src', dst_field='dst')
g = g.add_edges(edges=gedges, src_field='dst', dst_field='src')
[INFO] This non-commercial license of GraphLab Create is assigned to james.quacinella@gmail.comand will expire on January 01, 2038. For commercial licensing options, visit https://dato.com/buy/. [INFO] Start server at: ipc:///tmp/graphlab_server-18863 - Server binary: /usr/local/lib/python2.7/dist-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1437714775.log [INFO] GraphLab Server Version: 1.5.1
PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/vertices.csv PROGRESS: Parsing completed. Parsed 100 lines in 0.024206 secs. ------------------------------------------------------ Inferred types from first line of file as column_type_hints=[int] If parsing fails due to incorrect types, you can correct the inferred type list above and pass it to read_csv in the column_type_hints argument ------------------------------------------------------ PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/vertices.csv PROGRESS: Parsing completed. Parsed 1108 lines in 0.018389 secs. PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/edges.csv PROGRESS: Parsing completed. Parsed 100 lines in 0.114743 secs. ------------------------------------------------------ Inferred types from first line of file as column_type_hints=[int,int,str] If parsing fails due to incorrect types, you can correct the inferred type list above and pass it to read_csv in the column_type_hints argument ------------------------------------------------------ PROGRESS: Finished parsing file /home/james/Development/Masters/IndependentStudy/Final/edges.csv PROGRESS: Parsing completed. Parsed 105006 lines in 0.076969 secs.
Lets try to visualize the graph!
# Visualize graph? (renders via GraphLab Canvas in the local web browser)
gl.canvas.set_target('browser')
g.show(vlabel="id")
Canvas is accessible via web browser at the URL: http://localhost:48677/index.html Opening Canvas in default web browser.
Looks like its too large of a graph to display.
Lets use pagerank to find important nodes in the network:
# Run PageRank over the follower graph to surface the most central accounts,
# then display the top-scoring vertices.
pr = gl.pagerank.create(g)
pr.get('pagerank').topk(column_name='pagerank')
PROGRESS: Counting out degree PROGRESS: Done counting out degree PROGRESS: +-----------+-----------------------+ PROGRESS: | Iteration | L1 change in pagerank | PROGRESS: +-----------+-----------------------+ PROGRESS: | 1 | 617.534 | PROGRESS: | 2 | 135.25 | PROGRESS: | 3 | 30.9247 | PROGRESS: | 4 | 8.64859 | PROGRESS: | 5 | 2.52531 | PROGRESS: | 6 | 0.885368 | PROGRESS: | 7 | 0.323184 | PROGRESS: | 8 | 0.126578 | PROGRESS: | 9 | 0.0503135 | PROGRESS: | 10 | 0.0203128 | PROGRESS: | 11 | 0.00825336 | PROGRESS: +-----------+-----------------------+
__id | pagerank | delta |
---|---|---|
54679731 | 7.15893054698 | 2.08163053328e-05 |
59159771 | 5.73589508502 | 5.1434297017e-06 |
169182727 | 5.68248985863 | 2.4887386834e-05 |
16935292 | 4.98957011223 | 3.37281975513e-05 |
1947301 | 4.39339614539 | 1.60673868965e-06 |
23839835 | 4.36113011846 | 1.93642112549e-05 |
16076032 | 4.34163894719 | 7.78788532063e-06 |
10117892 | 3.96520683672 | 4.10425955666e-06 |
16955991 | 3.84512060914 | 1.25983857791e-05 |
478203018 | 3.40106898918 | 2.55084466252e-05 |
Next we will load the graph data into igraph and perform clustering to find communities.
from igraph import *
# Create empty (undirected) graph
twitter_graph = Graph(directed=False)
# Setup the nodes; skip any non-integer entries that may be in the set
for node in nodes:
    if isinstance(node, int):
        twitter_graph.add_vertex(name=str(node))
# Setup the edges (follower -> followed, both stored by vertex name)
for user in edges:
    for follower in edges[user]:
        try:
            twitter_graph.add_edge(str(follower), str(user))
        except Exception as e:
            # add_edge raises if an endpoint name is unknown; log the pair
            # and stop adding this user's remaining edges
            print user, follower
            print e
            break
# Add the 'ego' edges: connect every first-level follower to the main
# account's vertex (hard-coded id 54679731)
following = pickle.load(open("following1", "rb"))
for node in following:
    twitter_graph.add_edge(str(node), "54679731")
# for v in twitter_graph.vs.select(name_eq="54679731"):
# print v
# for e in twitter_graph.es.select(_source=v.index): print e
# for e in twitter_graph.es.select(_target=533): print e
# for v in twitter_graph.vs:
# if len(twitter_graph.es.select(_source=v.index)) == 0 and len(twitter_graph.es.select(_target=v.index)) == 0:
# print v
igraph.Vertex(<igraph.Graph object at 0x7faa9404f528>,544,{'name': '54679731'})
# Persist the constructed igraph object ...
pickle.dump(twitter_graph, open("twitter_graph", "wb"))
# ... and load the twitter graph back, to prevent running the above again
twitter_graph = pickle.load(open("twitter_graph", "rb"))
# DrL is a force-directed layout suited to large graphs; render to PNG
layout = twitter_graph.layout_drl()
plt1 = plot(twitter_graph, 'graph.drl.png', layout = layout)
from IPython.display import HTML
# Embed the pre-rendered DrL layout image hosted in the project repo
s = """<img src="https://raw.githubusercontent.com/jquacinella/IndependentStudy/master/Final/graph.drl.png" width="500" /></td>"""
h = HTML(s); h
# Try the graphopt force-directed layout as an alternative rendering
layout = twitter_graph.layout("graphopt")
plt2 = plot(twitter_graph, 'graph.graphopt.png', layout = layout)
<igraph.drawing.Plot at 0x7faa8979c250>
# Embed the graphopt rendering from the repo
s = """<img src="https://raw.githubusercontent.com/jquacinella/IndependentStudy/master/Final/graph.graphopt.png" width="500" /></td>"""
h = HTML(s); h
# One more attempt: the LGL (Large Graph Layout) algorithm
layout = twitter_graph.layout("lgl")
plt2 = plot(twitter_graph, 'graph.lgl.png', layout = layout)
# Embed the LGL rendering from the repo
s = """<img src="https://raw.githubusercontent.com/jquacinella/IndependentStudy/master/Final/graph.lgl.png" width="500" /></td>"""
h = HTML(s); h
Lets trim down the graph to only large nodes:
# https://lists.nongnu.org/archive/html/igraph-help/2012-11/msg00047.html
twitter_graph2 = twitter_graph.copy()
nodes = twitter_graph2.vs(_degree_lt=200)
twitter_graph2.es.select(_within=nodes).delete()
twitter_graph2.vs(_degree_lt=200).delete()
layout = twitter_graph2.layout_drl()
plt1 = plot(twitter_graph2, 'graph2.drl.png', layout = layout)
s = """<img src="https://raw.githubusercontent.com/jquacinella/IndependentStudy/master/Final/graph2.drl.png" width="500" /></td>"""
h = HTML(s); h
First, let's run the walktrap community algorithm, which seems to produce a hierarchical clustering:
# Walktrap community detection (random-walk based, hierarchical result)
wc = twitter_graph.community_walktrap()
plot(wc, 'cluster.walktrap.png', bbox=(3000,3000))
<igraph.drawing.Plot at 0x7faa74930690>
# Embed the walktrap rendering from the repo
s = """<img src="https://raw.githubusercontent.com/jquacinella/IndependentStudy/master/Final/cluster.walktrap.png" width="900" /></td>"""
h = HTML(s); h
# Leading-eigenvector community detection on the full graph
eigen = twitter_graph.community_leading_eigenvector()
plot(eigen, 'cluster.eigen.test.png', mark_groups=True, bbox=(5000,5000))
<igraph.drawing.Plot at 0x7fb94c6fe610>
# Embed the eigenvector-clustering rendering (hosted on S3)
s = """<img src="https://s3.amazonaws.com/jqmasters/cluster.eigen.jpg" width="900" /></td>"""
h = HTML(s); h
Still messy, lets try doing the same thing but on the smaller graph:
# Repeat leading-eigenvector clustering on the trimmed (high-degree) graph
eigen2 = twitter_graph2.community_leading_eigenvector()
plot(eigen2, 'cluster2.eigen.png', mark_groups=True, bbox=(5000,5000))
# Embed the trimmed-graph clustering rendering (hosted on S3)
s = """<img src="https://s3.amazonaws.com/jqmasters/cluster2.eigen.jpg" width="900" /></td>"""
h = HTML(s); h
Now that we have the graph communities, lets crawl twitter for the bios on each user. I will not reproduce the code, as it is mostly a copy of the above twitter crawl, using a different API call. Check biocrawl.py in the repo. We'll load the data from pickle:
# Load the previously-crawled user bios (see biocrawl.py); `with` closes the
# file handle the original leaked.
with open("bios", "rb") as fh:
    bios = pickle.load(fh)
from collections import defaultdict
# Build one synthetic "document" per cluster: the concatenation of the bios
# of every member account (eigen.membership maps vertex index -> cluster id).
documents = defaultdict(str)
# enumerate() replaces the original zip(range(len(...)), ...) idiom
for v_idx, cluster in enumerate(eigen.membership):
    twitter_id = int(twitter_graph.vs[v_idx].attributes()['name'])
    if twitter_id in bios:
        documents[cluster] += "\n%s" % bios[twitter_id]
Lets look at the important nodes in the network and their associated bios:
import prettytable
# One pretty-table per detected community; cluster ids are assumed to be
# 0..k-1 so they can double as indexes into `pts` below — confirm.
pts = []
for cluster in set(eigen.membership):
    # Create a pretty table of tweet contents and any expanded urls
    pt = prettytable.PrettyTable(["Rank", "Bio", "Cluster ID"])
    pt.align["Bio"] = "l"  # Left align bio
    pt.max_width = 60
    pt.padding_width = 1  # One space between column edges and contents (default)
    pts.append(pt)
# Loop thru top 100 page-rank'ed nodes, routing each account's bio into the
# table for the cluster it belongs to
x = pr.get('pagerank')
for row in x.topk(column_name='pagerank', k=100):
    if row['__id'] in bios:
        # Map the GraphLab vertex id back to the igraph vertex, then to its cluster
        vidx = twitter_graph.vs.select(name_eq=str(row['__id']))[0].index
        clusterid = eigen.membership[vidx]
        pts[clusterid].add_row([row['pagerank'], bios[row['__id']].split("\n")[0], clusterid ])
# Lets see the results!
for pt in pts:
    print pt
+---------------+--------------------------------------------------------------+------------+ | Rank | Bio | Cluster ID | +---------------+--------------------------------------------------------------+------------+ | 4.39339614539 | Instigating progress since 1865 | 0 | | 3.84512060914 | Providing fearless political journalism and cultural | 0 | | | analysis since the dawn of the digital era. We're also at | | | | http://t.co/arAhFcxlTr | | | 3.30740490311 | Investigative journalist, blogger at Clear it With Sidney at | 0 | | | the Hillman Foundation and Duly Noted at In These Times. Co- | | | | Host of @PointofInquiry. #binders | | | 3.2642239473 | Media Matters for America is the nation's premier | 0 | | | progressive media watchdog, research and information center. | | | 3.24584856758 | AlterNet is a progressive news magazine and online community | 0 | | | http://t.co/f4LPGamU | | | 3.05310397834 | Investigative journalism, politics, chart-tastic, and | 0 | | | sometimes sarcastic. We're the nonprofit news organization | | | | that brought you the 47 percent video. | | | 2.63898887362 | The latest political news from The Huffington Post's | 0 | | | politics team. | | | 2.46692594602 | Dad/husband. @CNN. Co-founded @RebuildDream, @GreenForAll, | 0 | | | @ColorOfChange, @EllaBakerCenter, @YesWeCode & #cut50. | | | | Wrote: GreenCollar Economy & Rebuild The Dream | | | 2.38625057759 | Bloomberg @BW reporter, covering politics/ policy/ labor. | 0 | | | Send me your tips (and complaints): jeidelson at bloomberg | | | | dot net [Usual disclaimers] | | | 2.3499563445 | Adele M. Stan is a columnist at The American Prospect, and | 0 | | | editor of Clarion, the newspaper of Professional Staff | | | | Congress/CUNY. She also plays the ukulele. | | | 2.34208284562 | The official Twitter of http://t.co/HJOFeYodXw | 0 | | 2.25057081559 | The official Twitter of @msnbc's Melissa Harris-Perry. 
| 0 | | | Exploring politics, culture, art, and community beyond the | | | | beltway every Saturday and Sunday, 10a-12p ET. | | | 2.19802709748 | labor movement, left politics, superhero comics, rock & roll | 0 | | | & online organizing. | | | 2.19673751375 | Host of The Katie Halper Show on WBAI, lefty comedian / | 0 | | | blogger at Salon, Vice, The Nation, Feministing, Raw Story, | | | | Alternet, Comedy Central & more / filmmaker | | | 2.17501663699 | Reporter/enviro editor at @HuffPostPol and VP of membership | 0 | | | at @sejorg. Fan of gravity, fermentation, and the serial | | | | comma. | | | 2.11935441039 | Writer/Producer/Comic. Co creator of The Daily Show. Author: | 0 | | | Lizz Free Or Die. Exposer of crackpots at | | | | http://t.co/COQwCZPcmF and http://t.co/WhllLpegoQ | | | 2.09854384862 | Analyst, advocate, writer, mom. Feminist. Thinker & doer. | 0 | | | Realist & dreamer. Progressive but not predictable. Editor- | | | | in-Chief, RH RealityCheck. UW Madison alum | | | 2.06228546594 | Executive Editor and Columnist, The Nation | 0 | | 2.05351480712 | Columnist @GuardianUS | Feminist author | Pasta enthusiast, | 0 | | | native NYer | My books http://t.co/Ct9Ck2TTqb | Eat Me | | | | http://t.co/qkg3oItKfw | | | 2.03231408258 | Author of THE TEACHER WARS. Staff writer @MarshallProj. | 0 | | | http://t.co/NA3HloQjwN | | | 2.03109035963 | Writer. Lawyer. Eater. Formerly @Cosmopolitan senior | 0 | | | political writer, @GuardianUS columnist, @Feministe blogger. | | | 2.02762932794 | Wake Forest University Professor, Director @AJCCenter, | 0 | | | Executive Director @WakeEngaged, MSNBC Host of @MHPShow, | | | | contributor to @TheNation & @EssenceMag | | | 2.0121987956 | National Editor @buzzfeednews. PGP: http://t.co/xGj7z2Ljki | 0 | | | adam.serwer@buzzfeed.com https://t.co/zl6RcFyMTN | | | 1.99455894381 | Senior Editor, @tnr. Host of @IntersectionTNR, a new podcast | 0 | | | about race, gender, and all the ways we identify. 
| | +---------------+--------------------------------------------------------------+------------+ +---------------+--------------------------------------------------------------+------------+ | Rank | Bio | Cluster ID | +---------------+--------------------------------------------------------------+------------+ | 4.98957011223 | Independent, Daily Global News Hour Anchored by Amy Goodman | 1 | | | & Juan González. Stream Live 8am ET http://t.co/SL25z1kZE5. | | | | Support Independent Media - Donate Today | | | 3.96520683672 | Investigative reporting, political commentary, cultural | 1 | | | coverage, activism, interviews, poetry, and humor since | | | | 1909. | | | 3.04835708177 | YES! Magazine's award-winning journalism reframes the | 1 | | | biggest problems of our time in terms of their solutions. | | | | Independent, nonprofit, reader-supported. | | | 2.90741081734 | Drilling beneath the headlines. Follow us for provocative | 1 | | | and insightful news, features and analysis. | | | 2.89312248828 | Occupying Wall Street since Sep 17, 2011. Standing with the | 1 | | | global #Occupy movement. About our team: | | | | http://t.co/7SfBMRuTjZ #OWS | | | 2.66065430813 | Monthly news magazine committed to informing and analyzing | 1 | | | movements for social, environmental and economic justice. | | | | Founded in 1976. | | | 2.47763903414 | Pursuing stories with moral force. Curating your best | 1 | | | #muckreads. Tweets by @terryparrisjr + @amzam. Send tips | | | | securely: https://t.co/JWIupK6Wrl | | | 2.36358932648 | Host,The Laura Flanders Show on @GRITtv, radio commentator; | 1 | | | author, BUSHWOMEN, BLUE GRIT; Editor, At the Tea Party... | | | 2.18561738802 | Colorlines is a daily news site where race matters, | 1 | | | featuring award-winning investigative reporting and news | | | | analysis. | | | 2.177018398 | TMC is a North American network of leading independent media | 1 | | | outlets. Follow us for news & nuance you can't find anywhere | | | | else! 
Tweets by @jgksf + @manolialive | | | 2.06230444459 | Truthout is dedicated to providing independent news & | 1 | | | commentary. We hope to inspire the direct action necessary | | | | to save the planet & humanity. Official Tweets. | | | 2.00360103986 | progressive, bold, 100% independent, journalism and | 1 | | | advocacy, tenth anniversary online | | +---------------+--------------------------------------------------------------+------------+ +---------------+--------------------------------------------------------------+------------+ | Rank | Bio | Cluster ID | +---------------+--------------------------------------------------------------+------------+ | 5.73589508502 | The Nation magazine Editor and Publisher | 2 | | 3.325182719 | Washington editor @the_intercept, a First Look Media | 2 | | | publication. froomkin@theintercept.com How to leak to me: | | | | http://t.co/md5GQRJby1 | | | 3.05833410529 | Host of All In with Chris Hayes on MSNBC, Weeknights at 8pm. | 2 | | | Editor at Large at The Nation. Cubs fan. | | | 2.99593940177 | Covering national politics for @washingtonpost. Finishing a | 2 | | | book about progressive rock (W.W. Norton). | | | | daveweigel@gmail.com, 302-507-6806. | | | 2 | | 2.76573594395 | Moving news forward. Editor-In-Chief @JuddLegum | 2 | | 2.67779981051 | Editor-in-chief, http://t.co/5gESirESRH. Policy analyst at | 2 | | | MSNBC. Hater of filibuster. Lover of charts. Come work | | | | @Voxdotcom! http://t.co/VhALOi3yKC | | | 2.6711824588 | Senior Political Reporter and Politics Managing Editor, | 2 | | | HuffPost. 
aterkel at huffingtonpost dot com Sign up for my | | | | newsletter: https://t.co/tM0sM6PgOR | | | 2.64430685492 | Photojournalist covering media, culture & politics I also | 2 | | | write (mostly here lately) Also @tigerbeat on instagram | | | | Email srhodes at gmail | | | 2.55744476724 | I teach journalism at NYU, direct the Studio 20 program | 2 | | | there, critique the press and try to understand digital | | | | logic. I also advise media companies sometimes. | | | 2.52993123165 | http://t.co/3NqFkIfQya: the Aggressive Progressives since | 2 | | | 2000. 2 million strong and growing. Yes We Will! | | | 2.45535873698 | Editor-in-Chief, FiveThirtyEight. Author, The Signal and the | 2 | | | Noise (http://t.co/9mLliQYI8N). Sports/politics/food geek. | | | 2.43432862068 | CNN Anchor and Chief Washington Correspondent. Dissecting my | 2 | | | tweets with Talmudic meticulousness will result in wrong | | | | conclusions. RTs do not = endorsement. | | | 2.43060658092 | Making sense of what matters, tweets from ‘Moyers & Company’ | 2 | | | producers and Bill Moyers. Keep track of the corrupting | | | | influence of $ on politics. | | | 2.39529425002 | A blog about politics, politics, and politics | 2 | | 2.35927440968 | Author of a dozen books. Film producer. Ed. of Editor & | 2 | | | Publisher and Crawdaddy. Daily blogger. Next book optioned | | | | for Paul Greengrass flick. | | | 2.34320763082 | DC editor of Mother Jones, MSNBC analyst & author of the new | 2 | | | book, SHOWDOWN: The Inside Story of How Obama Fought Back | | | | Against Boehner, Cantor & the Tea Party | | | 2.33831887877 | Editor in Chief, @YahooPolitics, a @YahooNews production. | 2 | | | Politics, media, breaking. | | | 2.27046419486 | Breaking news and analysis from the TPM team. | 2 | | 2.27025295481 | CNN's senior media correspondent and host of @CNNReliable. | 2 | | | Formerly @nytimes, @tvnewser and Top of the Morning. Email: | | | | bstelter@gmail.com | | | 2.25702219683 | Nobel laureate. 
Op-Ed columnist, @nytopinion. Author, “The | 2 | | | Return of Depression Economics,” “The Great Unraveling,” | | | | “The Age of Diminished Expectations” + more. | | | 2.17156961195 | Investigative Journalist. @the_intercept | 2 | | | lee.fang@theintercept.com | | | 2.16913235691 | NY Times columnist, co-author of Half the Sky & A Path | 2 | | | Appears, http://t.co/bcxQaJYCMg Newsletter: | | | | http://t.co/EYhBhaKPv1 | | | 2.14931459631 | The New Yorker is a weekly magazine with a mix of reporting | 2 | | | of politics and culture, humor and cartoons, fiction and | | | | poetry, and reviews and criticism. | | | 2.12878229972 | Your favorite national security reporter's favorite national | 2 | | | security reporter. Bette's dad. | | | | spencer.ackerman@theguardian.com Public key: | | | | http://t.co/hRo2CKhJ6Q | | | 2.12198501232 | Monitoring the press, tracking the evolving media business & | 2 | | | encouraging excellence in journalism since 1961. | | | 2.09821142588 | Founder of Daily Kos, Co-founder Vox Media | 2 | | 2.04895071097 | DFH/blogger/humanoid | 2 | | 2.03705055969 | A little of this, a little of that. | 2 | | 2.01207242886 | Website of the Center for Responsive Politics, the most | 2 | | | comprehensive, nonpartisan money-in-politics resource | | | | around. 
Get the must-reads: http://t.co/3722t5iaZH | | +---------------+--------------------------------------------------------------+------------+ +---------------+--------------------------------------------------------------+------------+ | Rank | Bio | Cluster ID | +---------------+--------------------------------------------------------------+------------+ | 3 | | 4.36113011846 | independent journalist, co founder of @the_intercept PGP | 3 | | | key/contact: https://t.co/lnq46VuHN0 | | | 4.34163894719 | Journalist with @The_Intercept - author, No Place to Hide - | 3 | | | dog/animal fanatic - email/PGP public key | | | | (https://t.co/uJnK90oulZ) | | | 3.40106898918 | Doing communications at @ncacensorship. Many years at | 3 | | | @fairmediawatch. Will get better at surfing someday. | | | 3.37780426034 | Journalist focused on prisons & harsh sentencing. More fun | 3 | | | than I sound. | | | 3.15649462152 | they say I'm polarizing | 3 | | 2.93662754386 | Senior Writer, http://t.co/UX4ClyaE8E Author, The 51 Day | 3 | | | War: Ruin and Resistance in Gaza http://t.co/faFdf2BdZ3 | | | 2.83537931907 | Do @accuracy (news releases), @dcstakeout (questioning | 3 | | | politicos), @votepact (left & right pairing up) and | | | | @xposefacts (whistleblowers). Also, artsy. | | | 2.71412498047 | Author of When the World Outlawed War, War Is A Lie and | 3 | | | Daybreak: Undoing the Imperial Presidency and Forming a More | | | | Perfect Union. | | | 2.69338887562 | @thinkprogress, @unitedrepublic, and @boldprogressive in my | 3 | | | past, @Alternet in my present. Love cats, the South, and | | | | cheesecake | | | 2.6779919512 | We open governments. | 3 | | 2.67055757303 | Abundant tweets about civil liberties and national security, | 3 | | | football, Beer Mecca, and other craic. | | | 2.61628118821 | The Center for Constitutional Rights is dedicated to | 3 | | | advancing and protecting the rights guaranteed by the U.S. | | | | Constitution and the UDHR. 
| | | 2.56572209168 | @IBTimes Senior Editor, Investigations. Also: Denverite, | 3 | | | vegetarian, author, newspaper columnist, real guy | | | | represented by the character on ABC's The Goldbergs | | | 2.49664644121 | Journalist and author of THE DIVIDE, GRIFTOPIA and THE GREAT | 3 | | | DERANGEMENT | | | 2.4337795872 | HRW provides timely information about #humanrights crises in | 3 | | | 90+ countries. Curated by @jimmurphysf & @astroehlein Staff | | | | list: https://t.co/wBw0SILvlQ | | | 2.32695707327 | Co-host of @CitizenRadio. Independent journalist. I've | 3 | | | written for places. Author of the new @CitizenRadio book | | | | #NEWSFAIL. Order: http://t.co/OenJarCjeU | | | 2.31465869927 | @AJAM columnist & author, The Passion of Chelsea Manning: | 3 | | | The Story behind the Wikileaks Whistleblower. | | | 2.30283084457 | Journalist who covers dissent, whistleblowing, secrecy, | 3 | | | police, spying, etc. Co-host of Unauthorized Disclosure | | | | (@UnauthorizedDis) podcast. Outside agitator. | | | 2.25773320367 | I write think pieces on twitter. | 3 | | 2.22078808059 | Independent journalist. Objectivity is bullshit. | 3 | | 2.162909011 | Communications professional. Amateur expertician. Vellichor | 3 | | | sufferer. Tsundoku artist. | | | 2.14320488526 | Journalist + musician + digger + dad. Author, SPIES FOR | 3 | | | HIRE. Covering war + biz @TheNation. Raised in Japan & South | | | | Korea. Honorary citizen of Kwangju. | | | 2.13987832419 | Filmmaker ('Fahrenheit 9/11'), author ('Stupid White Men'), | 3 | | | citizen ('United States of America'). | | | 2.12001844581 | Publisher, @truthout. Opinions my own. I don't hate you, but | 3 | | | I hate to critique/overrate you. | | | 2.07655045491 | Defending the American soldier imprisoned for revealing the | 3 | | | truth about war crimes and illegal foreign policies. | | | 2.06283984859 | Investigative reporter @vicenews. FOIA terrorist. Band | 3 | | | Tshirt hoarder. 
Author: News Junkie, a memoir, and The Abu | | | | Zubaydah Diaries. PGP: http://t.co/i2X8fWclVf | | | 2.05084546198 | Author, Political Activist, Columnist | 3 | | 2.03496812847 | Journalist and radio host specializing in economics & | 3 | | | politics | | | 2.01309910723 | Executive director, @FreedomofPress. Columnist, @GuardianUS | 3 | | | and @CJR. Remote operator, @Drones. [Views here are my own.] | | | 2.00461660532 | independent journalist. Democracy Now! correspondent. Nation | 3 | | | Institute fellow. | | | 2.00000440304 | Radio host for KPFK in L.A., Liberty Radio Network 12-2E. | 3 | | | 3,500 interviews since 2003. Married to reporter @larisa_a. | | | | Fan of, but not the lawyer from Harper's. | | | 1.98513659195 | Mondoweiss is a news website devoted to covering American | 3 | | | politics & policy in Israel/Palestine & the broader Middle | | | | East. @Mondowitz does most of the tweeting. | | +---------------+--------------------------------------------------------------+------------+
Looking at the above, I think I can come up with explanations of the clusters just by looking at the important nodes as found by page rank:
Let's do a bigram analysis on the documents created from a combination of user bios and their latest tweets, using a custom stop-word filter and keeping only tokens of length 3 or more:
import re
import string

import nltk
from nltk.collocations import *
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# Custom stopwords: NLTK's standard English list extended with Twitter/URL
# noise tokens ('http', 'rt', ...) that carry no topical signal in bios or
# tweets. (The original list contained 'follow' twice; deduplicated here.)
my_stopwords = stopwords.words('english')
my_stopwords = my_stopwords + [
    'http', 'https', 'bit', 'ly', 'co', 'rt', 'rts', 'com', 'org',
    'dot', 'go', 'via', 'follow', 'us', 'retweet', 'also', 'run',
]
def preProcess(text, stop_words=None):
    '''Lowercase *text*, tokenize on word characters, and drop noise tokens.

    A token is kept only if it is not a stopword, not purely digits, and at
    least 3 characters long.

    Args:
        text: raw string (user bio and/or latest tweet).
        stop_words: optional iterable of words to filter out; defaults to the
            module-level ``my_stopwords`` list, preserving the old behavior.

    Returns:
        A single space-joined string of the surviving tokens.
    '''
    if stop_words is None:
        stop_words = my_stopwords
    # A set makes the per-token membership test O(1) instead of scanning
    # the whole stopword list for every token.
    stop_set = set(stop_words)
    # re.findall(r'\w+', ...) is equivalent to nltk's RegexpTokenizer(r'\w+');
    # re.UNICODE matches the tokenizer's default flags on Python 2.
    tokens = re.findall(r'\w+', text.lower(), re.UNICODE)
    kept = [w for w in tokens
            if w not in stop_set and not w.isdigit() and len(w) > 2]
    return " ".join(kept)
def getBigrams(content, threshold=5):
    '''Score bigram collocations in *content* by raw frequency.

    The text is run through preProcess() first, then tokenized with
    wordpunct_tokenize for the collocation finder.

    Args:
        content: raw text (e.g. concatenated bios/tweets for one cluster).
        threshold: minimum occurrence count for a bigram to be scored.

    Returns:
        List of ((word1, word2), score) pairs, sorted by descending score.
    '''
    tokens = nltk.wordpunct_tokenize(preProcess(content))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(threshold)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    # score_ngrams already yields (bigram, score) pairs; the original built a
    # redundant identity copy before sorting -- sort the pairs directly.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
# Compute the bigram list for each cluster document. Iterate the cluster
# indices in sorted order so list position i really is cluster i -- plain
# dict iteration order is arbitrary on Python 2, which could silently
# mislabel the "Cluster 0..3" table columns below.
bigrams = [getBigrams(documents[clusteridx], threshold=2)
           for clusteridx in sorted(documents)]
# Create a pretty table of tweet contents and any expanded urls
bigram_pt = prettytable.PrettyTable(["Cluster 0", "Cluster 1", "Cluster 2", "Cluster 3"])
bigram_pt.align["Cluster 0"] = "l"
bigram_pt.align["Cluster 1"] = "l"
bigram_pt.align["Cluster 2"] = "l"
bigram_pt.align["Cluster 3"] = "l"
bigram_pt.max_width = 60
bigram_pt.padding_width = 1 # One space between column edges and contents (default)
for idx in range(len(bigrams[0])):
bigram_pt.add_row([" ".join(bigram[idx][0]) for bigram in bigrams])
# Lets see the results!
print bigram_pt
+--------------------------+--------------------------+--------------------------+---------------------------+ | Cluster 0 | Cluster 1 | Cluster 2 | Cluster 3 | +--------------------------+--------------------------+--------------------------+---------------------------+ | executive director | social justice | new york | human rights | | official twitter | new york | donald trump | national security | | senior fellow | non profit | york times | foreign policy | | social justice | media democracy | editor chief | independent journalist | | women rights | sandra bland | staff writer | middle east | | contributing writer | social change | white house | views mine | | editor large | award winning | climate change | civil liberties | | editor nation | center media | new yorker | senior editor | | fast food | fast food | washington post | author new | | human rights | public citizen | climate science | new book | | investigative journalism | public interest | managing editor | award winning | | new york | advocacy organization | news chief | center economic | | planned parenthood | around world | talk show | columnist author | | pop culture | crime charges | times columnist | director center | | reproductive rights | director center | abc news | donald trump | | senior editor | dylann roof | climate hawk | economic policy | | social change | economic justice | daily kos | editor chief | | staff writer | executive director | editor publisher | guardian columnist | | american prospect | food workers | fox news | investigative journalism | | american way | hate crime | hillary clinton | iran deal | | calling women | human rights | huffington post | journalist the_intercept | | cartoonist illustrator | independent media | husband father | managing editor | | committed diversifying | institute policy | iran deal | new york | | community journalists | justice peace | level jobs | policy research | | contributor thenation | media justice | lindsey graham | radio host | | crime 
indictment | news analysis | media critic | sandra bland | | deputy editor | official twitter | media reporter | social justice | | director culture | people history | megyn kelly | treason charges | | diversifying world | policy studies | new book | writer editor | | doctorow last | public radio | news commentary | account tweets | | editor thenation | racial justice | nytopinion author | activist photographer | | essay nation | wall street | politics culture | activist recent | | feminist majority | activist author | politics media | activist writer | | food workers | alec new | pulitzer prize | agenda report | | great doctorow | america populist | views expressed | american politics | | hate crime | american legislative | washington correspondent | americans knew | | hillman foundation | american university | affairs correspondent | analysis current | | huffington post | author diet | amp gov | angeles times | | india clarke | blacklivesmatter protest | analyst author | apology massincarceration | | indictment dylannroof | charleston church | anchor abc | around world | | investigative journalist | children health | anchor chief | associate editor | | iran deal | citizen global | anchor cnn | attack free | | journalists thought | civil liberties | animation one | barrett brown | | last essay | civil rights | anti immigrant | based institute | | leaders committed | community organizer | associate professor | bear expect | | lgbt issues | community radio | author new | bernie sanders | | long way | conspiracy theory | board member | bianca jagger | | looks like | constitutional rights | book pulitzer | bill clinton | | majority foundation | contact press | book reviewer | black agenda | | managing editor | corporate power | box emmy | center constitutional | | media matters | create new | breaking news | clemencies bill | | nation zjuklguchz | criminal justice | breast cancer | climate change | | netroots nation | cultural critic | bureau chief | clinton half | | 
new book | current events | business government | cold war | | news organization | democracy yjbegzsujz | campaign coverage | constitutional rights | | non profit | digital media | cell phone | contact email | | nonprofit news | doesn explain | center author | contributing editor | | organization dedicated | drug policy | chief foreign | contributing writer | | people american | drug war | chief washington | cops well | | political research | editor progressive | chief white | current events | | power media | exchange council | chris hayes | debunks nuclear | | public policy | exec director | city hall | deepa kumar | | race gender | fast track | clean energy | deray sandrabland | | reporting analysis | fighting rights | columnist author | digital rights | | research associates | flanders show | columnist nytopinion | discourse providing | | sidney hillman | former journalist | columnist pulitzer | documented analysis | | statement thejusticedept | former press | communications director | dylann roof | | thejusticedept hate | forward thinking | confinement xj7wtgcdze | editor author | | thenation great | global trade | contributing editor | editor jacobin | | thought leaders | high school | contributor cjr | email pgp | | twitter account | independent nonprofit | correspondent host | enemy within | | united states | investigative reporting | coverage far | events issues | | vice president | journalism media | critics theirandeal | expect make | | views expressed | justice democracy | daily news | free press | | wam nyc | justice media | daily show | fundraising campaign | | women media | justice system | dancing bug | global affairs | | world conversation | kids need | democracy tweets | great work | | writer editor | labor love | dylann roof | hacking team | | writer feminist | last week | emmy nominated | half apology | +--------------------------+--------------------------+--------------------------+---------------------------+
Looking through the bigrams for each cluster seems to support the outline above of the different clusters. Expanding on that outline: