#!/usr/bin/env python
# coding: utf-8

# # Final Project - James Quacinella
#
# For the final project, I will look at the follower network of a think tank's Twitter account (@fairmediawatch) and perform clustering to find groups of associated accounts. Looking at the clusters, I hope to identify what joins them by performing some NLP tasks on the accounts' profile contents.
#
# ## Step 1 - Crawl Twitter for Followers
#
# The next section of code does not run in the notebook, but is a copy of the crawler code created for this project. It takes a single account, gets the first-level followers, and then grabs the 'second-level' followers. Those second-level followers are only added if they were nodes in the first level (so we focus on the main account, not on accounts that are only tangentially related).

# In[ ]:

# import graphlab as gl
import pickle
import twitter
import logging
import time
from collections import defaultdict

### Setup a console and file logger
logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)

fh = logging.FileHandler('crawler.log')
fh.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
logger.addHandler(ch)
logger.addHandler(fh)

### Setup signals to make sure API calls only take 60s at most
from functools import wraps
import errno
import os
import signal

class TimeoutError(Exception):
    pass

def timeout(seconds=60, error_message=os.strerror(errno.ETIME)):
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutError(error_message)

        def wrapper(*args, **kwargs):
            signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)
            return result

        return wraps(func)(wrapper)
    return decorator

@timeout()
def getFollowers(api, follower):
    ''' Function that will get a user's list of followers from an api object.

    NOTE: the decorator ensures that this only runs for 60s at most.
    '''
    # return api.GetFollowerIDs(follower)
    return api.GetFriendIDs(follower)
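# As a quick illustration of how the timeout guard behaves (illustrative only, not part of the
# crawler itself), the decorator can wrap any slow callable; `slow_call` is a made-up stand-in
# for a hung API request:
@timeout(seconds=1)
def slow_call():
    time.sleep(3)          # deliberately longer than the 1-second budget
    return "done"

try:
    slow_call()
except TimeoutError:
    print "slow_call() was interrupted by SIGALRM after 1 second"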
### Twitter API

# Let's create our list of API OAuth parameters.
# NOTE: the real OAuth credentials are not included here -- fill in your own keys.
# Ten sets of tokens are used so the crawler can rotate between them when rate limits are hit.
API_TOKENS = [
    {"consumer_key": '<consumer_key>',
     "consumer_secret": '<consumer_secret>',
     "access_token_key": '<access_token_key>',
     "access_token_secret": '<access_token_secret>',
     "requests_timeout": 60},
    # ... nine more token dicts with the same fields ...
]

# Now create a list of twitter API objects
apis = []
for token in API_TOKENS:
    apis.append(
        twitter.Api(consumer_key=token['consumer_key'],
                    consumer_secret=token['consumer_secret'],
                    access_token_key=token['access_token_key'],
                    access_token_secret=token['access_token_secret'],
                    requests_timeout=60))

# The account id / screen name we want followers from
account_screen_name = 'fairmediawatch'
account_id = '54679731'
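# The crawl below rotates through the API objects whenever one of them hits its rate limit.
# As an aside, the same round-robin idea can be written in isolation roughly like this
# (a sketch only -- it is not used by the crawler below, which does its own bookkeeping):
from itertools import cycle

api_pool = cycle(apis)

def call_with_rotation(method_name, *args, **kwargs):
    # Try each API object once before giving up
    for _ in range(len(apis)):
        current = next(api_pool)
        try:
            return getattr(current, method_name)(*args, **kwargs)
        except Exception as e:
            logger.info("Rotating API tokens after: %s" % str(e))
    raise RuntimeError("All API tokens appear to be rate limited")

# Hypothetical usage: call_with_rotation('GetFriendIDs', screen_name=account_screen_name)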
# Keep track of nodes connected to the account, and all edges we need in the graph
nodes = set()
edges = defaultdict(set)

# Try to load first-level followers from pickle;
# otherwise, generate them from a single API call and save via pickle
try:
    logger.info("Loading followers for %s" % account_screen_name)
    f = open("following1", "rb")
    following = pickle.load(f)
except Exception as e:
    logger.info("Failed. Generating followers for %s" % account_screen_name)
    following = apis[0].GetFriendIDs(screen_name=account_screen_name)
    pickle.dump(following, open("following1", "wb"))

# Try to load the nodes and first-level edges from pickle;
# otherwise generate them from the 'following' list and save
try:
    logger.info("Loading nodes and edges for depth = 1, for %s" % account_screen_name)
    n = open("nodes.follow1.set", "rb")
    e = open("edges.follow1.dict", "rb")
    nodes = pickle.load(n)
    edges = pickle.load(e)
except Exception as e:
    logger.info("Failed. Generating nodes and edges for depth = 1, for %s" % account_screen_name)
    for follower in following:
        nodes.add(follower)
        edges[account_id].add(follower)
    pickle.dump(nodes, open("nodes.follow1.set", "wb"))
    pickle.dump(edges, open("edges.follow1.dict", "wb"))

### Crawling for Depth 2

# Index the api list, and start from the first api object
api_idx = 0
api = apis[api_idx]

# Some accounts give us issues (either too many followers or no permissions)
blacklist = [74323323, 43532023, 19608297, 25757924, 240369959,
             173634807, 17008482, 142143804]
api_updated = False

# It is nice to start from a point in the list, instead of from the beginning
starting_point = 142143804
if starting_point:
    starting_point_idx = following.index(starting_point)
    following_iter = range(starting_point_idx, len(following))
else:
    following_iter = range(len(following))

# Try loading the second layer of followers from pickle, otherwise start from scratch
try:
    f = open("edges.follow2.dict", "rb")
    edges = pickle.load(f)
    logger.info("Loaded edges.follow2 into memory!")
except Exception as e:
    logger.info("Starting from SCRATCH: did not load edges.follow2 into memory!")
    pass

# For each follower of the main account ...
for follower_idx in following_iter:
    follower = following[follower_idx]
    success = False

    # ... check if they are on the blacklist; if so, skip
    if follower in blacklist:
        logger.info("Skipping due to blacklist")
        continue

    # Otherwise, attempt to get the list of their followers
    followers_depth2_list = []
    while not success:
        try:
            logger.info("Getting followers for follower %s" % follower)
            followers_depth2_list = getFollowers(api, follower)
            success = True
        except TimeoutError as e:
            # If the api call takes too long, move on
            logger.info("Timeout after 60s for follower %d" % follower)
            success = True  # technically not a success, but setting the flag so the loop moves on
            continue
        except Exception as e:
            # If we get here, then we hit API limits
            logger.info("API Exception %s; api-idx = %d" % (str(e), api_idx))

            # Are we back at the beginning of the api list?
            # If so, dump the edges so far via pickle and sleep
            if api_updated and api_idx % len(API_TOKENS) == 0 and api_idx >= len(API_TOKENS):
                logger.info("Save edges to pickle file for follower = %s" % follower)
                pickle.dump(edges, open("edges.follow2.dict", "wb"))

                logger.info("Sleeping ...")
                time.sleep(60)
                api_updated = False
            # Otherwise, move on to the next api object and try again
            else:
                api_idx += 1
                api = apis[api_idx % len(API_TOKENS)]
                api_updated = True

    # After getting the followers, find the intersection of those followers
    # with the first-level followers and add them to the edge dict
    if followers_depth2_list:
        logger.info("Adding followers to the graph")
        edges[follower].update(nodes.intersection(followers_depth2_list))

# Write out the final list of edges via pickle
logger.info("Save edges to pickle file for follower = %s" % follower)
pickle.dump(edges, open("edges.follow2.dict", "wb"))
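# To make the crawl's bookkeeping concrete: `nodes` is a set of first-level follower IDs, and
# `edges` maps a user ID to the set of first-level IDs that user connects to. A toy example of
# the intersection rule used above (the IDs here are made up):

# In[ ]:

first_level = set([101, 102, 103, 104])     # toy first-level follower IDs (the `nodes` set)
depth2_for_102 = [103, 104, 999, 1000]      # toy result of getFollowers() for user 102

# Only 103 and 104 survive, since 999 and 1000 are not first-level nodes
print first_level.intersection(depth2_for_102)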
# Instead of running the above, let's just load everything via pickle:

# In[2]:

import pickle

n = open("nodes.follow1.set", "rb")
nodes = pickle.load(n)

e = open("edges.follow2.dict", "rb")
edges = pickle.load(e)

f = open("following1", "rb")
following = pickle.load(f)

# ## Step 2 - Generate Graph from Crawl
#
# First, we generate CSV files so we can load the data into GraphLab Create.

# In[3]:

# Hide some noisy HTTP logging output
import logging
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Import everything we need
import graphlab as gl

# Generate CSVs from the previous crawl  # TODO
f = open('vertices.csv', 'w')
f.write('id\n')
for node in nodes:
    f.write(str(node) + "\n")
f.close()

f = open('edges.csv', 'w')
f.write('src,dst,relation\n')
for node, followers in edges.iteritems():
    for follower in followers:
        f.write('%s,%s,%s\n' % (follower, node, 'follows'))
f.close()

# Next, let us use these CSV files and load them into a graph object called g:

# In[4]:

# Load data
gvertices = gl.SFrame.read_csv('vertices.csv')
gedges = gl.SFrame.read_csv('edges.csv')

# Create the graph; edges are added in both directions so 'follows' is treated as undirected
g = gl.SGraph()
g = g.add_vertices(vertices=gvertices, vid_field='id')
g = g.add_edges(edges=gedges, src_field='src', dst_field='dst')
g = g.add_edges(edges=gedges, src_field='dst', dst_field='src')

# Let's try to visualize the graph!

# In[28]:

# Visualize graph?
gl.canvas.set_target('browser')
g.show(vlabel="id")

# Looks like the graph is too large to display.
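# Since the full plot is unreadable, a quicker sanity check is just to look at the size of the
# graph. A small sketch (assuming GraphLab Create's SGraph.summary() is available in this version):

# In[ ]:

summary = g.summary()
print "Vertices:", summary['num_vertices']
print "Edges:   ", summary['num_edges']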
# ## Central / Important Nodes
#
# Let's use PageRank to find important nodes in the network:

# In[5]:

pr = gl.pagerank.create(g)
pr.get('pagerank').topk(column_name='pagerank')

# ## Import the Graph into igraph
#
# Next, we will load the graph data into igraph and perform clustering to find communities.

# In[140]:

from igraph import *

# Create an empty, undirected graph
twitter_graph = Graph(directed=False)

# Set up the nodes
for node in nodes:
    if isinstance(node, int):
        twitter_graph.add_vertex(name=str(node))

# In[142]:

# Set up the edges
for user in edges:
    for follower in edges[user]:
        try:
            twitter_graph.add_edge(str(follower), str(user))
        except Exception as e:
            print user, follower
            print e
            break

# In[143]:

# Add the 'ego' edges from the main account to its first-level followers
following = pickle.load(open("following1", "rb"))
for node in following:
    twitter_graph.add_edge(str(node), "54679731")

# In[145]:

# for v in twitter_graph.vs.select(name_eq="54679731"):
#     print v
#     for e in twitter_graph.es.select(_source=v.index): print e
#     for e in twitter_graph.es.select(_target=533): print e

# In[102]:

# for v in twitter_graph.vs:
#     if len(twitter_graph.es.select(_source=v.index)) == 0 and len(twitter_graph.es.select(_target=v.index)) == 0:
#         print v

# In[ ]:

pickle.dump(twitter_graph, open("twitter_graph", "wb"))

# In[19]:

# Load the Twitter graph to avoid re-running the above
twitter_graph = pickle.load(open("twitter_graph", "rb"))

# ## Display the Graph

# In[ ]:

layout = twitter_graph.layout_drl()
plt1 = plot(twitter_graph, 'graph.drl.png', layout=layout)

# In[257]:

from IPython.display import HTML
s = """<img src='graph.drl.png'/>"""
h = HTML(s); h

# In[214]:

layout = twitter_graph.layout("graphopt")
plt2 = plot(twitter_graph, 'graph.graphopt.png', layout=layout)

# In[258]:

s = """<img src='graph.graphopt.png'/>"""
h = HTML(s); h

# In[227]:

layout = twitter_graph.layout("lgl")
plt2 = plot(twitter_graph, 'graph.lgl.png', layout=layout)

# In[260]:

s = """<img src='graph.lgl.png'/>"""
h = HTML(s); h

# Let's trim the graph down to only the large (high-degree) nodes:

# In[237]:

# https://lists.nongnu.org/archive/html/igraph-help/2012-11/msg00047.html
twitter_graph2 = twitter_graph.copy()
nodes = twitter_graph2.vs(_degree_lt=200)
twitter_graph2.es.select(_within=nodes).delete()
twitter_graph2.vs(_degree_lt=200).delete()

layout = twitter_graph2.layout_drl()
plt1 = plot(twitter_graph2, 'graph2.drl.png', layout=layout)

# In[261]:

s = """<img src='graph2.drl.png'/>"""
h = HTML(s); h

# ## Clustering
#
# First, let's run the walktrap community algorithm, which produces a hierarchical clustering:

# In[243]:

wc = twitter_graph.community_walktrap()
plot(wc, 'cluster.walktrap.png', bbox=(3000, 3000))

# In[266]:

s = """<img src='cluster.walktrap.png'/>"""
h = HTML(s); h

# In[22]:

eigen = twitter_graph.community_leading_eigenvector()
plot(eigen, 'cluster.eigen.test.png', mark_groups=True, bbox=(5000, 5000))

# In[270]:

s = """<img src='cluster.eigen.test.png'/>"""
h = HTML(s); h

# Still messy; let's try the same thing on the smaller, trimmed graph:

# In[238]:

eigen2 = twitter_graph2.community_leading_eigenvector()
plot(eigen2, 'cluster2.eigen.png', mark_groups=True, bbox=(5000, 5000))

# In[269]:

s = """<img src='cluster2.eigen.png'/>"""
h = HTML(s); h
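# Before digging into the clusters, it helps to see how many communities the eigenvector method
# found on the full graph and how large they are. A small exploratory sketch (the exact numbers
# depend on the crawl):

# In[ ]:

print "Number of communities:", len(eigen)
print "Community sizes:      ", eigen.sizes()
print "Modularity:           ", eigen.modularity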
# ## Bios of Twitter Users
#
# Now that we have the graph communities, let's crawl Twitter for the bio of each user. I will not reproduce the code here, as it is mostly a copy of the Twitter crawl above, using a different API call; see biocrawl.py in the repo. We'll load the data from pickle:

# In[23]:

bios = pickle.load(open("bios", "rb"))

from collections import defaultdict

# Concatenate the bios of all users in each cluster into one 'document' per cluster
documents = defaultdict(str)
for v_idx, cluster in zip(range(len(eigen.membership)), eigen.membership):
    twitter_id = int(twitter_graph.vs[v_idx].attributes()['name'])
    if twitter_id in bios:
        documents[cluster] += "\n%s" % bios[twitter_id]

# ### Important Nodes from PageRank
#
# Let's look at the important nodes in the network and their associated bios:

# In[61]:

import prettytable

pts = []
for cluster in set(eigen.membership):
    # Create a pretty table of bios, one table per cluster
    pt = prettytable.PrettyTable(["PageRank", "Bio", "Cluster ID"])
    pt.align["Bio"] = "l"   # Left align bio
    pt.max_width = 60
    pt.padding_width = 1    # One space between column edges and contents (default)
    pts.append(pt)

# Loop through the top 100 page-ranked nodes
x = pr.get('pagerank')
for row in x.topk(column_name='pagerank', k=100):
    if row['__id'] in bios:
        vidx = twitter_graph.vs.select(name_eq=str(row['__id']))[0].index
        clusterid = eigen.membership[vidx]
        pts[clusterid].add_row([row['pagerank'], bios[row['__id']].split("\n")[0], clusterid])

# Let's see the results!
for pt in pts:
    print pt

# Looking at the above, I think I can come up with explanations of the clusters just by looking at the important nodes found by PageRank:
#
# - Cluster 0: Media personalities
#     - This is a little tougher to gauge, but cluster 0 is oriented more towards editors and personalities, while the others tend to be organizations or institutional accounts
# - Cluster 1: Independent media, 'far-left' media outlets
#     - Amy Goodman (Democracy Now), MuckReads, #Occupy, GritTV, Truthout
#     - Notice how this is a smaller group than the others
# - Cluster 2: Mainstream liberal organizations or media personalities
#     - HuffPost, NY Times, CNN, MSNBC, Bill Moyers, New Yorker
# - Cluster 3: People / journalists involved with civil liberties, human rights and whistleblowing
#     - FOIA, PGP, whistleblowers, secrecy

# ### Bigram Analysis
#
# Let's do a bigram analysis on the documents created from each cluster's user bios and latest tweets, using a custom stop-word filter and keeping only tokens of length 3 or more:

# In[93]:

import string
import nltk
from nltk.collocations import *
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Custom stopwords: mostly Twitter slang and other internet rubbish
my_stopwords = stopwords.words('english')
my_stopwords = my_stopwords + ['http', 'https', 'bit', 'ly', 'co', 'rt', 'rts', 'com', 'org',
                               'dot', 'go', 'via', 'follow', 'us', 'retweet', 'also', 'run']

def preProcess(text):
    ''' Lowercase, tokenize on word characters, and drop stopwords, digits and short tokens. '''
    text = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    filtered_words = [w for w in tokens
                      if w not in my_stopwords and not w.isdigit() and len(w) > 2]
    return " ".join(filtered_words)

def getBigrams(content, threshold=5):
    ''' Return the bigrams in `content` that occur at least `threshold` times, sorted by frequency. '''
    tokens = nltk.wordpunct_tokenize(preProcess(content))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(threshold)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    return sorted(scored, key=lambda t: t[1], reverse=True)
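# As a quick sanity check of the helpers, here is what getBigrams() returns on a short, made-up
# piece of text (a low threshold is used since the sample is tiny):

# In[ ]:

sample_text = "Media critic writing media criticism; more media criticism and analysis of independent media outlets."
print getBigrams(sample_text, threshold=2)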
# In[94]:

bigrams = []
for clusteridx in documents:
    bigrams.append(getBigrams(documents[clusteridx], threshold=2))

# Create a pretty table of the top bigrams for each cluster
bigram_pt = prettytable.PrettyTable(["Cluster 0", "Cluster 1", "Cluster 2", "Cluster 3"])
bigram_pt.align["Cluster 0"] = "l"
bigram_pt.align["Cluster 1"] = "l"
bigram_pt.align["Cluster 2"] = "l"
bigram_pt.align["Cluster 3"] = "l"
bigram_pt.max_width = 60
bigram_pt.padding_width = 1   # One space between column edges and contents (default)

# Only go as deep as the shortest bigram list so add_row never runs off the end
for idx in range(min(len(b) for b in bigrams)):
    bigram_pt.add_row([" ".join(bigram[idx][0]) for bigram in bigrams])

# Let's see the results!
print bigram_pt

# Looking through the bigrams for each cluster seems to support the outline of the clusters above. To expand on that outline:
#
# - Cluster 0: Media personalities
#     - executive director, contributing writer, senior fellow, contributing editor
# - Cluster 1: Independent media, 'far-left' media outlets
#     - senior editor, staff writer, social justice, media democracy, independent media, media justice, news analysis, digital media, investigative reporting
# - Cluster 2: Mainstream liberal organizations or media personalities
#     - new york times, new yorker, washington post, daily kos, washington correspondent
# - Cluster 3: People / journalists involved with civil liberties, human rights and whistleblowing
#     - human rights, national security, foreign policy, civil liberties, digital rights, pgp(!)
#
# ### Future Considerations
#
# - Use TF-IDF so that bigrams or tokens that appear in all of the cluster documents are weighted less (a rough sketch of this idea follows below)
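# The TF-IDF idea above could start from something like the following sketch (it assumes
# scikit-learn is installed and reuses the per-cluster `documents` dict and the preProcess()
# helper defined earlier; it was not run as part of this project):

# In[ ]:

from sklearn.feature_extraction.text import TfidfVectorizer

cluster_ids = sorted(documents.keys())
corpus = [preProcess(documents[c]) for c in cluster_ids]

# Score bigrams directly; terms shared by every cluster document get downweighted by IDF
vectorizer = TfidfVectorizer(ngram_range=(2, 2))
tfidf = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names()

# Show the ten highest-weighted bigrams per cluster
for row_idx, cluster in enumerate(cluster_ids):
    weights = tfidf[row_idx].toarray()[0]
    top = sorted(zip(terms, weights), key=lambda t: t[1], reverse=True)[:10]
    print "Cluster %d:" % cluster, [term for term, weight in top]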