In this project I am answering the following questions:
See here for other iPython notebooks on this project.
The project (datasets and source code) is available on GitHub.
The news and the curated tweets used in this study were scraped from theplazz.com and approximately cover the duration of the 113th US Congress, i.e. Jan 2013 - Jan 2015. Here is an annotated screenshot of one of the news items published on this news media site:
cd ..
/Users/toz/Documents/workspace/News-Commentary-Tweets-of-Elites
import twitter
import pandas as pd
import numpy as np
import plotly.plotly as py
from plotly.graph_objs import *
from mykeys import tw
import networkx as nx
import itertools
from collections import Counter
%matplotlib inline
def oauth_login():
    """Return a Twitter API client authorized with the keys from mykeys.py."""
    # tw is a dictionary, the only variable in mykeys.py; it holds the four
    # OAuth credentials looked up below
    credentials = twitter.oauth.OAuth(
        tw['OAUTH_TOKEN'],
        tw['OAUTH_TOKEN_SECRET'],
        tw['CONSUMER_KEY'],
        tw['CONSUMER_SECRET'],
    )
    return twitter.Twitter(auth=credentials)
def get_members(members):
    """Keep only the interesting fields of a Twitter list-members response.

    `members` is the decoded JSON response; its 'users' entry is iterated.
    Returns a list with one tuple per user, holding the values of the keys
    listed in `wanted`, in that order.
    """
    wanted = ('id', 'screen_name', 'name', 'location', 'description',
              'created_at', 'friends_count', 'followers_count',
              'statuses_count', 'favourites_count')
    rows = []
    for user in members['users']:
        rows.append(tuple(user[key] for key in wanted))
    return rows
def tw_to_pol(twitter_api,slug,owner_screen_name,group):
    """Get members of a twitter list with known political group into a dataframe.

    twitter_api       -- authorized client, e.g. the value returned by oauth_login()
    slug              -- slug of the public Twitter list
    owner_screen_name -- screen name of the account owning the list
    group             -- party label written to the 'party' column of every row

    Returns a DataFrame with one row per list member: the ten fields
    extracted by get_members() plus the 'party' column.
    """
    # The column names mirror the tuple order produced by get_members().
    # They are defined here because the `header` list of get_politicians()
    # is local to that function and cannot be seen from this one.
    columns = ['id','screen_name','name','location','description','created_at',
               'friends','followers','statuses','favorites']
    # Only this single response is read; no further cursors are followed.
    resp = twitter_api.lists.members(slug=slug,owner_screen_name=owner_screen_name,cursor=-1,count=5000)
    df = pd.DataFrame(get_members(resp),columns=columns)
    df['party'] = group
    return df
def get_politicians():
    """Download 113th congress tweeps using public Twitter lists.

    Fetches the members of every list in `polists`, labels them with the
    party of the list, drops exact duplicate rows, writes the table to
    data/US-politicians.csv and returns it as a DataFrame.
    """
    header = ['id','screen_name','name','location','description','created_at',
              'friends','followers','statuses','favorites']
    # the trailing numbers are the author's notes (presumably member counts)
    polists = [{'slug':'senaterepublicans', 'owner_screen_name':'Senate_GOPs', 'group':'gop'}, #62
               {'slug':'house-republicans', 'owner_screen_name':'HouseGOP', 'group':'gop'}, #260
               {'slug':'housegop', 'owner_screen_name':'GOPLeader', 'group':'gop'}, #237
               {'slug':'elected-democrats', 'owner_screen_name':'TheDemocrats', 'group':'dem'}, #259
               {'slug':'house-democrats', 'owner_screen_name':'DannyMariachi', 'group':'dem'}, #188
               {'slug':'senatedemocrats', 'owner_screen_name':'SenateDems', 'group':'dem'} #52
               ]
    twitter_api = oauth_login()
    frames = [tw_to_pol(twitter_api,polist['slug'],polist['owner_screen_name'],polist['group'])
              for polist in polists]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks all lists
    # in one step and keeps the column order header + 'party'.
    df = pd.concat(frames)[header + ['party']]
    # a politician listed twice with the same party would otherwise appear twice
    df = df.drop_duplicates()
    df.to_csv('data/US-politicians.csv',encoding='utf-8',index=False)
    return df
# get twitter IDs of congressmen and senators
df = pd.read_csv('data/US-politicians.csv',encoding='utf-8')
gop = df[df['party']=='gop']
dem = df[df['party']=='dem']
dem_tweeps = set(dem.screen_name.values)
gop_tweeps = set(gop.screen_name.values)
# Principal Accounts of Members of the U.S. Senate (a mix of campaign and government accounts)
senate = pd.read_csv('data/US-senate.csv',encoding='utf-8')
# get commentary tweets of US newsmakers and opinion-shapers
tweets = pd.read_csv('data/US-tweets.csv',encoding='utf-8',parse_dates=['dt'])
# drop the first character of every handle (presumably the leading '@')
tweets['twhandle'] = tweets['twhandle'].str[1:]
#tweets.dt = pd.to_datetime(tweets.dt,unit='D')
# print politician counts curated at least once by theplazz.com
# one group per news item, keyed by (title, date); values are the commenting handles
title = tweets.groupby(by=['title','dt'])['twhandle']
# Series.order() no longer exists in pandas; min()/max() give the first and
# last date directly and ignore missing dates.
first_day = tweets['dt'].min()
last_day = tweets['dt'].max()
print (len(title),'news commentated between',first_day.strftime('%d-%b-%Y'),
       'and',last_day.strftime('%d-%b-%Y'),'by')
tweepset = set(tweets.twhandle.unique())
senateset = set(senate.screen_name.values)
# how many of the curated commentators belong to each known group
twcounts = pd.DataFrame(
    {'# of tweeps': [len(senateset & tweepset),
                     len(dem_tweeps & tweepset),
                     len(gop_tweeps & tweepset),
                     len(tweepset)]},
    index=['senator','democrat','republican','total'])
twcounts
7376 news commentated between 14-Jan-2013 and 09-Jan-2015 by
# of tweeps | |
---|---|
senator | 44 |
democrat | 36 |
republican | 30 |
total | 1442 |
# plot commentating activity of these politicians
# number of curated tweets per handle, most active first
# (Series.order() was removed from pandas; sort_values() is its replacement)
tweeps = tweets.groupby(by='twhandle')['twtext'].count().sort_values(ascending=False)
# keep only the handles that appear in the politicians table
poltweeps = tweeps[tweeps.index.isin(df.screen_name)]
# blue for handles in the democrat set, red for everybody else
colors = ['blue' if x in dem_tweeps else 'red' for x in poltweeps.index]
# a plain list / plain dicts replace the deprecated Data, Marker and YAxis wrappers
data = [Bar(
    x=poltweeps.index,
    y=poltweeps.values,
    marker=dict(color=colors)
)]
layout = Layout(yaxis=dict(title='# of news commentated (Jan 2013 - Jan 2015)'),
                title="News counts commentated by 113th US Congress (curated by theplazz.com)")
fig = Figure(data=data, layout=layout)
py.iplot(fig,filename="113th US Congress as News Commentators")
# Stats: how many news are commentated by how many democrats and/or republicans...
# per news item: number of distinct democrat / republican handles that commented
demnews = title.apply(lambda g: len(dem_tweeps & set(g.values)))
gopnews = title.apply(lambda g: len(gop_tweeps & set(g.values)))
print (demnews.sum(),'comments made on',demnews[demnews>0].size,'news by democrats.')
print (gopnews.sum(),'comments made on',gopnews[gopnews>0].size,'news by republicans.')
dgtotl = (demnews + gopnews)
print ('News commentated by any member of either group:',(dgtotl[dgtotl>0].size))
# Number of comments by dems - number of comments by gops
dgdiff = (demnews - gopnews)
# Normalize the polarity: +1 = democrats only, -1 = republicans only
dgdiv = dgdiff/dgtotl
# News without any politician comment give 0/0 = NaN; drop them explicitly
# instead of relying on the removed Series.order() pushing NaN to the end.
digdiv = dgdiv.dropna().sort_values()
print ('News commentated by democrats only:',(digdiv[digdiv == 1].size))
print ('News commentated by republicans only:',(digdiv[digdiv == -1].size))
print ('News commentated by both of the parties:',(digdiv[(digdiv > -1) & (digdiv < 1)].size))
2829 comments made on 1239 news by democrats. 2109 comments made on 1183 news by republicans. News commentated by any member of either group: 1916 News commentated by democrats only: 733 News commentated by republicans only: 677 News commentated by both of the parties: 506
# commentator group polarity distribution of news
# news commented by both parties, i.e. polarity strictly between -1 and 1
bipartisan = digdiv[(digdiv > -1) & (digdiv < 1)]
bipartisan.plot();
# x and y must come from the same subset: taking x from all of digdiv (as
# before) paired the bars with the titles of unrelated news items.
data = [Bar(
    x=bipartisan.index.get_level_values(0),
    y=bipartisan.values
)]
# the bars show the normalized polarity, not a news count
layout = Layout(yaxis=dict(title='Polarity: (dem - gop) / (dem + gop) comments'),
                margin=dict(l=150,r=150,b=150),
                title="News polarized by 113th US Congress (curated by theplazz.com)")
fig = Figure(data=data, layout=layout)
py.iplot(fig,filename="Polarity distribution of news")
# Which news got the most attention by the politicians ?
# (Series.order() was removed from pandas; sort_values() is its replacement)
dgtotl.sort_values(ascending=False).head(60)
title dt A day to honor veterans’ courage, sacrifrice 2014-05-26 36 Apartheid resistance leader Mandela has died 2013-12-05 31 World marks 70th anniversary of D-Day 2014-06-06 24 US observes somber 9/11 anniversary 2014-09-11 21 Senate passes 5-month UI extension 2014-04-07 21 Dems’ attempt to raise minimum wage fails 2014-04-30 20 Shinseki resigns from VA following apology 2014-05-30 20 Deportation to focus on ‘felons not families’ 2014-11-20 19 A reflection on rights on Constitution Day 2014-09-17 19 Dems push #EqualPay bill on Equal Pay Day 2014-04-08 19 Around the world, Jews welcome 5774 2013-09-04 18 Remembering Rev. Dr. Martin Luther King Jr. 2014-01-20 18 4 dead after Fort Hood shooting incident 2014-04-02 18 Congress critters @work: #DontDoubleMyRate 2013-07-09 18 Politicians get patriotic in July 4th holiday rush 2014-07-04 17 Dems host climate-change talk-a-thon 2014-03-10 17 Boston honors dead in Marathon bombing 2014-04-15 16 Tweeps count blessings both big and small 2014-11-27 15 Obama unveils his $4T budget based on ‘values’ 2014-03-04 15 Earth Day observers consider climate change 2014-04-21 15 Senate ‘goes nuclear’ on filibuster option 2013-11-21 15 EPA unveils reduction plan for air pollution 2014-06-02 15 Politicos, businesses thank teachers, nurses 2014-05-06 15 Senate’s unemployment bill defeated by 1 vote 2014-02-06 15 Obama to governors: ‘sequester hurts states’ 2013-02-25 15 VA bills unanimously pass House, Senate 2014-06-11 15 Sundown ushers in the Jewish New Year 2014-09-24 14 Jews worldwide reflect on Holocaust Day 2014-04-28 14 Renowned poet, author Maya Angelou dies 2014-05-28 14 6 dead in attack on Jerusalem synagogue 2014-11-18 14 Obama delivers climate change speech 2013-06-25 14 The Civil Rights Act of 1964 turns 50 2014-07-02 14 Season’s tweetings: It’s Christmas Day greetings 2014-12-25 14 Put on your yarmulke, it’s time for Hanukkah 2014-12-16 14 Americans mark 100 years of Mother’s Day 2014-05-11 14 World AIDS Day to raise 
awareness, funds 2014-12-01 14 Day of remembrance for Pearl Harbor veterans 2014-12-07 13 Military sexual assault bill rejected by Senate 2014-03-06 13 Most diverse Congress yet opens despite snow 2015-01-06 13 ACA website surpasses 6M signups 2014-03-27 13 Dems tweet support for ‘Not My Boss’ Business’ 2014-07-14 13 Hobby Lobby case begins in SCOTUS 2014-03-25 13 Lawmakers pass bipartisan budget deal 2013-12-18 13 Paycheck Fairness Act voted down by GOP 2014-04-09 13 Nobel Peace Prize goes to child rights activists 2014-10-10 12 Senate passes farm bill over to the White House 2014-02-04 12 Feinstein, McCain unveil CIA ‘Torture Report’ 2014-12-09 12 College affordability makes Dem agenda 2014-05-05 12 Obama to announce executive order in SOTU 2014-01-28 12 ‘Not My Boss’ Business Act’ shot down in Senate 2014-07-16 12 #ConstitutionDay trends on Twitter 2013-09-17 12 As Passover arrives, nontrad seders on rise 2014-04-13 12 SCOTUS ends overall campaign donor limits 2014-04-02 12 SCOTUS rules in favor of Hobby Lobby 2014-06-30 12 Keystone pipeline bill fails 59-41 in Senate 2014-11-18 12 Obama to tackle LGBT job discrimination 2014-06-16 11 Obama promises a year of action for 2014 2014-01-28 11 Obama pushes to keep student loan rates low 2013-05-31 11 ENDA clears first hurdle, Senate votes 64-32 2013-11-07 11 #WomensEqualityDay marks suffrage anniv. 2014-08-26 11 Name: twhandle, dtype: int64
# On which news the comment-count differences maximized?
# ascending: most republican-leaning first, most democrat-leaning last
# (Series.order() was removed from pandas; sort_values() is its replacement)
dgdiff.sort_values()
title dt Republicans call Obama executive action illegal 2014-11-20 -9 Tweeps wish US’ 40th prez a happy birthday 2014-02-06 -9 Sign up trouble? WH extends deadline again 2014-03-25 -8 Britain’s Iron Lady dies of stroke at 87 2013-04-08 -8 Late filers beware: The taxman cometh 2014-04-15 -8 SCOTUS rules for prayer at public meetings 2014-05-05 -7 National Day of Prayer offers controversy, unity 2014-05-01 -6 GOP calls for ‘permanent delay’ of ObamaCare 2013-07-10 -6 WH may not cooperate with Benghazi probe 2014-05-05 -6 Repubs squelch short-term budget fix 2013-02-05 -6 Shinseki resigns from VA following apology 2014-05-30 -6 Boehner pressed for Obamacare alternative 2013-12-03 -5 Sudan detains Christian woman after release 2014-06-24 -5 214 days later: Mexican court frees Tahmooressi 2014-11-01 -5 Obama extends to 2016 ACA employer mandate 2014-02-10 -5 Obamacare head steps down from HHS 2014-04-10 -5 Vets at WWII Memorial become props in dispute 2013-10-02 -5 SCOTUS curbs Obama’s recess appointments 2014-06-26 -5 After IG report, calls for Shinseki to resign 2014-05-28 -5 A Presidential icon, now gone for a decade 2014-06-05 -5 CBO: O’care kills equivalent of 2M jobs 2014-02-04 -5 House holds Lerner in contempt of Congress 2014-05-07 -5 March for Life continues despite cold weather 2014-01-22 -5 House Republicans push Benghazi committee 2014-05-08 -4 Obamacare’s employer mandates delayed 2013-07-02 -4 GOP to form special Benghazi committee 2014-05-02 -4 Thousands gather on Washington Mall 2013-02-17 -4 Will beheadings become call to action for Obama? 2014-09-02 -4 O’care figures frustrate pro’bama fact checkers 2014-02-24 -4 IRS to investigators: Can’t find Lerner’s emails 2014-06-13 -4 .. Gun control in focus 2 years after Sandy Hook 2014-12-14 8 Senate advances bill to extend jobless aid 2014-01-07 8 SOTU puts spotlight on min. 
wage, gender gap 2014-01-29 8 Obama to governors: ‘sequester hurts states’ 2013-02-25 9 Paycheck Fairness Act voted down by GOP 2014-04-09 9 Secret US-China climate agreement unveiled 2014-11-12 9 Senate’s unemployment bill defeated by 1 vote 2014-02-06 9 Exec. order ends federal LGBT job discrimination 2014-07-21 9 Politicos, businesses thank teachers, nurses 2014-05-06 9 Sundown ushers in the Jewish New Year 2014-09-24 10 LGBT non-discrimination act clears Senate hurdle 2013-11-04 10 Nobel Peace Prize goes to child rights activists 2014-10-10 10 World AIDS Day to raise awareness, funds 2014-12-01 10 Obama to tackle LGBT job discrimination 2014-06-16 11 Military sexual assault bill rejected by Senate 2014-03-06 11 ENDA clears first hurdle, Senate votes 64-32 2013-11-07 11 #WomensEqualityDay marks suffrage anniv. 2014-08-26 11 Renowned poet, author Maya Angelou dies 2014-05-28 12 ‘Not My Boss’ Business Act’ shot down in Senate 2014-07-16 12 College affordability makes Dem agenda 2014-05-05 12 SCOTUS rules in favor of Hobby Lobby 2014-06-30 12 The Civil Rights Act of 1964 turns 50 2014-07-02 12 SCOTUS ends overall campaign donor limits 2014-04-02 12 Deportation to focus on ‘felons not families’ 2014-11-20 13 Earth Day observers consider climate change 2014-04-21 13 Dems tweet support for ‘Not My Boss’ Business’ 2014-07-14 13 Dems host climate-change talk-a-thon 2014-03-10 15 Senate passes 5-month UI extension 2014-04-07 17 Dems’ attempt to raise minimum wage fails 2014-04-30 18 Dems push #EqualPay bill on Equal Pay Day 2014-04-08 19 Name: twhandle, dtype: int64
# create bipartite network for bpnet
G2 = nx.Graph()
# add actors (mode 0): one node per politician, carrying the party color
for handle, party_color in zip(poltweeps.index.tolist(), colors):
    G2.add_node(handle, color=party_color, bipartite=0)
# add events (mode 1): one node per (title, dt) news key
for news in dgtotl.index.tolist():
    G2.add_node(news, bipartite=1)
def updateG2(group,G2,politicians):
    """ Create two-mode edges between one news item and its commenting politicians.

    group       -- handles that commented on the news item; group.name is the event node
    G2          -- graph that receives the edges (modified in place)
    politicians -- handles allowed as actor nodes; all other handles are skipped
    """
    # a set: repeated handles add nothing, two-mode edges carry no weight
    for handle in set(group.tolist()):
        if handle in politicians:
            G2.add_edge(group.name, handle)
# add edges: every news group links its event node to the commenting politicians
title.apply(updateG2, G2, set(poltweeps.index))
# print number of nodes and edges
modes = [attrs['bipartite'] for _, attrs in G2.nodes(data=True)]
actors = modes.count(0)
events = len(modes) - actors
print ('actors:',actors,'\tevents:',events,'\tedges:',G2.number_of_edges())
actors: 66 events: 7376 edges: 4938
"""The Network File is text file with a binary rectangular matrix.
The number of rows for the matrix should be the same as the number of Actors(A),
and the number of columns is the number of Actors(P)."""
# split the nodes of the bipartite graph by mode
actors = [node for node, attrs in G2.nodes(data=True) if attrs['bipartite'] == 0]
events = [node for node, attrs in G2.nodes(data=True) if attrs['bipartite'] == 1]
# 0/1 incidence matrix: one row per event, one column per actor
M = np.matrix(np.zeros((len(events), len(actors))))
for (i, event), (j, actor) in itertools.product(enumerate(events), enumerate(actors)):
    if G2.has_edge(actor, event):
        M[i, j] = 1
np.savetxt("data/congress_2mode.txt", M, fmt='%d')
#ERGM analysis:
"""
b2nodematch is a homophily based two-star statistic. This term adds one statistic to the model unless diff is set to TRUE, in which case the term adds multiple network statistics to the model, one for each of (a subset of) the unique values of the attrname attribute.
"""
R code (ERGM on the bipartite, two-mode network):
#two mode
# read the 0/1 matrix written by the Python export (space separated, one row per event)
commentaries <- read.table('data/congress_2mode.txt',sep=' ')
# transpose so that every row is an actor and every column an event
commentaries <- as.data.frame(t(commentaries))
# party labels: a 'party' header line followed by one label per line
parties <- read.table('data/congress_attributes.txt',sep=' ',header=T,stringsAsFactors=FALSE)
# bipartite=66 matches the 66 actors reported by the Python export
two_mode <-network(commentaries, vertex.attr=parties, matrix.type='bipartite',
directed=F, hyper=F, loops=F, multiple=F, bipartite=66)
# set 'party' to NA on every vertex after the first 66 (presumably the event vertices)
set.vertex.attribute(two_mode, 'party', NA, v=seq_len(network.size(two_mode)-66)+66)
summary(two_mode, print.adj=F)
# ERGM with an edges term and, because diff=T, a separate b1nodematch term per party value
two_mode.diff<-ergm(two_mode~edges+b1nodematch("party",diff=T))
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.921374 0.016658 2 <1e-04 ***
b1nodematch.party.D 0.520067 0.003541 2 <1e-04 ***
b1nodematch.party.R 0.261082 0.017989 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52601 on 486813 degrees of freedom
AIC: 52607 BIC: 52640 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.950125 0.023855 1 <1e-04 ***
b1nodematch.party.D 0.377780 0.023558 2 <1e-04 ***
b1nodematch.party.R 0.628042 0.003964 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52144 on 486813 degrees of freedom
AIC: 52150 BIC: 52183 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.920042 0.019739 1 <1e-04 ***
b2nodematch.party.D 0.358749 0.034140 1 <1e-04 ***
b2nodematch.party.R 0.601879 0.002838 3 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52254 on 486813 degrees of freedom
AIC: 52260 BIC: 52293 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.949630 0.028055 1 <1e-04 ***
b2nodematch.party.D 0.530463 0.003897 1 <1e-04 ***
b2nodematch.party.R 0.273482 0.020214 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52477 on 486813 degrees of freedom
AIC: 52483 BIC: 52516 (Smaller is better.)
#when party types are not differentiated
# without diff=T the b1nodematch term adds a single statistic shared by all party values
two_mode_b<-ergm(two_mode~edges+b1nodematch("party"))
summary(two_mode_b)
==========================
Summary of model fit
==========================
Formula: two_mode ~ edges + b1nodematch("party")
Iterations: 20 out of 20
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.82148 0.02591 1 <1e-04 ***
b1nodematch.party 0.31572 0.02996 1 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52736 on 486814 degrees of freedom
AIC: 52740 BIC: 52762 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.86517 0.01552 2 <1e-04 ***
b2nodematch.party 0.34602 0.01707 2 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 52590 on 486814 degrees of freedom
AIC: 52594 BIC: 52616 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.77669 0.02745 0 <1e-04 ***
b2nodematch.party 0.23968 0.02059 1 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 53149 on 486814 degrees of freedom
AIC: 53153 BIC: 53175 (Smaller is better.)
============================ (another run results)
Monte Carlo MLE Results:
Estimate Std. Error MCMC % p-value
edges -4.826963 0.018368 1 <1e-04 ***
b1nodematch.party 0.247133 0.007967 4 <1e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Null Deviance: 674870 on 486816 degrees of freedom
Residual Deviance: 53158 on 486814 degrees of freedom
AIC: 53162 BIC: 53184 (Smaller is better.)
# let's create the actor network
# vertices <- commentators
# edges (weighted) <- number of news commentated by vertices incident to the edge
G = nx.Graph()
# add vertices: one per politician, carrying the party color
for handle, party_color in zip(poltweeps.index.tolist(), colors):
    G.add_node(handle, color=party_color)
# print(G.nodes(data=True))
def updateG(group,G,politicians):
    """ Create weighted edges between politicians who commented on the same news.

    group       -- handles that commented on one news item
    G           -- actor graph, modified in place
    politicians -- handles allowed as vertices; all other handles are ignored

    Every unordered pair of distinct politicians found in `group` gets an
    edge: created with weight 1 the first time, incremented by 1 afterwards.
    """
    # Deduplicate (as updateG2 does) and filter before pairing: a handle that
    # appears twice for the same news would otherwise produce a self-loop and
    # count that news more than once on its edges. Sorting keeps the pair
    # order deterministic.
    commenters = sorted({actor for actor in group.tolist() if actor in politicians})
    for v1, v2 in itertools.combinations(commenters, 2):
        if G.has_edge(v1, v2):
            G[v1][v2]['weight'] += 1
        else:
            G.add_edge(v1, v2, weight=1)
# add edges: each news group contributes the pairs of politicians it contains
politician_handles = set(poltweeps.index)
title.apply(updateG, G, politician_handles)
# print number of nodes and edges
print (G.number_of_nodes(), G.number_of_edges())
66 1863
# Exporting to be read by Gephi for better visualization
# nx.write_gml(G,"data/theplazz_politics.gml")
# export for R-ergm: adjacency matrix holding the edge weights
# (nx.to_numpy_matrix was removed in networkx 3.0; to_numpy_array replaces it
# and np.savetxt writes the same text for it)
A = nx.to_numpy_array(G, weight='weight')
np.savetxt("data/congress_actors_weighted.txt", A, fmt='%d')
# network file for pnet: weight=None stores 1 for every existing edge
A = nx.to_numpy_array(G, weight=None)
np.savetxt("data/congress_actor.txt", A, fmt='%d')
# party color of every vertex, in the node order used for the matrices above
node_colors = [attrs['color'] for _, attrs in G.nodes(data=True)]
#attribute file for pnet: header line 'party', then 2 for red and 1 otherwise
party = ['party'] + [str(2) if color=='red' else str(1) for color in node_colors]
with open('data/congress_attribute.txt','w') as w:
    w.write('\n'.join(party))
#attribute file for R-ergm: header line 'party', then R for red and D otherwise
party = ['party'] + ['R' if color=='red' else 'D' for color in node_colors]
with open('data/congress_attributes.txt','w') as w:
    w.write('\n'.join(party))
# 62 of the 65 monitored Congress members are found to be in the same group as their co-party members
# The 3 congresspeople who are not in the same group as their co-party members are circled
# NOTE(review): the networks built above report 66 actors, not 65 - confirm which count is right
# Show the labeled figure hosted on the author's site (presumably the modularity grouping - confirm)
from IPython.display import Image
Image(url='http://talhaoz.com/wp-content/uploads/2015/03/Modularity_Labeled.png')