Here we provide code used to collect Twitter data for our AAAI paper.
Note that this is a long-running process (days) and may result in data that differs from that in the original paper. So, if you're interested in reproducing the results, I'd instead start with the data_processing.ipynb notebook.
For each brand in brands.json, we collect 300 followers. For each of these followers, we collect up to 5,000 of their friends (i.e., users they follow).
The results are stored in three pickle files:
username2brand.pkl: a dict from Twitter handle to brand demographics.
id2brand.pkl: a dict from Twitter user id to brand.
brand2counts.pkl: a dict from brand Twitter id to a Counter object. The Counter object is a dict from Twitter ID to count, representing the number of times a follower of brand X is friends with user Y. E.g., {123: {456: 10, 789: 5}} means that of the users who follow 123, 10 also follow 456 and 5 also follow 789.
MongoDB is used to store data for intermediate computations. You may want to distribute calls to process_followers and process_friends for efficiency.
from pymongo import MongoClient
# Connect to the MongoDB server on localhost and select the database that
# holds all intermediate data for this notebook.
dbconn = MongoClient(host='localhost', port=27017)
db = dbconn['twitter_demographics']
print(db)
Database(MongoClient('localhost', 27017), u'twitter_demographics')
import json
# Load the Twitter API credentials into the `twitter_cred` collection.
twitter_cred = db['twitter_cred']
# Empty the collection first so that re-running this cell does not
# accumulate duplicate credentials.
twitter_cred.remove({})
with open('../data/twitter-cred.json', 'r') as f:
    # The file holds one JSON document per line.
    for line in f:
        user = json.loads(line)
        # Every credential starts out unclaimed.
        user.update(is_taken=False)
        twitter_cred.insert(user)
print('Inserted %d users credentails' % twitter_cred.count())
Inserted 4 users credentails
# Insert brand details into the `brands` collection.
import ast

brands = db['brands']
# Empty the collection first so that re-running this cell does not
# accumulate duplicate brands.
brands.remove({})
with open('../data/brands.json', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank or trailing empty lines in the data file.
            continue
        # SECURITY: this used to be eval(line), which executes arbitrary code
        # found in the data file. literal_eval only accepts Python literals.
        # NOTE(review): assumes every line of brands.json is a literal dict;
        # confirm against the data file.
        brands.insert(ast.literal_eval(line))
print('Inserted %d brands' % brands.count())
Inserted 1072 brands
#helper functions
def get_credentials():
    """Claim one unused set of Twitter credentials from the DB.

    Looks for a document in the ``twitter_cred`` collection whose
    ``is_taken`` flag is False and sets that flag to True.

    Returns:
      The result of ``find_and_modify`` for the claimed document, or None
      if failed.
    """
    unclaimed = {'is_taken': False}
    mark_taken = {'$set': {'is_taken': True}}
    credentials = db['twitter_cred']
    return credentials.find_and_modify(
        query=unclaimed,
        update=mark_taken,
        upsert=False,
        sort=None)
def get_brand_to_process():
    """Claim the next brand whose followers still have to be fetched.

    Selects a ``brands`` document with both ``is_processed`` and
    ``is_taken`` set to False, flags it as taken, and renames its ``_id``
    key to ``brand_id``.

    Returns:
      brand dict, or None when no such brand is left
    """
    pending = {'is_processed': False, 'is_taken': False}
    brand = db['brands'].find_and_modify(
        query=pending,
        update={'$set': {'is_taken': True}},
        upsert=False,
        sort=None,
        full_response=False)
    if not brand:
        return brand
    # Callers refer to the Twitter id as 'brand_id' rather than '_id'.
    brand['brand_id'] = brand.pop('_id')
    return brand
def get_follower_to_process():
    """Claim the next follower whose friends still have to be fetched.

    Selects an ``xmatrix`` document with both ``is_processed`` and
    ``is_taken`` set to False, flags it as taken, and renames its ``_id``
    key to ``follower_id``.

    Returns:
      follower details dict, or None when no such follower is left
    """
    pending = {'is_processed': False, 'is_taken': False}
    follower = db['xmatrix'].find_and_modify(
        query=pending,
        update={'$set': {'is_taken': True}},
        upsert=False,
        sort=None,
        full_response=False)
    if not follower:
        return follower
    # Callers refer to the Twitter id as 'follower_id' rather than '_id'.
    follower['follower_id'] = follower.pop('_id')
    return follower
def add_followers_to_db(brand_id, followers_ids):
    """Record in ``xmatrix`` that each given user follows the brand.

    Args:
      brand_id: id of the brand the users follow
      followers_ids: ids of followers of the brand
    """
    followers = db['xmatrix']
    # NOTE(review): the $set also resets both flags on follower documents
    # that already exist, so a follower shared by several brands is queued
    # for processing again - confirm this is intended.
    changes = {
        '$addToSet': {'follows': brand_id},
        '$set': {'is_processed': False, 'is_taken': False},
    }
    for follower_id in followers_ids:
        # upsert: create the follower document if it does not exist yet.
        followers.update({'_id': follower_id}, changes, upsert=True)
def add_friends_to_dB(follower_id, friends_list):
    """Add a follower's friends to the ``follows`` set of their document.

    Args:
      follower_id: id of the follower
      friends_list: list of ids of the follower's friends
    """
    selector = {'_id': follower_id}
    # $each adds every list element individually instead of the list itself.
    append_friends = {'$addToSet': {'follows': {'$each': friends_list}}}
    db['xmatrix'].update(selector, append_friends)
def update_processed_flag(user_id):
    """Mark a user's ``xmatrix`` document as processed and no longer taken.

    Args:
      user_id: user id for which the flags are to be updated
    """
    selector = {'_id': user_id}
    done = {'$set': {'is_taken': False, 'is_processed': True}}
    db['xmatrix'].update(selector, done)
def remove_user_from_x(self, user_id=None):
    """Remove a user's document from the ``xmatrix`` collection.

    This is a module-level function, but its first parameter is named
    ``self``. ``process_friends`` calls it with a single argument, which
    raised TypeError while both parameters were required. Both call forms
    are accepted now:

      remove_user_from_x(user_id)        # the single argument is the id
      remove_user_from_x(None, user_id)  # original two-argument form

    Args:
      self: unused placeholder, or the user id when called with one argument
      user_id: user id
    """
    if user_id is None:
        # Single-argument call: the id arrived in the first slot.
        user_id = self
    db['xmatrix'].remove({'_id': user_id})
def robust_request(twitter, resource, params, max_tries=5, sleep_seconds=60 * 15):
    """ If a Twitter request fails, sleep (15 minutes by default) and retry.
    Do this at most max_tries times before quitting.
    Args:
      twitter ........ A TwitterAPI object.
      resource ....... A resource string to request.
      params ......... A parameter dictionary for the request.
      max_tries ...... The maximum number of tries to attempt.
      sleep_seconds .. Seconds to wait after a failed attempt.
    Returns:
      A TwitterResponse object, or None if failed (the user does not
      exist, the request is not authorized, or all tries are used up).
    """
    # Imported here because nothing visible in this file imports `time`, and
    # `sys` is only imported by a later cell; without them the error path
    # raises NameError.
    import sys
    import time

    for _ in range(max_tries):
        request = twitter.request(resource, params)
        if request.status_code == 200:
            return request
        # Look at the first item of the error payload only. An empty payload
        # yields None and is handled as a generic, retryable error.
        first = next(iter(request), None)
        unrecoverable = isinstance(first, dict) and (
            first.get('code') == 34  # 34 == user does not exist.
            or first.get('error') == 'Not authorized.')
        if unrecoverable:
            sys.stderr.write('skipping bad request %s %s\n' % (resource, params))
            return None
        sys.stderr.write('Got error: %s \nsleeping for %g minutes.\n'
                         % (request.text, sleep_seconds / 60.0))
        sys.stderr.flush()
        time.sleep(sleep_seconds)
    # All tries used up.
    return None
# Build the TwitterAPI client from a set of credentials claimed from the DB.
# `sys` is imported here because the else branch needs it and the only other
# import of it sits in a later cell.
import sys

from TwitterAPI import TwitterAPI

twitter = get_credentials()
if twitter:
    twitterObj = TwitterAPI(
        twitter['api_key'],
        twitter['api_secret'],
        twitter['access_token_key'],
        twitter['access_token_secret'])
else:
    # No unclaimed credentials: twitterObj stays undefined, so later cells
    # that use it will fail.
    sys.stderr.write('Twitter credits not available\n')
def get_followers(user_id, count=300):
    """Get follower ids of the given Twitter user.

    Sends one 'followers/ids' request through robust_request and collects
    the 'ids' list of every result item that has one.

    Args:
      user_id... twitter user id
      count..... value of the request's 'count' parameter (default 300, the
                 number of followers sampled per brand)
    Returns
      followers list; empty when the request failed
    """
    request = robust_request(twitterObj, 'followers/ids',
                             {'user_id': user_id, 'count': count})
    if not request:
        return []
    followers = []
    for result in request:
        if 'ids' in result:
            followers += result['ids']
    return followers
def get_friends(user_id, count=5000):
    """Get friend ids (accounts the user follows) of the given Twitter user.

    Sends one 'friends/ids' request through robust_request and collects
    the 'ids' list of every result item that has one.

    Args:
      user_id... twitter user id
      count..... value of the request's 'count' parameter (default 5000, the
                 number of friends collected per follower)
    Returns
      friends list; empty when the request failed
    """
    request = robust_request(twitterObj, 'friends/ids',
                             {'user_id': user_id, 'count': count})
    if not request:
        return []
    friends = []
    for result in request:
        if 'ids' in result:
            friends += result['ids']
    return friends
# For testing purposes: the number of loop iterations to run.
# Set it to -1 to remove the limit.
cut_off = 3
import sys


def process_followers():
    """Fetch followers for brands that have not been handled yet.

    Repeatedly claims an unprocessed brand from the DB, fetches its
    followers (300 by default) and stores them in the DB.
    Halts when no brand is left to claim or when cut_off reaches 0.
    """
    global cut_off
    while cut_off != 0:
        # The budget is spent before the brand is claimed, so an iteration
        # that finds no brand still counts.
        cut_off -= 1
        brand = get_brand_to_process()
        if not brand:
            sys.stderr.write('No brands to process\n')
            return
        brand_id = brand['brand_id']
        followers = get_followers(brand_id)
        if followers:
            add_followers_to_db(brand_id, followers)
            print('added %d followers of %s to DB' % (len(followers), brand_id))
    sys.stderr.write('cut off reached\n')


process_followers()
added 300 followers of 255784266 to DB added 300 followers of 261927470 to DB added 300 followers of 268439864 to DB
cut off reached
# For testing purposes: the number of loop iterations to run.
# Set it to -1 to remove the limit.
cut_off = 3
import sys


def process_friends():
    """Fetch friends for followers that have not been handled yet.

    Repeatedly claims an unprocessed follower from the DB, fetches their
    friends (5000 by default) and stores them in the DB. A follower whose
    friends list comes back empty is removed from the DB.
    Halts when no follower is left to claim or when cut_off reaches 0.
    """
    global cut_off
    while True:
        if cut_off == 0:
            sys.stderr.write('cut off reached\n')
            break
        cut_off -= 1
        follower = get_follower_to_process()
        if not follower:
            sys.stderr.write('No follower to process\n')
            break
        # This is the document's own `_id`. The DB helpers must receive it
        # unchanged: wrapping it in str(), as the code used to, makes the
        # query look for a string and never match a numeric `_id`.
        follower_id = follower['follower_id']
        friends = get_friends(str(follower_id))
        if len(friends) > 0:
            add_friends_to_dB(follower_id, friends)
            # The message says "followers" although these are friends; it is
            # left as is to match the recorded output.
            print('added %d followers of %s to DB' % (len(friends), follower_id))
            update_processed_flag(follower_id)
        else:
            # remove_user_from_x declares an unused leading `self` parameter;
            # the old one-argument call raised TypeError.
            remove_user_from_x(None, follower_id)
            sys.stderr.write('removed user , unable to fetch friends list for %s\n'
                             % (str(follower_id)))


process_friends()
added 763 followers of 2891952572 to DB added 571 followers of 2890499755 to DB added 155 followers of 317563215 to DB
cut off reached
# Load every brand document and index it twice: by Twitter id and by
# lower-cased Twitter handle. Both dicts share the same brand objects.
all_brands = list(db.brands.find())
id2brand = {brand['_id']: brand for brand in all_brands}
username2brand = {brand['brand_name'].lower(): brand for brand in all_brands}
print('read %d brands' % len(id2brand))
read 1072 brands
# Iterate over sampled followers for each brand.
from collections import Counter, defaultdict
# Count how often each account is followed across all processed followers,
# so that accounts occurring fewer than N times can be dropped afterwards.
friend_counts = Counter()
count = 0
processed_followers = db.xmatrix.find({'is_processed': True})
for count, follower in enumerate(processed_followers, 1):
    friend_counts.update(follower['follows'])
    # Progress report every 1000 followers.
    if count % 1000 == 0:
        print('read %d' % count)
read 1000 read 2000 read 3000 read 4000 read 5000 read 6000 read 7000 read 8000 read 9000 read 10000 read 11000 read 12000 read 13000 read 14000 read 15000 read 16000 read 17000 read 18000 read 19000 read 20000 read 21000 read 22000 read 23000 read 24000 read 25000 read 26000 read 27000 read 28000 read 29000 read 30000 read 31000 read 32000 read 33000 read 34000 read 35000 read 36000 read 37000 read 38000 read 39000 read 40000 read 41000 read 42000 read 43000 read 44000 read 45000 read 46000 read 47000 read 48000 read 49000 read 50000 read 51000 read 52000 read 53000 read 54000 read 55000 read 56000 read 57000 read 58000 read 59000 read 60000 read 61000 read 62000 read 63000 read 64000 read 65000 read 66000 read 67000 read 68000 read 69000 read 70000 read 71000 read 72000 read 73000 read 74000 read 75000 read 76000 read 77000 read 78000 read 79000 read 80000 read 81000 read 82000 read 83000 read 84000 read 85000 read 86000 read 87000 read 88000 read 89000 read 90000 read 91000 read 92000 read 93000 read 94000 read 95000 read 96000 read 97000 read 98000 read 99000 read 100000 read 101000 read 102000 read 103000 read 104000 read 105000 read 106000 read 107000 read 108000 read 109000 read 110000 read 111000 read 112000 read 113000 read 114000 read 115000 read 116000 read 117000 read 118000 read 119000 read 120000 read 121000 read 122000 read 123000 read 124000 read 125000 read 126000 read 127000 read 128000 read 129000 read 130000 read 131000 read 132000 read 133000 read 134000 read 135000 read 136000 read 137000 read 138000 read 139000 read 140000 read 141000 read 142000 read 143000 read 144000 read 145000 read 146000
# Keep only the accounts that appear at least `count_thresh` times.
count_thresh = 100
friend_set = {f for f in friend_counts if friend_counts[f] >= count_thresh}
print('%d of %d appear at least %d times'
      % (len(friend_set), len(friend_counts), count_thresh))
67760 of 21185659 appear at least 100 times
# Build the per-brand friend counts, restricted to the filtered accounts:
# brand2counts[b][y] is the number of processed followers of brand b that
# also follow account y.
brand2counts = defaultdict(Counter)
count = 0
processed_followers = db.xmatrix.find({'is_processed': True})
for count, follower in enumerate(processed_followers, 1):
    # Progress report every 1000 followers.
    if count % 1000 == 0:
        print('read %d' % count)
    followed = set(follower['follows'])
    # 'follows' mixes brand ids and friend ids; split them apart.
    brandids = followed.intersection(id2brand)
    friends = followed & friend_set
    for b in brandids:
        brand2counts[b].update(friends)
read 1000 read 2000 read 3000 read 4000 read 5000 read 6000 read 7000 read 8000 read 9000 read 10000 read 11000 read 12000 read 13000 read 14000 read 15000 read 16000 read 17000 read 18000 read 19000 read 20000 read 21000 read 22000 read 23000 read 24000 read 25000 read 26000 read 27000 read 28000 read 29000 read 30000 read 31000 read 32000 read 33000 read 34000 read 35000 read 36000 read 37000 read 38000 read 39000 read 40000 read 41000 read 42000 read 43000 read 44000 read 45000 read 46000 read 47000 read 48000 read 49000 read 50000 read 51000 read 52000 read 53000 read 54000 read 55000 read 56000 read 57000 read 58000 read 59000 read 60000 read 61000 read 62000 read 63000 read 64000 read 65000 read 66000 read 67000 read 68000 read 69000 read 70000 read 71000 read 72000 read 73000 read 74000 read 75000 read 76000 read 77000 read 78000 read 79000 read 80000 read 81000 read 82000 read 83000 read 84000 read 85000 read 86000 read 87000 read 88000 read 89000 read 90000 read 91000 read 92000 read 93000 read 94000 read 95000 read 96000 read 97000 read 98000 read 99000 read 100000 read 101000 read 102000 read 103000 read 104000 read 105000 read 106000 read 107000 read 108000 read 109000 read 110000 read 111000 read 112000 read 113000 read 114000 read 115000 read 116000 read 117000 read 118000 read 119000 read 120000 read 121000 read 122000 read 123000 read 124000 read 125000 read 126000 read 127000 read 128000 read 129000 read 130000 read 131000 read 132000 read 133000 read 134000 read 135000 read 136000 read 137000 read 138000 read 139000 read 140000 read 141000 read 142000 read 143000 read 144000 read 145000 read 146000
# Show the five most frequent accounts for the first brand in the dict.
first_brand = list(brand2counts)[0]
top_accounts = sorted(brand2counts[first_brand].items(),
                      key=lambda item: -item[1])[:5]
print('%s %s' % (first_brand, top_accounts))
15650816 [(15650816, 626), (35764757, 322), (15846407, 306), (25525507, 283), (90420314, 273)]
# Read the demographics data for each brand: one JSON document per line,
# indexed by the lower-cased value of its 'twitter' field.
import json
username2demo = dict()
# The with-block closes the file; the original left the handle open.
with open('../data/demo.json', 'rt') as demo_file:
    for line in demo_file:
        js = json.loads(line)
        username2demo[js['twitter'].lower()] = js
print('read %d demographics' % len(username2demo))
read 1513 demographics
# Attach the matching demographics record to each brand dict, and print the
# names of brands whose record has no 'Female' entry.
for username, brand in username2brand.items():
    demo = username2demo[username]
    brand['demo'] = demo
    if 'Female' not in demo:
        print(brand['brand_name'])
stltoday World_Wildlife BettyBuzz TeamRankings InsideHoops GrindTV
# Zero out each brand's count for itself. The entry stays in the Counter
# with the float value 0.
for brand, counts in brand2counts.items():
    counts[brand] = 0.
# With the self-reference zeroed, the brand itself should no longer show up
# among its own top five accounts.
brand_ids = list(brand2counts)
leading = sorted(brand2counts[brand_ids[0]].items(),
                 key=lambda pair: -pair[1])[:5]
print('%s %s' % (brand_ids[0], leading))
15650816 [(35764757, 322), (15846407, 306), (25525507, 283), (90420314, 273), (34381878, 251)]
# Pickle everything
from functools import partial
import pickle

# brand2counts is stored as a plain dict rather than a defaultdict.
outputs = (
    (username2brand, 'username2brand.pkl'),
    (id2brand, 'id2brand.pkl'),
    (dict(brand2counts), 'brand2counts.pkl'),
)
for obj, path in outputs:
    # The with-block flushes and closes each file; the original left all
    # three handles open.
    with open(path, 'wb') as out:
        pickle.dump(obj, out)