import json
import twitter
#import urllib2
#import requests
import itertools
import re
from time import time
from datetime import datetime
from pprint import pprint
from hr import hr
import pymongo
from pymongo import MongoClient
client = MongoClient()
db = client.dedup
collection = db.refined
authval = json.load(open("keys.txt"))
CONSUMER_KEY = authval['CONSUMER_KEY']
CONSUMER_SECRET = authval['CONSUMER_SECRET']
OAUTH_TOKEN = authval['OAUTH_TOKEN']
OAUTH_TOKEN_SECRET = authval['OAUTH_TOKEN_SECRET']
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,CONSUMER_KEY, CONSUMER_SECRET)
t = twitter.Twitter(auth=auth)
#Function to check the credentials of the User
def verify():
verificationDetails = t.account.verify_credentials()
print "Name: ", verificationDetails['name']
print "Screen Name: ", verificationDetails['screen_name']
verify()
Name: Rajat Goyal Screen Name: rajat404
#testTweet = t.statuses.home_timeline()[0]
getLast = db.last.find({}).sort([('_id', -1)]).limit(1)
#sinceCounter = None
for item in getLast:
sinceCounter = item['lastTweet']
print sinceCounter
572492397210411008
t1 = time()
completeTimeline = t.statuses.home_timeline(count=200, since_id=sinceCounter)
t2 = time()
print "Time taken to load tweets: ", t2-t1
print "Number of tweets fetched: ", len(completeTimeline)
Time taken to load tweets: 6.10851597786 Number of tweets fetched: 197
lastTweet = completeTimeline[-1]['id']
endTweet = {'lastTweet':lastTweet, 'created_on':datetime.now()}
db.last.insert(endTweet)
#db.last.ensure_index([("id" , pymongo.ASCENDING), ("unique" , True), ("dropDups" , True)])
ObjectId('54f5174b44356233cba2319e')
Tweets contains huge amount of metadeta. We need to extract the useful components
from string import punctuation
set_punct = set(punctuation)
set_punct = set_punct - {"@"}
#set_punct = set_punct - {"_", "@"}
def sanitize(text, set_excludes):
"""
Return a `sanitized` version of the string `text`.
"""
text = text.lower()
text = " ".join([ w for w in text.split() if not ("http://" in w) ])
letters_noPunct = [ (" " if c in set_excludes else c) for c in text ]
text = "".join(letters_noPunct)
words = text.split()
long_enuf_words = [w.strip() for w in words if len(w)>1]
return " ".join(long_enuf_words)
print "Characters that will be removed from the tweets:\n", set_punct
Characters that will be removed from the tweets: set(['!', '#', '"', '%', '$', "'", '&', ')', '(', '+', '*', '-', ',', '/', '.', ';', ':', '=', '<', '?', '>', '[', ']', '\\', '_', '^', '`', '{', '}', '|', '~'])
#List of stop words
stop = "about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount and another any anyhow anyone anything anyway anywhere are around as at back became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but call can cannot cant computer con could couldnt cry describe detail do done down due during each eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen fify fill find fire first five for former formerly forty found four from front full further get give had has hasnt have hence her here hereafter hereby herein hereupon hers him his how however hundred indeed interest into its keep last latter latterly least less ltd made many may me meanwhile might mill mine more moreover most mostly move much must my name namely neither never nevertheless next nine nobody none noone nor not nothing now nowhere off often once one only onto other others otherwise our ours ourselves out over own part per perhaps please put rather same see seem seemed seeming seems serious several she should show side since sincere six sixty some somehow someone something sometime sometimes somewhere still such system take ten than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus together too top toward towards twelve twenty two under until upon very via was well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you your yours yourself yourselves"
def Refine(raw_tweet):
simple = {}
simple['text'] = raw_tweet['text']
simple['cleanText'] = sanitize(raw_tweet['text'], set_punct)
words = simple['cleanText']
set_words = set(words.split())
cleanWords = list(set_words)
for term in cleanWords:
if term in stop:
cleanWords.remove(term)
simple['cleanWords'] = set(cleanWords)
simple['id'] = raw_tweet['id']
simple['user_screen_name'] = raw_tweet['user']['screen_name']
#LATER
simple['created_at'] = raw_tweet['created_at']
simple['timestamp'] = datetime.now()
simple['is_active'] = True
simple['is_tested'] = False
# try:
# temp = (requests.get(str(raw_tweet['entities']['urls'][0]['expanded_url'])).url)
# simple['cleanUrl'] = temp
# except:
# #print raw_tweet['entities']['urls']
# simple['cleanUrl'] = None
try:
simple['urls'] = raw_tweet['entities']['urls']
simple['cleanUrl'] = raw_tweet['entities']['urls'][0]['expanded_url']
except:
simple['cleanUrl'] = None
return simple
refinedTweet = []
t1 = time()
for tweet in completeTimeline:
refinedTweet.append(Refine(tweet))
t2 = time()
# for tweet in completeTimeline:
# y = tweet['entities']['urls']
# if y != []:
# refinedTweet.append(Refine(tweet))
#print json.dumps(refinedTweet, sort_keys=True, indent=2)
#data = json.dumps(refinedTweet)
All the tweets after sanitization, are cached in MongoDB, so that they can be used at a later time
#refining for inserting in MongoDB -- converting the set to list!
import copy
mongoRefined = copy.deepcopy(refinedTweet)
for item in mongoRefined:
item['cleanWords'] = list(item['cleanWords'])
#Refined Tweets are Cached in MongoDB
for item in mongoRefined:
db.refined.insert(item)
#In order to avoid duplicates
db.refined.ensure_index([("id" , pymongo.ASCENDING), ("unique" , True), ("dropDups" , True)])
u'id_1_unique_True_dropDups_True'
#Now we shall fetch ALL the tweets gathered so far
allTweets = db.refined.find()
data = []
for item in allTweets:
data.append(item)
len(data)
197
data[0]
{u'_id': ObjectId('54f5176844356233cba2322c'), u'cleanText': u'rt @jonoyeong @sarajchipps first experience with hardware is fantastic am god listening to the new codenewbie podcast', u'cleanUrl': u'http://marchisformakers.com/?utm_content=buffera6071&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer', u'cleanWords': [u'fantastic', u'listening', u'god', u'am', u'experience', u'codenewbie', u'hardware', u'@sarajchipps', u'@jonoyeong', u'new', u'podcast', u'first'], u'created_at': u'Mon Mar 02 23:05:28 +0000 2015', u'id': 572533209621114880L, u'is_active': True, u'is_tested': False, u'text': u'RT @JonoYeong: @SaraJChipps first experience with hardware is fantastic "I am a god". Listening to the new #codenewbie podcast! http://t.co\u2026', u'timestamp': datetime.datetime(2015, 3, 3, 7, 37, 30, 186000), u'urls': [{u'display_url': u'marchisformakers.com/?utm_content=b\u2026', u'expanded_url': u'http://marchisformakers.com/?utm_content=buffera6071&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer', u'indices': [139, 140], u'url': u'http://t.co/PjMVhln9ZJ'}], u'user_screen_name': u'shanselman'}
refData = copy.deepcopy(data)
for item in refData:
item['cleanWords'] = set(item['cleanWords'])
print refData[0]
{u'cleanWords': set([u'fantastic', u'@jonoyeong', u'god', u'am', u'experience', u'codenewbie', u'hardware', u'@sarajchipps', u'listening', u'new', u'podcast', u'first']), u'user_screen_name': u'shanselman', u'cleanUrl': u'http://marchisformakers.com/?utm_content=buffera6071&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer', u'cleanText': u'rt @jonoyeong @sarajchipps first experience with hardware is fantastic am god listening to the new codenewbie podcast', u'text': u'RT @JonoYeong: @SaraJChipps first experience with hardware is fantastic "I am a god". Listening to the new #codenewbie podcast! http://t.co\u2026', u'created_at': u'Mon Mar 02 23:05:28 +0000 2015', u'is_active': True, u'is_tested': False, u'urls': [{u'url': u'http://t.co/PjMVhln9ZJ', u'indices': [139, 140], u'expanded_url': u'http://marchisformakers.com/?utm_content=buffera6071&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer', u'display_url': u'marchisformakers.com/?utm_content=b\u2026'}], u'timestamp': datetime.datetime(2015, 3, 3, 7, 37, 30, 186000), u'_id': ObjectId('54f5176844356233cba2322c'), u'id': 572533209621114880L}
#collections of only the 'clean' words of the tweets
documents_straight = []
for item in refData:
documents_straight.append(item['cleanWords'])
To find the nearly duplicate tweets
def jaccard_set(s1, s2):
u = s1.union(s2)
i = s1.intersection(s2)
if len(u) != 0:
return float(len(i))/float(len(u))
combinations = list( itertools.combinations([x for x in range(len(documents_straight))], 2) )
# print("combinations=%s") %(combinations)
# compare each pair in combinations tuple of the sets of their words
t3 = time()
dupList1 = []
dupList2 = []
#dupJson = []
for c in combinations:
i1 = c[0]
i2 = c[1]
jac = jaccard_set(documents_straight[i1], documents_straight[i2])
if jac == 1:
#print("%s : %s,%s : jaccard=%s") %(c, shingles[i1],shingles[i2],jac)
dupList2.append(c)
#later
#dupJson.append({'c':c,'jac':jac,
#print("%s : jaccard=%s") %(c,jac)
elif jac < 1 and jac >= 0.5:
dupList1.append(c)
t4 = time()
print "time taken:", t4-t3
print "number of exact duplicate pairs:", len(dupList2)
print "number of near duplicate pairs:", len(dupList1)
time taken: 0.0809350013733 number of exact duplicate pairs: 1 number of near duplicate pairs: 3
dupList2
[(88, 123)]
import networkx as nx
g1 = nx.Graph(dupList1)
o1= nx.connected_components(g1)
duplicates1 = []
for item in o1:
duplicates1.append(item)
g2 = nx.Graph(dupList2)
o2= nx.connected_components(g2)
duplicates2 = []
for item in o2:
duplicates2.append(item)
len(duplicates2)
1
duplicates1
[[2, 7], [195, 155], [168, 190]]
#view all near-duplicates!
testing = duplicates1[:2]
for item in testing:
for i in range(len(item)):
print item[i], '\n--------'
print "ID: ", refData[item[i]]['id'], "\nOriginal Tweet: ", refData[item[i]]['text'] ,'\n\nURL:', refData[item[i]]['cleanUrl'] ,'\n\nPosted By:',\
refData[item[i]]['user_screen_name'] ,'\n', refData[item[i]]['cleanWords']
hr('-')
hr()
2 -------- ID: 572578700962623488 Original Tweet: RT @IndianGuru: Rocking #golang - Cross compilation just got a whole lot better in Go 1.5 http://t.co/bL2aGoVzpQ @davecheney URL: http://dave.cheney.net/2015/03/03/cross-compilation-just-got-a-whole-lot-better-in-go-1-5 Posted By: GopherConIndia set([u'golang', u'just', u'compilation', u'better', u'@indianguru', u'lot', u'go', u'got', u'rocking', u'@davecheney']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 7 -------- ID: 572578011423379456 Original Tweet: Cross compilation just got a whole lot better in Go 1.5 http://t.co/v8pRstw57R URL: http://dave.cheney.net/2015/03/03/cross-compilation-just-got-a-whole-lot-better-in-go-1-5 Posted By: newsycombinator set([u'just', u'compilation', u'better', u'lot', u'go', u'got']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- #################################################################################################################################################################### 195 -------- ID: 572515270104899585 Original Tweet: February in Africa: All the tech news you shouldn’t miss from the past month http://t.co/rWh0HYtDws http://t.co/pF44KZfB9f URL: http://tnw.me/jjS5v3H Posted By: TheNextWeb set([u'february', u'africa', u'shouldn\u2019t', u'past', u'tech', u'news', u'month', u'miss']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 155 -------- ID: 572528661754195968 Original Tweet: February in Latin America: All the tech news you shouldn’t miss from the past month http://t.co/7X6sxQDLpN http://t.co/i5ixEAdbXe URL: http://tnw.me/ePFGryk Posted By: TheNextWeb set([u'february', u'latin', u'america', u'shouldn\u2019t', u'past', u'tech', u'news', u'month', u'miss']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- ####################################################################################################################################################################
duplicates2
[[88, 123]]
duplicates1
[[2, 7], [195, 155], [168, 190]]
#view all duplicates!
testing = duplicates1
for item in testing:
for i in range(len(item)):
print item[i], '\n--------'
print "Original Tweet: ", refData[item[i]]['text'] ,'\n\nURL:', refData[item[i]]['cleanUrl'] ,'\n\nPosted By:',\
refData[item[i]]['user_screen_name'] ,'\n', refData[item[i]]['cleanWords']
hr('-')
hr()
2 -------- Original Tweet: RT @IndianGuru: Rocking #golang - Cross compilation just got a whole lot better in Go 1.5 http://t.co/bL2aGoVzpQ @davecheney URL: http://dave.cheney.net/2015/03/03/cross-compilation-just-got-a-whole-lot-better-in-go-1-5 Posted By: GopherConIndia set([u'golang', u'just', u'compilation', u'better', u'@indianguru', u'lot', u'go', u'got', u'rocking', u'@davecheney']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 7 -------- Original Tweet: Cross compilation just got a whole lot better in Go 1.5 http://t.co/v8pRstw57R URL: http://dave.cheney.net/2015/03/03/cross-compilation-just-got-a-whole-lot-better-in-go-1-5 Posted By: newsycombinator set([u'just', u'compilation', u'better', u'lot', u'go', u'got']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- #################################################################################################################################################################### 195 -------- Original Tweet: February in Africa: All the tech news you shouldn’t miss from the past month http://t.co/rWh0HYtDws http://t.co/pF44KZfB9f URL: http://tnw.me/jjS5v3H Posted By: TheNextWeb set([u'february', u'africa', u'shouldn\u2019t', u'past', u'tech', u'news', u'month', u'miss']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 155 -------- Original Tweet: February in Latin America: All the tech news you shouldn’t miss from the past month http://t.co/7X6sxQDLpN http://t.co/i5ixEAdbXe URL: http://tnw.me/ePFGryk Posted By: TheNextWeb set([u'february', u'latin', u'america', u'shouldn\u2019t', u'past', u'tech', u'news', u'month', u'miss']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- #################################################################################################################################################################### 168 -------- Original Tweet: What it's like to need hardly any sleep (via @NYMag) http://t.co/KWirPlqium URL: http://f-st.co/VRwogs2 Posted By: FastCompany set([u'via', u'like', u'to', u'sleep', u'@nymag', u'need', u'hardly']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 190 -------- Original Tweet: What It’s Like to Need Hardly Any Sleep http://t.co/clgBUaT5X5 URL: http://nymag.com/scienceofus/2015/02/what-its-like-to-need-hardly-any-sleep.html Posted By: newsycombinator set([u'need', u'hardly', u'it\u2019s', u'sleep', u'like']) -------------------------------------------------------------------------------------------------------------------------------------------------------------------- ####################################################################################################################################################################