import json
import itertools
import re
import copy
from time import time
from datetime import datetime
from pprint import pprint

import twitter
import pymongo
from pymongo import MongoClient

from hr import hr

# Connect to the local MongoDB instance used as the tweet cache
client = MongoClient()
db = client.dedup
collection = db.refined

# Load the Twitter OAuth credentials from keys.txt
authval = json.load(open("keys.txt"))
CONSUMER_KEY = authval['CONSUMER_KEY']
CONSUMER_SECRET = authval['CONSUMER_SECRET']
OAUTH_TOKEN = authval['OAUTH_TOKEN']
OAUTH_TOKEN_SECRET = authval['OAUTH_TOKEN_SECRET']

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
t = twitter.Twitter(auth=auth)

# Check that the credentials belong to a valid user
def verify():
    verificationDetails = t.account.verify_credentials()
    print "Name: ", verificationDetails['name']
    print "Screen Name: ", verificationDetails['screen_name']

verify()

# Pick up where the previous run stopped: the id of the last tweet fetched
sinceCounter = None
getLast = db.last.find({}).sort([('_id', -1)]).limit(1)
for item in getLast:
    sinceCounter = item['lastTweet']
print sinceCounter

t1 = time()
completeTimeline = t.statuses.home_timeline(count=200, since_id=sinceCounter)
t2 = time()
print "Time taken to load tweets: ", t2 - t1
print "Number of tweets fetched: ", len(completeTimeline)

# Remember the highest tweet id seen so the next run only asks for newer tweets
if completeTimeline:
    lastTweet = max(tweet['id'] for tweet in completeTimeline)
    endTweet = {'lastTweet': lastTweet, 'created_on': datetime.now()}
    db.last.insert(endTweet)

# Punctuation to strip from tweets; "@" is kept so mentions survive
from string import punctuation
set_punct = set(punctuation) - {"@"}

def sanitize(text, set_excludes):
    """Return a sanitized version of the string `text`:
    lower-cased, URLs dropped, punctuation replaced by spaces,
    and one-character words removed."""
    text = text.lower()
    text = " ".join([w for w in text.split() if "http://" not in w])
    letters_noPunct = [(" " if c in set_excludes else c) for c in text]
    text = "".join(letters_noPunct)
    words = text.split()
    long_enuf_words = [w.strip() for w in words if len(w) > 1]
    return " ".join(long_enuf_words)

print "Characters that will be removed from the tweets:\n", set_punct
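# A quick, illustrative check of sanitize(). The sample tweet below is made up
# for demonstration and is not part of the fetched timeline: the URL is
# dropped, punctuation (except "@") becomes whitespace, and one-character
# tokens such as the trailing "1" disappear.
sample = "Check THIS out!! http://t.co/abc @user_1 :)"
print sanitize(sample, set_punct)   # expected: check this out @user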
# List of stop words; kept as a set so membership tests match whole words only
stop = set("""about above across after afterwards again against all almost alone along
already also although always am among amongst amoungst amount and another any anyhow
anyone anything anyway anywhere are around as at back became because become becomes
becoming been before beforehand behind being below beside besides between beyond bill
both bottom but call can cannot cant computer con could couldnt cry describe detail do
done down due during each eight either eleven else elsewhere empty enough etc even ever
every everyone everything everywhere except few fifteen fify fill find fire first five
for former formerly forty found four from front full further get give had has hasnt
have hence her here hereafter hereby herein hereupon hers him his how however hundred
indeed interest into its keep last latter latterly least less ltd made many may me
meanwhile might mill mine more moreover most mostly move much must my name namely
neither never nevertheless next nine nobody none noone nor not nothing now nowhere off
often once one only onto other others otherwise our ours ourselves out over own part
per perhaps please put rather same see seem seemed seeming seems serious several she
should show side since sincere six sixty some somehow someone something sometime
sometimes somewhere still such system take ten than that the their them themselves then
thence there thereafter thereby therefore therein thereupon these they thick thin third
this those though three through throughout thru thus together too top toward towards
twelve twenty two under until upon very via was well were what whatever when whence
whenever where whereafter whereas whereby wherein whereupon wherever whether which
while whither who whoever whole whom whose why will with within without would yet you
your yours yourself yourselves""".split())

def Refine(raw_tweet):
    """Reduce a raw tweet to the fields needed for deduplication."""
    simple = {}
    simple['text'] = raw_tweet['text']
    simple['cleanText'] = sanitize(raw_tweet['text'], set_punct)

    # Keep only the distinct, non-stop words of the cleaned text
    set_words = set(simple['cleanText'].split())
    simple['cleanWords'] = set_words - stop

    simple['id'] = raw_tweet['id']
    simple['user_screen_name'] = raw_tweet['user']['screen_name']
    simple['created_at'] = raw_tweet['created_at']
    simple['timestamp'] = datetime.now()
    simple['is_active'] = True
    simple['is_tested'] = False

    # The first expanded URL, if any; shortened URLs are not resolved here
    try:
        simple['urls'] = raw_tweet['entities']['urls']
        simple['cleanUrl'] = raw_tweet['entities']['urls'][0]['expanded_url']
    except (KeyError, IndexError):
        simple['cleanUrl'] = None
    return simple

refinedTweet = []
t1 = time()
for tweet in completeTimeline:
    refinedTweet.append(Refine(tweet))
t2 = time()

# MongoDB cannot store sets, so convert cleanWords to a list before inserting
mongoRefined = copy.deepcopy(refinedTweet)
for item in mongoRefined:
    item['cleanWords'] = list(item['cleanWords'])

# Refined tweets are cached in MongoDB
for item in mongoRefined:
    db.refined.insert(item)

# A unique index on the tweet id avoids storing duplicates across runs
db.refined.ensure_index([("id", pymongo.ASCENDING)], unique=True, dropDups=True)

# Now fetch ALL the tweets gathered so far
data = list(db.refined.find())
print len(data)
print data[0]

refData = copy.deepcopy(data)
for item in refData:
    item['cleanWords'] = set(item['cleanWords'])
print refData[0]

# Collection of only the 'clean' word sets of the tweets
documents_straight = [item['cleanWords'] for item in refData]

def jaccard_set(s1, s2):
    """Jaccard similarity of two sets: |intersection| / |union|."""
    u = s1.union(s2)
    i = s1.intersection(s2)
    if len(u) == 0:
        return 0.0
    return float(len(i)) / float(len(u))

# Compare every pair of tweets by the Jaccard similarity of their word sets
combinations = list(itertools.combinations(range(len(documents_straight)), 2))

t3 = time()
dupList1 = []   # near-duplicate pairs (0.5 <= similarity < 1)
dupList2 = []   # exact-duplicate pairs (similarity == 1)
for i1, i2 in combinations:
    jac = jaccard_set(documents_straight[i1], documents_straight[i2])
    if jac == 1:
        dupList2.append((i1, i2))
    elif 0.5 <= jac < 1:
        dupList1.append((i1, i2))
t4 = time()
print "time taken:", t4 - t3
print "number of exact duplicate pairs:", len(dupList2)
print "number of near duplicate pairs:", len(dupList1)
print dupList2

# Group the duplicate pairs into clusters: each pair is an edge, and every
# connected component of the resulting graph is one cluster of duplicates
import networkx as nx

g1 = nx.Graph(dupList1)
duplicates1 = list(nx.connected_components(g1))

g2 = nx.Graph(dupList2)
duplicates2 = list(nx.connected_components(g2))

print len(duplicates2)
print duplicates1
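# Illustrative only: a tiny, made-up example of the two steps above. The word
# sets and index pairs are invented and are not taken from the timeline.
s_a = {"python", "mongodb", "dedup"}
s_b = {"python", "mongodb", "dedup"}
s_c = {"python", "mongodb", "tweets"}
print jaccard_set(s_a, s_b)   # 1.0 -> would land in dupList2 (exact duplicate)
print jaccard_set(s_a, s_c)   # 0.5 -> would land in dupList1 (near duplicate)

example_pairs = [(0, 1), (1, 2), (5, 6)]
print list(nx.connected_components(nx.Graph(example_pairs)))
# two clusters: one containing 0, 1, 2 and one containing 5, 6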
# View the first few clusters of near-duplicates
testing = duplicates1[:2]
for item in testing:
    for idx in item:
        print idx, '\n--------'
        print "ID: ", refData[idx]['id'], \
            "\nOriginal Tweet: ", refData[idx]['text'], \
            '\n\nURL:', refData[idx]['cleanUrl'], \
            '\n\nPosted By:', refData[idx]['user_screen_name'], \
            '\n', refData[idx]['cleanWords']
        hr('-')
    hr()

# View all clusters of near-duplicates
testing = duplicates1
for item in testing:
    for idx in item:
        print idx, '\n--------'
        print "Original Tweet: ", refData[idx]['text'], \
            '\n\nURL:', refData[idx]['cleanUrl'], \
            '\n\nPosted By:', refData[idx]['user_screen_name'], \
            '\n', refData[idx]['cleanWords']
        hr('-')
    hr()
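# One possible next step (a sketch only, not part of the pipeline above): keep
# the first tweet of every exact-duplicate cluster and mark the rest inactive
# in MongoDB, using the is_active flag set in Refine(). Which tweet to keep is
# chosen arbitrarily here (lowest index in the cluster).
for cluster in duplicates2:
    members = sorted(cluster)
    for idx in members[1:]:
        db.refined.update({'id': refData[idx]['id']},
                          {'$set': {'is_active': False}})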