import time
import pandas as pd
import json
import urllib
import urllib2
import datetime
import unicodedata

user_agent = ("Project for Data Science class v1.0"
              " https://github.com/jaysayre/intelligentdolphins")


def json_extract(baseurl, headrs=None, params=None, extraparam=None):
    '''
    Helper function to download and parse JSON data. URL-encodes any query
    parameters, sends the given headers, and returns the parsed JSON dict.
    '''
    if params is not None:
        if extraparam is not None:
            params['t'] = extraparam
        form = urllib.urlencode(params)
        url = baseurl + form
    else:
        url = baseurl
    if headrs is not None:
        request = urllib2.Request(url, headers=headrs)
    else:
        request = urllib2.Request(url)
    return json.loads(urllib2.urlopen(request).read())


def subreddit_json_parser(data, ids, title, upvts, downvts, authors, comments,
                          score, media, distin, selftxt, dates):
    ''' Appends the fields of each post in the JSON response to the relevant list. '''
    for post in data['data']['children']:
        if post['kind'] == 't3':  # 't3' is reddit's type code for a link/self post
            for key, value in post['data'].items():
                if key == "title": title.append(value)
                elif key == "id": ids.append(str(value))
                elif key == "ups": upvts.append(value)
                elif key == "downs": downvts.append(value)
                elif key == "author": authors.append(value)
                elif key == "num_comments": comments.append(value)
                elif key == "score": score.append(value)
                elif key == "media": media.append(value)
                elif key == "distinguished": distin.append(value)
                elif key == "selftext": selftxt.append(value)
                elif key == "created_utc": dates.append(value)


def get_subreddit_df(subreddit, sort_call, user_agent, n, t=None, api_call_limit=100, status=False):
    '''
    Builds a subreddit dataframe.

    Parameters --
    subreddit: which reddit subreddit to pull information from
    sort_call: whether to sort by top, hot, rising, new, etc.
    user_agent: identifies us so reddit won't reject our requests
    n: int, how many top results to download
    t: time scope of the listing (hour, day, week, year, all)
    api_call_limit: reddit doesn't serve more than 100 results per request
    status: whether or not to print download progress to the console

    Returns --
    A pandas dataframe containing various bits of information about the top posts.
    '''
    reddit_base = 'http://www.reddit.com/r/%s/%s.json?' % (subreddit, sort_call)  # Base api call
    headers = {'User-agent': user_agent}

    # Empty lists for the information we'll extract
    ids, title, upvts, downvts, authors, comments, score, media, distin, selftxt, dates = \
        [], [], [], [], [], [], [], [], [], [], []

    # Makes sure n and api_call_limit aren't floats!
    n = int(n)
    api_call_limit = int(api_call_limit)

    # Since reddit serves at most api_call_limit results at a time, split the
    # n requested posts into separate paginated requests
    if n % api_call_limit != 0:
        remainder = n % api_call_limit
        num = (n / api_call_limit) + 1
    else:
        num = n / api_call_limit
        remainder = api_call_limit

    # Makes an api call for all n entries based on the api call limit
    for i in range(num):
        if i == 0:
            # First page: always requests a full page of api_call_limit posts
            post_params = {'limit': api_call_limit}
            jsondata = json_extract(reddit_base, headers, post_params, t)
            tostartfrom = jsondata['data']['after']
            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors,
                                  comments, score, media, distin, selftxt, dates)
        elif i == num - 1:
            post_params = {'limit': remainder, 'after': tostartfrom}  # 'after' indicates the post to continue from
            jsondata = json_extract(reddit_base, headers, post_params, t)
            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors,
                                  comments, score, media, distin, selftxt, dates)
        else:
            post_params = {'limit': api_call_limit, 'after': tostartfrom}
            jsondata = json_extract(reddit_base, headers, post_params, t)
            tostartfrom = jsondata['data']['after']
            subreddit_json_parser(jsondata, ids, title, upvts, downvts, authors,
                                  comments, score, media, distin, selftxt, dates)
        time.sleep(2)  # Be polite to reddit's servers between requests
        if status:
            print "Downloaded %s posts..." % len(set(ids))

    tempdict = {'id': ids, 'title': title, 'upvotes': upvts, 'media': media,
                'distinguished': distin, 'selftext': selftxt, 'downvotes': downvts,
                'comments': comments, 'score': score, 'author': authors,
                'time_created': dates}
    return pd.DataFrame(tempdict)
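
# A quick, optional sanity check for the downloader above -- a sketch, not
# called anywhere in this script. The subreddit and section used here are
# just examples; any public subreddit should work, assuming reddit's JSON
# listing endpoints are reachable from this machine.
def _demo_small_pull():
    sample = get_subreddit_df('askscience', 'top', user_agent, 100, t='week', status=True)
    print "Pulled %d posts from r/askscience (top, week)" % len(sample)
    print sample[['title', 'score', 'comments']].head()
    return sample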
# Helper function to determine if a value can be read as a float
def isfloat(x):
    try:
        float(x)
        return True
    except Exception:
        return False


# Helper function to clean strings: keeps letters and spaces, drops digits and punctuation
def filterstr(x):
    try:
        return ''.join(e for e in x if (e == ' ' or e.isalnum()) and not e.isdigit())
    except Exception:
        return u''


# Function to clean downloaded information
def subset_subreddit(df):
    df = df[df['distinguished'].apply(lambda x: x is None)]  # Makes sure poster isn't a moderator (distinguished)
    df = df[df['media'].apply(lambda x: x is None)]          # Also makes sure post is text based
    df = df[df['title'].apply(lambda x: x is not None)]      # Makes sure title isn't missing
    df = df.drop('distinguished', axis=1)
    df = df.drop('media', axis=1)

    # Occasionally, text strings will be in columns where only ints should be. This takes care of that
    df = df[df['comments'].apply(isfloat)]
    df = df[df['downvotes'].apply(isfloat)]
    df = df[df['score'].apply(isfloat)]
    df = df[df['upvotes'].apply(isfloat)]

    # Ids can be weird as well; sometimes they manifest as long numbers.
    # Reddit uses an id system in base 36, so a reasonable id shouldn't be longer than 8 characters
    df = df[df['id'].apply(lambda x: len(str(x)) <= 8)]

    # Filter out characters that cause problems later
    df['title'] = df['title'].apply(filterstr)
    df['author'] = df['author'].apply(filterstr)
    df['selftext'] = df['selftext'].apply(filterstr)

    # Removes accented characters
    df['title'] = df['title'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
    df['selftext'] = df['selftext'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))

    df = df.drop_duplicates()  # Remove duplicate entries, should they exist
    return df
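
# Small, optional illustration of the cleaning helpers above -- a sketch that
# is not called by the scraper; the sample strings below are made up.
def _demo_cleaning_helpers():
    assert isfloat('42') and isfloat('3.14')
    assert not isfloat('banana')
    # filterstr keeps letters and spaces, drops digits and punctuation
    assert filterstr(u'C3PO & R2D2!') == u'CPO  RD'
    print "Cleaning helper checks passed."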
# Names of the popular text-based subreddits we want to look at
subreddits = ['explainlikeimfive', 'AskReddit', 'TalesFromTechsupport', 'talesFromRetail',
              'pettyrevenge', 'askhistorians', 'askscience', 'tifu', 'nosleep', 'jokes',
              'atheism', 'politics']

# Append a date stamp to a txt file to record when this download happened
timeformat = '%m-%d-%y-%H%M'
printdate = datetime.datetime.now().strftime(timeformat)
with open('Data/New/dltimestamp.txt', 'a') as the_file:
    the_file.write(printdate + '\n')

for subreddit in subreddits:
    #print subreddit
    # Reddit only serves up to 1000 posts per section of a subreddit,
    # so we grab as much as possible from each listing
    top_all = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'all', status=False)
    top_week = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'week', status=False)
    top_day = get_subreddit_df(subreddit, 'top', user_agent, 1000, 'day', status=False)
    hot = get_subreddit_df(subreddit, 'hot', user_agent, 1000, status=False)
    new = get_subreddit_df(subreddit, 'new', user_agent, 1000, status=False)

    # Subset and clean each dataframe
    top_all = subset_subreddit(top_all)
    top_week = subset_subreddit(top_week)
    top_day = subset_subreddit(top_day)
    hot = subset_subreddit(hot)
    new = subset_subreddit(new)

    # Add a column in each denoting the name of the subreddit
    top_all['subreddit'] = subreddit
    top_week['subreddit'] = subreddit
    top_day['subreddit'] = subreddit
    hot['subreddit'] = subreddit
    new['subreddit'] = subreddit

    # Add a column denoting the section of the subreddit
    top_all['type'] = 'top_all'
    top_week['type'] = 'top_week'
    top_day['type'] = 'top_day'
    hot['type'] = 'hot'
    new['type'] = 'new'

    # Write the scraped subreddit information to csv files
    dltopall = 'Data/New/' + subreddit + 'top_all.csv'
    dltopweek = 'Data/New/' + subreddit + 'top_week.csv'
    dltopday = 'Data/New/' + subreddit + 'top_day.csv'
    dlhot = 'Data/New/' + subreddit + 'hot.csv'
    dlnew = 'Data/New/' + subreddit + 'new.csv'
    top_all.to_csv(dltopall, index=False, encoding='utf-8')
    top_week.to_csv(dltopweek, index=False, encoding='utf-8')
    top_day.to_csv(dltopday, index=False, encoding='utf-8')
    hot.to_csv(dlhot, index=False, encoding='utf-8')
    new.to_csv(dlnew, index=False, encoding='utf-8')
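
# Optional check on one of the csv files written above -- a sketch, not called
# by the scraper; the default path assumes the current run produced output for
# r/AskReddit's 'new' section under Data/New/.
def _check_output(path='Data/New/AskRedditnew.csv'):
    check = pd.read_csv(path, encoding='utf-8')
    print "Rows: %d, columns: %s" % (len(check), list(check.columns))
    print check[['subreddit', 'type', 'score', 'comments']].head()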