from __future__ import unicode_literals

# You need an API key for PLOS.
import settings

# Data analysis
import numpy as np
import pandas as pd
from numpy import nan
from pandas import Series, DataFrame

# Interacting with the API
import requests
import urllib
import time
from retrying import retry
import os
import random
import json

# Natural language processing
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
import string

# For the IPython widgets:
from IPython.display import display, Image, HTML, clear_output
from IPython.html import widgets
from jinja2 import Template

r = requests.get('http://api.plos.org/search?q=subject:"biotechnology"'
                 '&start=0&rows=500&api_key=%s&wt=json' % settings.PLOS_KEY).json()
len(r['response']['docs'])

# Write out a file.
with open('biotech500.json', 'wb') as fp:
    json.dump(r, fp)

abstract_df = pd.read_pickle('../data/abstract_df.pkl')
len(list(abstract_df.author))
print list(abstract_df.subject)[0]
abstract_df.tail()

# Globally define a set of stopwords.
stops = set(stopwords.words('english'))
# We can add science-y stuff to it as well. Just an example:
stops.add('conclusions')


def wordify(abs_list, min_word_len=2):
    '''Convert the abstract field from PLOS API data to a filtered list of words.'''
    # The abstract field is a list. Make it a string.
    text = ' '.join(abs_list).strip(' \n\t')
    if text == '':
        return nan
    else:
        # Remove punctuation & replace with space, because we want
        # 'metal-contaminated' => 'metal contaminated',
        # not 'metalcontaminated', and so on.
        for c in string.punctuation:
            text = text.replace(c, ' ')
        # Now make it a Series of words, and do some cleaning.
        words = Series(text.split(' '))
        words = words.str.lower()
        # Filter out words shorter than the minimum word length.
        words = words[words.str.len() >= min_word_len]
        # Keep only tokens made entirely of lowercase letters (plus # and @);
        # this drops anything containing digits, accents, or leftover symbols.
        words = words[~words.str.contains(r'[^#@a-z]')]
        # Filter out globally-defined stopwords.
        ignore = stops & set(words.unique())
        words_out = [w for w in words.tolist() if w not in ignore]
        return words_out
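# A quick sanity check on wordify (the abstract below is made up, not from the
# PLOS data): punctuation becomes spaces, words are lowercased, and tokens
# shorter than min_word_len or in the stopword set drop out.
print wordify(['Metal-contaminated soils: a case study of bioremediation.'],
              min_word_len=3)
# Should print something like:
# ['metal', 'contaminated', 'soils', 'case', 'study', 'bioremediation']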
with open('biotech500.json', 'rb') as fp:
    data = json.load(fp)

articles_list = data['response']['docs']
articles = DataFrame(articles_list)
articles = articles[articles['abstract'].notnull()]
articles.head()

articles['words'] = articles.apply(
    lambda s: wordify(s['abstract'] + [s['title_display']]), axis=1)
articles.drop(['article_type', 'score', 'title_display', 'abstract'],
              axis=1, inplace=True)
articles.head()

abs_df = DataFrame(articles['words'].apply(lambda x: ' '.join(x)).tolist(),
                   columns=['text'])
abs_df.head()

# Include all words from abstracts for getting common word pairs.
words_all = pd.Series(' '.join(abs_df['text']).split(' '))
words_all.value_counts()

relevant_words_pairs = words_all.copy()
relevant_words_pairs.value_counts()

bcf = BigramCollocationFinder.from_words(relevant_words_pairs)
for pair in bcf.nbest(BigramAssocMeasures.likelihood_ratio, 30):
    print ' '.join(pair)

bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)

# Use the *set* of words per abstract, so each word counts once per article.
abs_set_df = DataFrame(articles['words'].apply(lambda x: ' '.join(set(x))).tolist(),
                       columns=['text'])
abs_set_df.head()

words = pd.Series(' '.join(abs_set_df['text']).split(' '))
words.value_counts()

top_words = words.value_counts().reset_index()
top_words.columns = ['word', 'count']
top_words.head(15)
# top_words.to_csv('../wordcloud2.csv', index=False)

articles_list = data['response']['docs']
articles = DataFrame(articles_list)
articles = articles[articles['abstract'].notnull()].ix[:, ['abstract', 'publication_date']]
# Pass min_word_len through args: apply(wordify, 3) would bind 3 to
# convert_dtype, not to the keyword we actually want.
articles.abstract = articles.abstract.apply(wordify, args=(3,))
articles = articles[articles['abstract'].notnull()]
articles.publication_date = pd.to_datetime(articles.publication_date)
articles.head()

print articles.publication_date.min(), articles.publication_date.max()
print len(articles)

articles_timed = articles.set_index('publication_date')
articles_timed.head()

# Resample monthly; summing concatenates the per-article word lists.
articles_monthly = articles_timed.resample('M', how='sum',
                                           fill_method='ffill', kind='period')
articles_monthly.abstract = articles_monthly.abstract.apply(
    lambda x: np.nan if x == 0 else x)
articles_monthly.fillna(method='ffill', inplace=True)
articles_monthly.head()

widgetmax = len(articles_monthly) - 1
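# A brief aside, not part of the original analysis: articles_monthly.abstract now
# holds one concatenated word list per month, so a per-month frequency check is
# cheap. Index position 0 is an arbitrary example, not a meaningful month.
month_words = Series(articles_monthly.ix[0]['abstract'])
month_words.value_counts().head(10)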
def textbarf(t):
    html_template = """
    {{blargh}}
    """
    blob = ' '.join(articles_monthly.ix[t]['abstract'])
    html_src = Template(html_template).render(blargh=blob)
    display(HTML(html_src))

widgets.interact(textbarf,
                 t=widgets.IntSliderWidget(min=0, max=widgetmax, step=1, value=42),
                 )