from __future__ import unicode_literals

# You need an API key for PLOS.
import settings

# Data analysis
import numpy as np
import pandas as pd
from numpy import nan
from pandas import Series, DataFrame

# Interacting with the API
import requests
import urllib
import time
from retrying import retry
import os
import random
import json

# Natural language processing
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
import string

# For the IPython widgets:
from IPython.display import display, Image, HTML, clear_output
from IPython.html import widgets
from jinja2 import Template

r = requests.get('http://api.plos.org/search?q=subject:"biotechnology"'
                 '&start=0&rows=500&api_key=%s&wt=json' % settings.PLOS_KEY).json()
len(r['response']['docs'])

# Write out a file.
with open('biotech500.json', 'wb') as fp:
    json.dump(r, fp)

abstract_df = pd.read_pickle('../data/abstract_df.pkl')
len(list(abstract_df.author))
print list(abstract_df.subject)[0]
abstract_df.tail()

# Globally define a set of stopwords.
stops = set(stopwords.words('english'))
# We can add science-y stuff to it as well. Just an example:
stops.add('conclusions')


def wordify(abs_list, min_word_len=2):
    '''Convert the abstract field from PLOS API data to a filtered list of words.'''
    # The abstract field is a list. Make it a string.
    text = ' '.join(abs_list).strip(' \n\t')
    if text == '':
        return nan
    else:
        # Remove punctuation & replace with space, because we want
        # 'metal-contaminated' => 'metal contaminated',
        # not 'metalcontaminated', and so on.
        for c in string.punctuation:
            text = text.replace(c, ' ')
        # Now make it a Series of words, and do some cleaning.
        words = Series(text.split(' '))
        words = words.str.lower()
        # Filter out words shorter than the minimum word length.
        words = words[words.str.len() >= min_word_len]
        # Keep only tokens made entirely of lowercase letters (plus # and @);
        # this drops anything containing digits, accents, or leftover symbols.
        words = words[~words.str.contains(r'[^#@a-z]')]
        # Filter out globally-defined stopwords.
        ignore = stops & set(words.unique())
        words_out = [w for w in words.tolist() if w not in ignore]
        return words_out
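# A quick sanity check on wordify (the abstract below is made up, not from the
# PLOS data): punctuation becomes spaces, words are lowercased, and tokens
# shorter than min_word_len or in the stopword set drop out.
print wordify(['Metal-contaminated soils: a case study of bioremediation.'],
              min_word_len=3)
# Should print something like:
# ['metal', 'contaminated', 'soils', 'case', 'study', 'bioremediation']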
with open('biotech500.json', 'rb') as fp:
    data = json.load(fp)

articles_list = data['response']['docs']
articles = DataFrame(articles_list)
articles = articles[articles['abstract'].notnull()]
articles.head()

articles['words'] = articles.apply(
    lambda s: wordify(s['abstract'] + [s['title_display']]), axis=1)
articles.drop(['article_type', 'score', 'title_display', 'abstract'],
              axis=1, inplace=True)
articles.head()

abs_df = DataFrame(articles['words'].apply(lambda x: ' '.join(x)).tolist(),
                   columns=['text'])
abs_df.head()

# Include all words from abstracts for getting common word pairs.
words_all = pd.Series(' '.join(abs_df['text']).split(' '))
words_all.value_counts()

relevant_words_pairs = words_all.copy()
relevant_words_pairs.value_counts()

bcf = BigramCollocationFinder.from_words(relevant_words_pairs)
for pair in bcf.nbest(BigramAssocMeasures.likelihood_ratio, 30):
    print ' '.join(pair)

bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)

# Use the *set* of words per abstract, so each word counts once per article.
abs_set_df = DataFrame(articles['words'].apply(lambda x: ' '.join(set(x))).tolist(),
                       columns=['text'])
abs_set_df.head()

words = pd.Series(' '.join(abs_set_df['text']).split(' '))
words.value_counts()

top_words = words.value_counts().reset_index()
top_words.columns = ['word', 'count']
top_words.head(15)
# top_words.to_csv('../wordcloud2.csv', index=False)

articles_list = data['response']['docs']
articles = DataFrame(articles_list)
articles = articles[articles['abstract'].notnull()].ix[:, ['abstract', 'publication_date']]
# Pass min_word_len through args: apply(wordify, 3) would bind 3 to
# convert_dtype, not to the keyword we actually want.
articles.abstract = articles.abstract.apply(wordify, args=(3,))
articles = articles[articles['abstract'].notnull()]
articles.publication_date = pd.to_datetime(articles.publication_date)
articles.head()

print articles.publication_date.min(), articles.publication_date.max()
print len(articles)

articles_timed = articles.set_index('publication_date')
articles_timed.head()

# Resample monthly; summing concatenates the per-article word lists.
articles_monthly = articles_timed.resample('M', how='sum',
                                           fill_method='ffill', kind='period')
articles_monthly.abstract = articles_monthly.abstract.apply(
    lambda x: np.nan if x == 0 else x)
articles_monthly.fillna(method='ffill', inplace=True)
articles_monthly.head()

widgetmax = len(articles_monthly) - 1
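# A brief aside, not part of the original analysis: articles_monthly.abstract now
# holds one concatenated word list per month, so a per-month frequency check is
# cheap. Index position 0 is an arbitrary example, not a meaningful month.
month_words = Series(articles_monthly.ix[0]['abstract'])
month_words.value_counts().head(10)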
def textbarf(t):
    html_template = """
    {{blargh}}
    """
    blob = ' '.join(articles_monthly.ix[t]['abstract'])
    html_src = Template(html_template).render(blargh=blob)
    display(HTML(html_src))

widgets.interact(textbarf,
                 t=widgets.IntSliderWidget(min=0, max=widgetmax, step=1, value=42),
                 )