import itertools
import urllib
import urllib2
from operator import itemgetter

import matplotlib.pyplot as plt
from BeautifulSoup import BeautifulSoup
from topia.termextract import extract

# Fetch the article page and parse the title and the author list out of
# the <meta name="citation_author"> tags.
article_url = 'http://www.pnas.org/content/101/7/1822'
response = urllib2.urlopen(article_url)
html_response = response.read()
data = BeautifulSoup(html_response)

title = data.findAll('title')
print title
authors = data.findAll('meta', {'name': 'citation_author'})
author_names = [a['content'].encode('utf8') for a in authors]
print author_names

# Query PubMed's esearch endpoint for papers co-authored by the first
# two authors.
pubmed_param = {'db': 'pubmed',
                'usehistory': 'y',
                'term': author_names[0] + ' AND ' + author_names[1] + '[author]'}
pubmed_encoded_param = urllib.urlencode(pubmed_param)
pubmed_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
pubmed_url = pubmed_url + '?' + pubmed_encoded_param
pubmed_response = urllib2.urlopen(pubmed_url)
xml_response = pubmed_response.read()
pubmed_data = BeautifulSoup(xml_response)

# Pull the PubMed IDs out of the <Id> nodes of the XML response.
pubmed_ids = [node.findAll(text=True) for node in pubmed_data.findAll('id')]
pubmed_ids = [int(pubmed_id) for pm_list in pubmed_ids for pubmed_id in pm_list]
print pubmed_ids

# Build every non-empty combination of the paper's authors; the more
# authors a hit shares with the paper, the more significant the match.
author_combinations = [list(itertools.combinations(author_names, number_authors))
                       for number_authors in range(len(author_names) + 1)]
author_combinations = [list(auth_comb)
                       for auth_list in author_combinations
                       for auth_comb in auth_list
                       if len(auth_comb) > 0]
print author_combinations
print ' AND '.join(author_combinations[-1])

# Run an esearch query for each author combination and record the
# matching PubMed IDs.
uid_data = []
for auth_comb in author_combinations:
    pubmed_param = {'db': 'pubmed',
                    'usehistory': 'y',
                    'term': ' AND '.join(auth_comb) + '[author]'}
    pubmed_encoded_param = urllib.urlencode(pubmed_param)
    pubmed_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    pubmed_url = pubmed_url + '?' + pubmed_encoded_param
    pubmed_response = urllib2.urlopen(pubmed_url)
    xml_response = pubmed_response.read()
    pubmed_data = BeautifulSoup(xml_response)
    pubmed_ids = [node.findAll(text=True) for node in pubmed_data.findAll('id')]
    pubmed_ids = [int(pubmed_id) for pm_list in pubmed_ids for pubmed_id in pm_list]
    print auth_comb, pubmed_ids
    uid_data.append([auth_comb, pubmed_ids])

# Plot the number of matching papers against the size of the author
# combination that found them.
x = [len(t[0]) for t in uid_data if len(t[1]) > 0]
y = [len(t[1]) for t in uid_data if len(t[1]) > 0]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid(True)
ax.set_xlabel('number of authors on paper = significance of match')
ax.set_ylabel('number of papers found on PubMed')
#ax.set_ylim(bottom=0)
#ax.set_xlim([0,5])
#ax.set_yticks([1,2,7,13,19])
#ax.set_xticks([1,2,3,4,5])
ax.scatter(x, y)
plt.show()

# Fetch the abstract of every matched paper via efetch, keeping track of
# which author combination produced the match.
abstracts = []
for auth_comb, uids in uid_data:
    if len(uids) > 0:
        pubmed_param = {'db': 'pubmed',
                        'usehistory': 'y',
                        'id': ','.join([str(uid) for uid in uids]),
                        'retmode': 'xml'}
        pubmed_encoded_param = urllib.urlencode(pubmed_param)
        pubmed_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        pubmed_url = pubmed_url + '?' + pubmed_encoded_param
        pubmed_response = urllib2.urlopen(pubmed_url)
        xml_response = pubmed_response.read()
        pubmed_data = BeautifulSoup(xml_response)
        pubmed_abstracts = [node.findAll(text=True)
                            for node in pubmed_data.findAll('abstracttext')]
        for abstract in pubmed_abstracts:
            abstracts.append((auth_comb, abstract[0]))

print len(abstracts)
print abstracts[0]
print abstracts[1]

# Extract keywords from each abstract with topia.termextract, sort them
# by term strength, and weight each keyword by its strength plus the
# size of the author combination that surfaced the abstract.
extractor = extract.TermExtractor()
keywords_weighted = {}
for auth_comb, abstract_text in abstracts:
    keywords = sorted(extractor(abstract_text), key=itemgetter(2), reverse=True)
    # Drop terms containing punctuation or abbreviation fragments.
    keywords_filtered = []
    for keyword in keywords:
        include = True
        for el in [')', '(', 'i.e', 'e.g', '.', ',', '/', '\\', '*', ';', '&']:
            if el in keyword[0]:
                include = False
        if include:
            keywords_filtered.append(keyword)
    for keyword in keywords_filtered:
        if keyword[0] not in keywords_weighted:
            keywords_weighted[keyword[0]] = keyword[2] + len(auth_comb)
        else:
            keywords_weighted[keyword[0]] += keyword[2] + len(auth_comb)

# Bucket the keywords by weight for inspection.
print [key for key in keywords_weighted if keywords_weighted[key] > 20]
print [key for key in keywords_weighted if 10 < keywords_weighted[key] <= 20]
print [key for key in keywords_weighted if 5 < keywords_weighted[key] <= 10]
print [key for key in keywords_weighted if keywords_weighted[key] <= 5]
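# ---------------------------------------------------------------------
# For reference: urllib2 and BeautifulSoup 3 exist only on Python 2. A
# minimal sketch of the same esearch call for Python 3, assuming only
# the standard library; 'Smith J[author]' is a placeholder query term,
# not one of the authors above. Kept commented out so the Python 2
# script above still runs as a unit.
#
#   import urllib.parse
#   import urllib.request
#   import xml.etree.ElementTree as ET
#
#   # Build the esearch query string for the E-utilities endpoint.
#   params = urllib.parse.urlencode({'db': 'pubmed',
#                                    'term': 'Smith J[author]'})
#   url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + params
#   with urllib.request.urlopen(url) as resp:
#       tree = ET.fromstring(resp.read())
#   # esearch returns <IdList><Id>...</Id></IdList>; collect the IDs.
#   pubmed_ids = [int(node.text) for node in tree.iter('Id')]
#   print(pubmed_ids)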