import itertools
import urllib
import urllib2
from operator import itemgetter

import matplotlib.pyplot as plt
from BeautifulSoup import BeautifulSoup
from topia.termextract import extract

# Fetch the article page and parse the title and the author list out of
# the <meta name="citation_author"> tags.
article_url = 'http://www.pnas.org/content/101/7/1822'
response = urllib2.urlopen(article_url)
html_response = response.read()
data = BeautifulSoup(html_response)

title = data.findAll('title')
print title
authors = data.findAll('meta', {'name': 'citation_author'})
author_names = [a['content'].encode('utf8') for a in authors]
print author_names

# Query PubMed's esearch endpoint for papers co-authored by the first
# two authors.
pubmed_param = {'db': 'pubmed',
                'usehistory': 'y',
                'term': author_names[0] + ' AND ' + author_names[1] + '[author]'}
pubmed_encoded_param = urllib.urlencode(pubmed_param)
pubmed_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
pubmed_url = pubmed_url + '?' + pubmed_encoded_param
pubmed_response = urllib2.urlopen(pubmed_url)
xml_response = pubmed_response.read()
pubmed_data = BeautifulSoup(xml_response)

# Pull the PubMed IDs out of the <Id> nodes of the XML response.
pubmed_ids = [node.findAll(text=True) for node in pubmed_data.findAll('id')]
pubmed_ids = [int(pubmed_id) for pm_list in pubmed_ids for pubmed_id in pm_list]
print pubmed_ids

# Build every non-empty combination of the paper's authors; the more
# authors a hit shares with the paper, the more significant the match.
author_combinations = [list(itertools.combinations(author_names, number_authors))
                       for number_authors in range(len(author_names) + 1)]
author_combinations = [list(auth_comb)
                       for auth_list in author_combinations
                       for auth_comb in auth_list
                       if len(auth_comb) > 0]
print author_combinations
print ' AND '.join(author_combinations[-1])

# Run an esearch query for each author combination and record the
# matching PubMed IDs.
uid_data = []
for auth_comb in author_combinations:
    pubmed_param = {'db': 'pubmed',
                    'usehistory': 'y',
                    'term': ' AND '.join(auth_comb) + '[author]'}
    pubmed_encoded_param = urllib.urlencode(pubmed_param)
    pubmed_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    pubmed_url = pubmed_url + '?' + pubmed_encoded_param
    pubmed_response = urllib2.urlopen(pubmed_url)
    xml_response = pubmed_response.read()
    pubmed_data = BeautifulSoup(xml_response)
    pubmed_ids = [node.findAll(text=True) for node in pubmed_data.findAll('id')]
    pubmed_ids = [int(pubmed_id) for pm_list in pubmed_ids for pubmed_id in pm_list]
    print auth_comb, pubmed_ids
    uid_data.append([auth_comb, pubmed_ids])

# Plot the number of matching papers against the size of the author
# combination that found them.
x = [len(t[0]) for t in uid_data if len(t[1]) > 0]
y = [len(t[1]) for t in uid_data if len(t[1]) > 0]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid(True)
ax.set_xlabel('number of authors on paper = significance of match')
ax.set_ylabel('number of papers found on PubMed')
#ax.set_ylim(bottom=0)
#ax.set_xlim([0,5])
#ax.set_yticks([1,2,7,13,19])
#ax.set_xticks([1,2,3,4,5])
ax.scatter(x, y)
plt.show()

# Fetch the abstract of every matched paper via efetch, keeping track of
# which author combination produced the match.
abstracts = []
for auth_comb, uids in uid_data:
    if len(uids) > 0:
        pubmed_param = {'db': 'pubmed',
                        'usehistory': 'y',
                        'id': ','.join([str(uid) for uid in uids]),
                        'retmode': 'xml'}
        pubmed_encoded_param = urllib.urlencode(pubmed_param)
        pubmed_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        pubmed_url = pubmed_url + '?' + pubmed_encoded_param
        pubmed_response = urllib2.urlopen(pubmed_url)
        xml_response = pubmed_response.read()
        pubmed_data = BeautifulSoup(xml_response)
        pubmed_abstracts = [node.findAll(text=True)
                            for node in pubmed_data.findAll('abstracttext')]
        for abstract in pubmed_abstracts:
            abstracts.append((auth_comb, abstract[0]))

print len(abstracts)
print abstracts[0]
print abstracts[1]

# Extract keywords from each abstract with topia.termextract, sort them
# by term strength, and weight each keyword by its strength plus the
# size of the author combination that surfaced the abstract.
extractor = extract.TermExtractor()
keywords_weighted = {}
for auth_comb, abstract_text in abstracts:
    keywords = sorted(extractor(abstract_text), key=itemgetter(2), reverse=True)
    # Drop terms containing punctuation or abbreviation fragments.
    keywords_filtered = []
    for keyword in keywords:
        include = True
        for el in [')', '(', 'i.e', 'e.g', '.', ',', '/', '\\', '*', ';', '&']:
            if el in keyword[0]:
                include = False
        if include:
            keywords_filtered.append(keyword)
    for keyword in keywords_filtered:
        if keyword[0] not in keywords_weighted:
            keywords_weighted[keyword[0]] = keyword[2] + len(auth_comb)
        else:
            keywords_weighted[keyword[0]] += keyword[2] + len(auth_comb)

# Bucket the keywords by weight for inspection.
print [key for key in keywords_weighted if keywords_weighted[key] > 20]
print [key for key in keywords_weighted if 10 < keywords_weighted[key] <= 20]
print [key for key in keywords_weighted if 5 < keywords_weighted[key] <= 10]
print [key for key in keywords_weighted if keywords_weighted[key] <= 5]
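# ---------------------------------------------------------------------
# For reference: urllib2 and BeautifulSoup 3 exist only on Python 2. A
# minimal sketch of the same esearch call for Python 3, assuming only
# the standard library; 'Smith J[author]' is a placeholder query term,
# not one of the authors above. Kept commented out so the Python 2
# script above still runs as a unit.
#
#   import urllib.parse
#   import urllib.request
#   import xml.etree.ElementTree as ET
#
#   # Build the esearch query string for the E-utilities endpoint.
#   params = urllib.parse.urlencode({'db': 'pubmed',
#                                    'term': 'Smith J[author]'})
#   url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?' + params
#   with urllib.request.urlopen(url) as resp:
#       tree = ET.fromstring(resp.read())
#   # esearch returns <IdList><Id>...</Id></IdList>; collect the IDs.
#   pubmed_ids = [int(node.text) for node in tree.iter('Id')]
#   print(pubmed_ids)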