"""Rank retracted PubMed articles by their Google Scholar citation counts.

Workflow performed at module level, in order:

1. Ask the PubMed ``esearch`` endpoint for the ids of all records whose
   publication type is "retracted publication".
2. Write one Google Scholar "cites" search URL per id to ``list.txt``.
3. Read a saved Scholar result page for every id from the ``html/``
   directory (see ``file_pattern``).  This script only reads those files;
   it does not download them, so they must already exist.
4. Extract the "About N results" text from each page, turn it into a
   number, sort descending and keep the first rows.
5. Enrich the kept rows with title / first author / journal / date from
   the PubMed ``esummary`` endpoint and write ``top5.csv`` / ``top10.csv``.
"""
import json
import re

import pandas as pd
import requests
from pandas import DataFrame

# Google Scholar "cited by" search, keyed by the article's PubMed URL.
scholar_search = 'https://scholar.google.com/scholar?cites=http://www.ncbi.nlm.nih.gov/pubmed/'
# PubMed search for every record typed "retracted publication" (max 10000 ids).
pubmed_search = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=10000&term=%22retracted%20publication%22[Publication%20Type]'
# PubMed summary lookup; the PubMed id is appended to this prefix.
pubmed_deets = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id='
# Local path prefix of a saved Scholar result page; the PubMed id is appended.
file_pattern = 'html/scholar?cites=http:%2F%2Fwww.ncbi.nlm.nih.gov%2Fpubmed%2F'

# Matches the result-count text, e.g. "About 1,234 results": one to three
# digits optionally followed by a single ",ddd" group (so at most 999,999).
regex = r'About\s[0-9]{1,3}(,[0-9]{3})?\sresults'

# Seconds to wait on a PubMed request before giving up instead of hanging.
REQUEST_TIMEOUT = 30


def get_retracted_article_ids():
    """Return the list of PubMed ids (strings) flagged as retracted.

    Performs one HTTP GET against ``pubmed_search`` and returns the
    ``esearchresult.idlist`` entry of the JSON response.
    """
    response = requests.get(pubmed_search, timeout=REQUEST_TIMEOUT)
    return json.loads(response.content)['esearchresult']['idlist']


def get_pubmed_detail(pubmed_id):
    """Return the PubMed summary record for ``pubmed_id``.

    Performs one HTTP GET against ``pubmed_deets + pubmed_id`` and returns
    the ``result[pubmed_id]`` entry of the JSON response.

    ``pubmed_id`` must be a string: it is concatenated onto the URL and
    used as the lookup key in the response.
    """
    response = requests.get(pubmed_deets + pubmed_id, timeout=REQUEST_TIMEOUT)
    return json.loads(response.content)['result'][pubmed_id]


def _first_author_name(detail):
    """Return the first author's name from a summary record, or None if absent."""
    authors = detail.get('authors') or []
    return authors[0]['name'] if authors else None


# make a list of all of retracted paper pubmed ids
retraction_ids = get_retracted_article_ids()

# make a list of google scholar citation searches for our retracted pubmed ids
with open("list.txt", "w") as filelist:
    for retraction in retraction_ids:
        filelist.write(scholar_search + retraction)
        filelist.write("\n")

# go through our downloaded search results and extract number of citations text
results = {}
for retraction in retraction_ids:
    f = file_pattern + retraction
    # The pattern is pure ASCII, so undecodable bytes elsewhere in the page
    # are replaced rather than allowed to abort the whole run.
    with open(f, 'r', encoding='utf-8', errors='replace') as result:
        reg_snip = re.search(regex, result.read())
    if reg_snip is not None:
        results[retraction] = reg_snip.group()
    else:
        # Pages without an "About N results" line are recorded as zero.
        results[retraction] = '0'

df = DataFrame(list(results.items()), columns=['pmid', 'citations'])

# Reduce "About 1,234 results" to "1234" (surrounding whitespace is
# tolerated by the float conversion).
df['citations'] = (
    df['citations']
    .str.replace('About', '')
    .str.replace('results', '')
    .str.replace(',', '')
    .astype(float)
)

# DataFrame.sort() no longer exists in pandas; sort_values() is the replacement.
# NOTE(review): 11 rows are kept although the output is called "top 10" --
# confirm whether the extra row is intentional.
df = df.sort_values('citations', ascending=False).head(11)

# let's make a top 10 table
# Fetch each summary record once and reuse it for every derived column
# instead of issuing a separate HTTP request per column.
top_details = {pmid: get_pubmed_detail(pmid) for pmid in df['pmid']}
df['title'] = df['pmid'].map(lambda x: top_details[x]['title'])
df['author'] = df['pmid'].map(lambda x: _first_author_name(top_details[x]))
df['journal'] = df['pmid'].map(lambda x: top_details[x]['fulljournalname'])
df['pubdate'] = df['pmid'].map(lambda x: top_details[x]['pubdate'])
df['scholar_link'] = scholar_search + df['pmid']

df.head().to_csv('top5.csv')
df.to_csv('top10.csv')

df = pd.read_csv('top10.csv')
# Bare expression kept from the original: it displays the table in a
# notebook cell and has no effect when run as a plain script.
df