import json
import requests
import re
import pandas as pd
from pandas import DataFrame
# Google Scholar "cited by" search URL prefix; a Pubmed id is appended to it.
scholar_search = 'https://scholar.google.com/scholar?cites=http://www.ncbi.nlm.nih.gov/pubmed/'
# Pubmed esearch query (JSON, retmax=10000) for the "retracted publication"
# publication type.
pubmed_search = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=10000&term=%22retracted%20publication%22[Publication%20Type]'
# Pubmed esummary URL prefix (JSON); a Pubmed id is appended to it.
pubmed_deets = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id='
# Local path prefix of the saved Scholar result pages; a Pubmed id is appended
# to it to form the file name that is opened and searched.
file_pattern = 'html/scholar?cites=http:%2F%2Fwww.ncbi.nlm.nih.gov%2Fpubmed%2F'
# Earlier attempts at the result-count pattern, kept for reference.
#regex = r'/[1-9](?:\d{0,2})(?:,\d{3})*(?:\.\d*[1-9])?|0?\.\d*[1-9]|0/ '
#regex = r'About\s[0-9],[0-9]*\sresults'
#regex = r'About\s[0-9]{1,3}(,[0-9]{3})?(\.[0-9]+)?\sresults'
# Matches the text "About N results", where N is 1-3 digits optionally followed
# by a single ",ddd" thousands group. Counts with more than one thousands group
# and counts not preceded by "About" do not match this pattern.
regex = r'About\s[0-9]{1,3}(,[0-9]{3})?\sresults'
def get_retracted_article_ids():
    """
    Return a list of Pubmed ids that are retracted.

    Requests the esearch URL stored in ``pubmed_search`` and returns the
    ``idlist`` entry found under ``esearchresult`` in the JSON response.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
        KeyError: if the JSON response lacks the expected keys.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(pubmed_search, timeout=60)
    # Surface HTTP failures directly instead of as an opaque JSON/KeyError.
    response.raise_for_status()
    return json.loads(response.content)['esearchresult']['idlist']
def get_pubmed_detail(pubmed_id):
    """
    Use the Pubmed API to return the details of a pubmed article.

    Args:
        pubmed_id: Pubmed id of the article. Strings and integers are both
            accepted; the value is converted to ``str`` before use.

    Returns:
        The entry stored under the id in the ``result`` object of the
        esummary JSON response.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
        KeyError: if the response has no entry for ``pubmed_id``.
    """
    # Ids read back from a CSV are integers; the URL concatenation and the
    # JSON lookup below both need the string form.
    pubmed_id = str(pubmed_id)
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(pubmed_deets + pubmed_id, timeout=60)
    # Surface HTTP failures directly instead of as an opaque JSON/KeyError.
    response.raise_for_status()
    return json.loads(response.content)['result'][pubmed_id]
# Make a list of the Pubmed ids of all retracted papers.
retraction_ids = get_retracted_article_ids()
# Write one Google Scholar citation-search URL per retracted Pubmed id to
# list.txt (presumably the input for downloading the result pages that are
# parsed further down -- confirm against the download step).
# The context manager closes the file even if a write fails.
with open("list.txt", "w") as filelist:
    for retraction in retraction_ids:
        filelist.write(scholar_search + retraction + "\n")
# Go through our downloaded search results and extract the number-of-citations
# text ("About N results") from each page.
results = {}
# Compile the pattern once instead of looking it up for every file.
citation_pattern = re.compile(regex)
for retraction in retraction_ids:
    # One page is opened per id; the context manager closes each handle
    # promptly so the loop does not accumulate open files.
    with open(file_pattern + retraction, 'r') as page:
        reg_snip = citation_pattern.search(page.read())
    # Pages without a matching snippet are recorded as '0' citations.
    results[retraction] = reg_snip.group() if reg_snip is not None else '0'
# Build a two-column table of (pmid, matched citation text). Passing the
# column names to the constructor also works when `results` is empty.
df = DataFrame(list(results.items()), columns=['pmid', 'citations'])
# Reduce "About 1,234 results" to a bare number: drop the surrounding words
# and the thousands separator, then convert to float.
df['citations'] = (
    df['citations']
    .str.replace('About', '')
    .str.replace('results', '')
    .str.replace(',', '')
    .astype(float)
)
# let's make a top 10 table
# NOTE(review): head(11) keeps 11 rows although the comment and file name say
# "top 10" -- left unchanged; confirm which is intended.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
df = df.sort_values('citations', ascending=False).head(11)
# Fetch the Pubmed details once per article and reuse them for every column,
# rather than issuing a separate request for each field.
details = {pmid: get_pubmed_detail(pmid) for pmid in df['pmid']}
df['title'] = df['pmid'].map(lambda pmid: details[pmid]['title'])
# Only the first listed author is kept.
df['author'] = df['pmid'].map(lambda pmid: details[pmid]['authors'][0]['name'])
df['journal'] = df['pmid'].map(lambda pmid: details[pmid]['fulljournalname'])
df['pubdate'] = df['pmid'].map(lambda pmid: details[pmid]['pubdate'])
df['scholar_link'] = scholar_search + df['pmid']
# Export the first five rows and the full table.
df.head().to_csv('top5.csv')
df.to_csv('top10.csv')
# Reload the saved table; the index written by to_csv comes back as an
# "Unnamed: 0" column.
df = pd.read_csv('top10.csv')
df
Unnamed: 0 | pmid | citations | title | author | journal | pubdate | scholar_link | |
---|---|---|---|---|---|---|---|---|
0 | 2892 | 9500320 | 2070 | Ileal-lymphoid-nodular hyperplasia, non-specif... | Wakefield AJ | Lancet | 1998 Feb 28 | https://scholar.google.com/scholar?cites=http:... |
1 | 2391 | 15604363 | 2050 | Visfatin: a protein secreted by visceral fat t... | Fukuhara A | Science (New York, N.Y.) | 2005 Jan 21 | https://scholar.google.com/scholar?cites=http:... |
2 | 508 | 11675329 | 1550 | Purification and ex vivo expansion of postnata... | Reyes M | Blood | 2001 Nov 1 | https://scholar.google.com/scholar?cites=http:... |
3 | 3540 | 12531578 | 1250 | Combination treatment of angiotensin-II recept... | Nakao N | Lancet | 2003 Jan 11 | https://scholar.google.com/scholar?cites=http:... |
4 | 1010 | 15833829 | 1040 | Spontaneous human adult stem cell transformation. | Rubio D | Cancer research | 2005 Apr 15 | https://scholar.google.com/scholar?cites=http:... |
5 | 2710 | 10700237 | 825 | Regression of human metastatic renal cell carc... | Kugler A | Nature medicine | 2000 Mar | https://scholar.google.com/scholar?cites=http:... |
6 | 477 | 14963337 | 805 | Evidence of a pluripotent human embryonic stem... | Hwang WS | Science (New York, N.Y.) | 2004 Mar 12 | https://scholar.google.com/scholar?cites=http:... |
7 | 2378 | 12176951 | 755 | Multiple atherosclerotic plaque rupture in acu... | Rioufol G | Circulation | 2002 Aug 13 | https://scholar.google.com/scholar?cites=http:... |
8 | 350 | 11546864 | 732 | Structure of MsbA from E. coli: a homolog of t... | Chang G | Science (New York, N.Y.) | 2001 Sep 7 | https://scholar.google.com/scholar?cites=http:... |
9 | 3494 | 8633243 | 616 | Synergistic activation of estrogen receptor wi... | Arnold SF | Science (New York, N.Y.) | 1996 Jun 7 | https://scholar.google.com/scholar?cites=http:... |
10 | 665 | 12351674 | 607 | Contribution of human alpha-defensin 1, 2, and... | Zhang L | Science (New York, N.Y.) | 2002 Nov 1 | https://scholar.google.com/scholar?cites=http:... |
11 rows × 8 columns