import json
import requests
import re
import pandas as pd
from pandas import DataFrame

# Google Scholar "cited by" search for a PubMed article; the pubmed id is
# appended to the end of this URL.
scholar_search = 'https://scholar.google.com/scholar?cites=http://www.ncbi.nlm.nih.gov/pubmed/'

# NCBI E-utilities endpoints. HTTPS is used because it is the documented
# E-utilities base URL.
# esearch: up to 10000 ids whose publication type is "retracted publication".
pubmed_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=10000&term=%22retracted%20publication%22[Publication%20Type]'
# esummary: JSON summary of one article; the pubmed id is appended.
pubmed_deets = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id='

# Path prefix of the locally saved scholar result pages (the scholar_search
# query with the slashes of the cited URL percent-encoded); the pubmed id is
# appended to get the file name.
file_pattern = 'html/scholar?cites=http:%2F%2Fwww.ncbi.nlm.nih.gov%2Fpubmed%2F'

# Matches the result-count banner, e.g. "About 57 results" or
# "About 1,234 results": one to three digits optionally followed by a single
# ",ddd" group, so counts up to 999,999.
regex = r'About\s[0-9]{1,3}(,[0-9]{3})?\sresults'

def get_retracted_article_ids(timeout=30):
    """
    Return a list of Pubmed ids that are retracted

    Queries the esearch URL held in ``pubmed_search`` and returns the
    ``idlist`` entry of the ``esearchresult`` object in the JSON reply.

    timeout: seconds to wait for the server before giving up, so a stalled
    connection cannot hang the script forever.

    Raises requests.HTTPError when the server answers with an error status,
    instead of failing later while parsing an error page as JSON.
    """
    response = requests.get(pubmed_search, timeout=timeout)
    response.raise_for_status()
    # response.json() decodes the body itself, so this also works on Python
    # versions where json.loads() rejects raw bytes.
    return response.json()['esearchresult']['idlist']

def get_pubmed_detail(pubmed_id, timeout=30):
    """
    Use the Pubmed API to return the details of a pubmed article

    pubmed_id: the article id; a string or an integer (it is converted to a
    string before being appended to the ``pubmed_deets`` URL and used as the
    lookup key in the reply).
    timeout: seconds to wait for the server before giving up.

    Returns the entry stored under the id inside the ``result`` object of the
    JSON reply.

    Raises requests.HTTPError when the server answers with an error status,
    and KeyError when the reply has no entry for the id.
    """
    # Normalise first: concatenating an int id onto the URL would raise
    # TypeError, and the JSON reply is keyed by the id as a string.
    pubmed_id = str(pubmed_id)
    response = requests.get(pubmed_deets + pubmed_id, timeout=timeout)
    response.raise_for_status()
    return response.json()['result'][pubmed_id]

#make a list of all of retracted paper pubmed ids
retraction_ids = get_retracted_article_ids()

#make a list of google scholar citation searches for our retracted pubmed ids
#(one URL per line; presumably fed to an external downloader -- confirm)
# The context manager closes list.txt even if a write fails part-way.
with open("list.txt", "w") as filelist:
    for retraction in retraction_ids:
        filelist.write(scholar_search + retraction + "\n")

#go through our downloaded search results and extract number of citations text
results = {}

# Compile once: the same pattern is applied to every downloaded page.
citation_pattern = re.compile(regex)

for retraction in retraction_ids:
    # Each saved page is named file_pattern followed by the pubmed id.
    # The with-block closes every page as soon as it has been read, so the
    # loop never accumulates open file handles.
    with open(file_pattern + retraction, 'r') as page:
        reg_snip = citation_pattern.search(page.read())
    # Keep the raw matched text (e.g. "About 1,234 results"); pages without
    # a match are recorded as '0'.
    results[retraction] = reg_snip.group() if reg_snip is not None else '0'


# Two-column table: pubmed id and the raw citation text captured for it.
df = DataFrame(list(results.items()), columns=['pmid', 'citations'])

# Strip the "About ... results" wrapper and the thousands separator so that
# only the number is left, then convert it to a numeric column.
for token in ('About', 'results', ','):
    df['citations'] = df['citations'].str.replace(token, '')
df['citations'] = df['citations'].astype(float)

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# NOTE(review): head(11) keeps eleven rows although the table is called
# "top 10" -- confirm whether the extra row is intentional.
df = df.sort_values('citations', ascending=False).head(11) #let's make a top 10 table

# Fetch every article's details once and reuse them for all derived columns
# (instead of one request per column per article).
details = {pmid: get_pubmed_detail(pmid) for pmid in df['pmid']}


def _first_author(detail):
    """Return the name of the first listed author, or '' if none is listed."""
    authors = detail.get('authors') or []
    return authors[0]['name'] if authors else ''


df['title'] = df['pmid'].map(lambda pmid: details[pmid]['title'])
df['author'] = df['pmid'].map(lambda pmid: _first_author(details[pmid]))
df['journal'] = df['pmid'].map(lambda pmid: details[pmid]['fulljournalname'])
df['pubdate'] = df['pmid'].map(lambda pmid: details[pmid]['pubdate'])
df['scholar_link'] = scholar_search + df['pmid']

# head() defaults to the first five rows.
df.head().to_csv('top5.csv')
df.to_csv('top10.csv')

# Reload the table from disk exactly as it was written.
df = pd.read_csv('top10.csv')

# Bare expression: displays the table when run in a notebook/REPL cell and
# does nothing when run as a plain script.
df