#!/usr/bin/env python
# coding: utf-8

# # Popularity scoring for arXiv publications

# **TLDR: I combined recent arXiv machine learning publications with Mendeley reader data to get a popularity scoring similar to Hacker News or Reddit, but for publications. It works, but there are apparently not enough Mendeley users for actually useful results. Skip to the end of the notebook if you just want to see the results.**

# I had the idea of using some kind of popularity metric to keep up to date with recent publications without much manual effort. This weekend I hacked together a proof-of-concept implementation. There are a number of possible metrics that could be used for such a scoring:
#
# * arXiv page view / download statistics: Unfortunately, arXiv doesn't provide these (see their [FAQ](http://arxiv.org/help/faq/statfaq) on the subject).
# * citations: Citations have a significant delay. Furthermore, to the best of my knowledge, there's no decent public API for this kind of data. Google Scholar has good results, but its terms of service specifically forbid scraping. Microsoft Academic Search apparently provides an API on request, but they never replied when I asked for one several months ago.
# * crowd sourcing: This is a) out of scope for this proof of concept and b) would most likely just result in a bad clone of Reddit's MachineLearning subreddit or Hacker News.
# * third-party metrics: I finally settled on Mendeley reader stats. They have a good API and provide access to their statistics data. Unfortunately, there are apparently not enough Mendeley users (at least in the machine learning community) for really meaningful results. **I would love to get suggestions for better data sources.**

# In[143]:


import feedparser
import matplotlib.pyplot as plt
import mendeley
import numpy as np
import os.path
import pandas as pd
import pickle
import urllib.request
import time

from datetime import datetime, timedelta, date
from dateutil.parser import parse
from dateutil.tz import tzlocal
from IPython.display import HTML
from IPython.display import clear_output
from ipy_progressbar import ProgressBar
from itertools import tee
from mendeley import Mendeley
from pytz import timezone

get_ipython().run_line_magic('matplotlib', 'notebook')


# In[144]:


# adapted from: http://stackoverflow.com/a/1060330
# yields one date every `period` days, from start_date (inclusive) up to end_date (exclusive)
def period_range(start_date, end_date, period=1):
    for n in range(int((end_date - start_date).days / period)):
        yield start_date + timedelta(n * period)
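# In[ ]:


# Quick illustration (with arbitrary example dates) of what `period_range` yields:
# one date every `period` days, starting at start_date and excluding end_date.
list(period_range(date(2014, 1, 1), date(2014, 1, 10), period=3))
# -> [datetime.date(2014, 1, 1), datetime.date(2014, 1, 4), datetime.date(2014, 1, 7)]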
# # Recent arXiv publications

# In[145]:


# see http://arxiv.org/help/api/user-manual#subject_classifications for categories
arxiv_categories = ['stat.ML', 'cs.NE', 'cs.AI']
search_term = '+OR+'.join(['cat:{}'.format(category) for category in arxiv_categories])


# In[146]:


arxiv_base_url = 'http://export.arxiv.org/api/query?'
search_query = '{}&sortBy=lastUpdatedDate&sortOrder=descending'.format(search_term)
start = 0
total_results = 100000
results_per_iteration = 1000
begin_year = 2014
wait_time = 3  # seconds to wait between consecutive arXiv API requests
arxiv_file_name = 'arxiv-publications.pickle'


# In[147]:


if os.path.isfile(arxiv_file_name):
    # use the cached results if they exist
    with open(arxiv_file_name, 'rb') as f:
        entries = pickle.load(f)
else:
    print('Searching arXiv for %s' % search_query)
    entries = []
    for i in range(start, total_results, results_per_iteration):
        print("Parsing results %i - %i" % (i, i + results_per_iteration))
        query = 'search_query=%s&start=%i&max_results=%i' % (search_query, i, results_per_iteration)
        response = urllib.request.urlopen(arxiv_base_url + query).read()
        feed = feedparser.parse(response)
        if len(feed.entries) == 0:
            print('No more entries')
            break
        done = False
        for entry in feed.entries:
            # results are sorted by update date, so we can stop once we are past begin_year
            dt_updated = parse(entry.updated)
            if dt_updated.year < begin_year:
                done = True
                break
            dt_published = parse(entry.published)
            if dt_published.year >= begin_year:
                entries.append(entry)
        if done:
            print('No more entries for selected date range')
            break
        print('Sleeping for %i seconds' % wait_time)
        time.sleep(wait_time)
    with open(arxiv_file_name, 'wb') as f:
        pickle.dump(entries, f, pickle.HIGHEST_PROTOCOL)


# In[148]:


fig, ax = plt.subplots(figsize=(12, 8))
df = pd.DataFrame([parse(entry['published']).date() for entry in entries], columns=['date'])
df.date = pd.to_datetime(df.date)
_ = df.groupby([df.date.dt.year, df.date.dt.month]).count().plot(kind="bar", ax=ax, title='Publications by month')


# # Match with Mendeley data

# In[149]:


# if you want to run this for yourself, you have to get an API key from Mendeley
# http://dev.mendeley.com/
client_id = 'redacted'
client_key = 'redacted'
client_uri = 'redacted'
matched_file_name = 'matched_publications.pickle'

mendeley = Mendeley(client_id, client_secret=client_key, redirect_uri=client_uri)
auth = mendeley.start_client_credentials_flow()
session = auth.authenticate()


# In[150]:


if os.path.isfile(matched_file_name):
    with open(matched_file_name, 'rb') as f:
        matched_entries = pickle.load(f)
else:
    matched_entries = []
    progress = ProgressBar(entries)
    for entry in progress:
        # try to match the most recent arXiv version; if there's no match, fall back to the next older one
        arxiv_id = 'arXiv:{}'.format(str(entry.id).split('/')[-1])
        base_id, last_version = arxiv_id.rsplit('v', 1)
        for version in range(int(last_version), 0, -1):
            current_id = '{}v{}'.format(base_id, version)
            try:
                document = session.catalog.by_identifier(arxiv=current_id, view='stats')
                matched_entries.append((entry, document))
                break
            except Exception:
                pass
    clear_output()
    print('Matched {0} out of {1} articles ({2:.1f}%)'.format(len(matched_entries),
                                                              len(entries),
                                                              100 * len(matched_entries) / len(entries)))
    with open(matched_file_name, 'wb') as f:
        pickle.dump(matched_entries, f, pickle.HIGHEST_PROTOCOL)


# In[151]:


data = pd.DataFrame([(ent.published,
                      ent.title,
                      ent.authors,
                      doc.reader_count,
                      ent.link,
                      doc.link) for ent, doc in matched_entries],
                    columns=['Date', 'Title', 'Authors', 'Readers', 'arXiv', 'Mendeley'])


# In[152]:


fig, ax = plt.subplots(figsize=(12, 8))
data.Readers.hist(ax=ax, bins=50)
ax.set_yscale('log')
ax.set_xlabel('Readers')
ax.set_ylabel('Bin Count (Log)')
_ = ax.set_title('Mendeley Readers/Article Histogram')


# # Scoring

# In[153]:


def get_age(elem):
    # TODO: automatically determine correct TZ
    published = parse(elem['Date']).replace(tzinfo=timezone('EST5EDT'))
    return datetime.now().replace(tzinfo=tzlocal()) - published


# In[154]:


# see http://amix.dk/blog/post/19574 for further details
def calculate_score(age, readers, gravity=1.9):
    return (readers - 1) / pow((age + 2), gravity)

def calculate_elem_score(elem, gravity=1.9):
    return calculate_score(get_age(elem).days, elem['Readers'], gravity)
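# The score is the Hacker News style "gravity" ranking described in the post linked above, with Mendeley reader counts taking the role of votes:
#
# $$\text{score} = \frac{\text{readers} - 1}{(\text{age in days} + 2)^{\text{gravity}}}$$
#
# With the default gravity of 1.9, a paper with 25 readers scores roughly $24 / 7^{1.9} \approx 0.6$ at 5 days old and roughly $24 / 32^{1.9} \approx 0.03$ at 30 days old, so older papers need considerably more readers to stay near the top.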
# In[155]:


start_date = date(year=begin_year, month=1, day=1)
end_date = date.today()
delta_days = np.array([(date.today() - d).days for d in list(period_range(start_date, end_date))])

fig, ax = plt.subplots(figsize=(12, 8))
for readers in [5, 25, 100]:
    ax.plot(delta_days, [calculate_score(days, readers) for days in delta_days], label='{} readers'.format(readers))
ax.legend()
ax.grid()
ax.set_xlim([0, np.max(delta_days)])
ax.set_ylim([0, 1])
ax.set_title('Score decay by time')
ax.set_xlabel('Age in days')
_ = ax.set_ylabel('Score')


# In[156]:


data['Score'] = [calculate_elem_score(row) for _, row in data.iterrows()]


# # Results

# In[157]:


pd.set_option('display.max_colwidth', 100000)
data_out = data.sort_values('Score', ascending=False)[['Score', 'Date', 'Readers', 'Authors', 'Title', 'arXiv', 'Mendeley']]
data_out['Date'] = [parse(d).replace(tzinfo=timezone('EST5EDT')).date() for d in data_out['Date']]
# render the arXiv and Mendeley URLs as HTML links for the output table
data_out['arXiv'] = ['<a href="{0}">{1}</a>'.format(link, link.rsplit('/')[-1]) for link in data_out['arXiv']]
data_out['Mendeley'] = ['<a href="{0}">{1}</a>'.format(link, 'link') for link in data_out['Mendeley']]
data_out['Authors'] = [', '.join([author['name'].rsplit()[-1] for author in elem]) for elem in data_out['Authors']]
data_out.set_index('Score', inplace=True)

with open('Papers.html', 'w') as f:
    f.write(data_out.to_html(escape=False))

HTML(data_out.head(n=30).to_html(escape=False))
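# In[ ]:


# Illustrative variation (not part of the analysis above): the same table can be restricted
# to recent papers by filtering on the Date column, e.g. only papers from the last 90 days.
recent_cutoff = date.today() - timedelta(days=90)
HTML(data_out[data_out['Date'] >= recent_cutoff].head(n=30).to_html(escape=False))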