TLDR: I combined recent arXiv machine learning publications with Mendeley reader counts to get a popularity score for papers, similar to the rankings on Hacker News or Reddit. It works, but there are apparently not enough Mendeley users yet for the results to be really useful. Skip to the end of the notebook if you just want to see the results.
I had this idea of using some kind of popularity metric to keep up to date with recent publications without much manual effort, so this weekend I hacked together a proof-of-concept implementation. Of the various metrics that could be used for such a scoring, I went with Mendeley reader counts, since they are exposed through the Mendeley API.
import feedparser
import matplotlib.pyplot as plt
import numpy as np
import os.path
import pandas as pd
import pickle
import time
import urllib.request
from datetime import datetime, timedelta, date
from dateutil.parser import parse
from dateutil.tz import tzlocal
from IPython.display import HTML, clear_output
from ipy_progressbar import ProgressBar
from mendeley import Mendeley
%matplotlib notebook
# adapted from: http://stackoverflow.com/a/1060330
def period_range(start_date, end_date, period=1):
    # yield dates from start_date (inclusive) to end_date (exclusive) in steps of `period` days
    for n in range(0, int((end_date - start_date).days), period):
        yield start_date + timedelta(n)
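A quick sanity check of the generator with some hypothetical dates:

list(period_range(date(2015, 1, 1), date(2015, 1, 4)))
# [datetime.date(2015, 1, 1), datetime.date(2015, 1, 2), datetime.date(2015, 1, 3)]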
# see http://arxiv.org/help/api/user-manual#subject_classifications for categories
arxiv_categories = ['stat.ML', 'cs.NE', 'cs.AI']
search_term = '+OR+'.join(['cat:{}'.format(category) for category in arxiv_categories])
arxiv_base_url = 'http://export.arxiv.org/api/query?'
search_query = '{}&sortBy=lastUpdatedDate&sortOrder=descending'.format(search_term)
start = 0
total_results = 100000
results_per_iteration = 1000
begin_year = 2014
wait_time = 3
arxiv_file_name = 'arxiv-publications.pickle'
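For reference, the first request URL assembled from these settings looks like this (start and max_results are filled in inside the loop below):

http://export.arxiv.org/api/query?search_query=cat:stat.ML+OR+cat:cs.NE+OR+cat:cs.AI&sortBy=lastUpdatedDate&sortOrder=descending&start=0&max_results=1000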
if os.path.isfile(arxiv_file_name):
    with open(arxiv_file_name, 'rb') as f:
        entries = pickle.load(f)
else:
    print('Searching arXiv for %s' % search_query)
    entries = []
    for i in range(start, total_results, results_per_iteration):
        print('Parsing results %i - %i' % (i, i + results_per_iteration))
        query = 'search_query=%s&start=%i&max_results=%i' % (search_query, i, results_per_iteration)
        response = urllib.request.urlopen(arxiv_base_url + query).read()
        feed = feedparser.parse(response)
        if len(feed.entries) == 0:
            print('No more entries')
            break
        done = False
        for entry in feed.entries:
            # results are sorted by update date, so we can stop once we are past the date range
            dt_updated = parse(entry.updated)
            if dt_updated.year < begin_year:
                done = True
                break
            dt_published = parse(entry.published)
            if dt_published.year >= begin_year:
                entries.append(entry)
        if done:
            print('No more entries for selected date range')
            break
        print('Sleeping for %i seconds' % wait_time)
        time.sleep(wait_time)
    with open(arxiv_file_name, 'wb') as f:
        pickle.dump(entries, f, pickle.HIGHEST_PROTOCOL)
Searching arXiv for cat:stat.ML+OR+cat:cs.NE+OR+cat:cs.AI&sortBy=lastUpdatedDate&sortOrder=descending
Parsing results 0 - 1000
Sleeping for 3 seconds
Parsing results 1000 - 2000
Sleeping for 3 seconds
Parsing results 2000 - 3000
Sleeping for 3 seconds
Parsing results 3000 - 4000
Sleeping for 3 seconds
Parsing results 4000 - 5000
No more entries
fig, ax = plt.subplots(figsize=(12, 8))
df = pd.DataFrame([parse(entry['published']).date() for entry in entries], columns=['date'])
df.date = pd.to_datetime(df.date)
_ = df.groupby([df.date.dt.year, df.date.dt.month]).count().plot(kind="bar", ax=ax, title='Publications by month')
# if you want to run this for yourself, you have to get an API key from Mendeley:
# http://dev.mendeley.com/
client_id = 'redacted'
client_key = 'redacted'
client_uri = 'redacted'
matched_file_name = 'matched_publications.pickle'
mendeley = Mendeley(client_id, client_secret=client_key, redirect_uri=client_uri)
auth = mendeley.start_client_credentials_flow()
session = auth.authenticate()
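Once the session is authenticated, a single catalog lookup is enough to fetch reader statistics for a paper. A minimal sketch, using the Schmidhuber survey from the results below as an example:

doc = session.catalog.by_identifier(arxiv='arXiv:1404.7828v4', view='stats')
print(doc.reader_count)  # 617 at the time this notebook was run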
if os.path.isfile(matched_file_name):
    with open(matched_file_name, 'rb') as f:
        matched_entries = pickle.load(f)
else:
    matched_entries = []
    progress = ProgressBar(entries)
    for entry in progress:
        # try to match the latest arXiv version; if there's no match, fall back to the next older one
        arxiv_id = 'arXiv:{}'.format(str(entry.id).split('/')[-1])
        base_id, last_version = arxiv_id.rsplit('v', 1)
        for version in range(int(last_version), 0, -1):
            current_id = '{}v{}'.format(base_id, version)
            try:
                document = session.catalog.by_identifier(arxiv=current_id, view='stats')
                matched_entries.append((entry, document))
                break
            except Exception:
                # no catalog entry for this version, try an older one
                pass
    clear_output()
    print('Matched {0} out of {1} articles ({2:.1f}%)'.format(
        len(matched_entries), len(entries), 100 * len(matched_entries) / len(entries)))
    with open(matched_file_name, 'wb') as f:
        # save the matched (entry, document) pairs, not the raw arXiv entries
        pickle.dump(matched_entries, f, pickle.HIGHEST_PROTOCOL)
Matched 1840 out of 3684 articles (49.9%)
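For example, for the entry http://arxiv.org/abs/1507.00996v2 the loop first asks the Mendeley catalog for arXiv:1507.00996v2 and, if that lookup fails, falls back to arXiv:1507.00996v1 before giving up.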
data = pd.DataFrame([(ent.published,
                      ent.title,
                      ent.authors,
                      doc.reader_count,
                      ent.link,
                      doc.link) for ent, doc in matched_entries],
                    columns=['Date', 'Title', 'Authors', 'Readers', 'arXiv', 'Mendeley'])
fig, ax = plt.subplots(figsize=(12, 8))
data.Readers.hist(ax=ax, bins=50)
ax.set_yscale('log')
ax.set_xlabel('Readers')
ax.set_ylabel('Bin Count (Log)')
_ = ax.set_title('Mendeley Readers/Article Histogram')
def get_age(elem):
    # arXiv timestamps are timezone-aware (UTC), so keep the timezone from parsing
    published = parse(elem['Date'])
    return datetime.now(tzlocal()) - published

# Hacker-News-style ranking, see http://amix.dk/blog/post/19574 for further details
def calculate_score(age, readers, gravity=1.9):
    return (readers - 1) / pow((age + 2), gravity)

def calculate_elem_score(elem, gravity=1.9):
    return calculate_score(get_age(elem).days, elem['Readers'], gravity)
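This is essentially the Hacker News ranking formula with age measured in days instead of hours: score = (readers - 1) / (age + 2)^gravity. A quick worked example for a hypothetical paper with 72 readers:

for age_days in [30, 90, 180]:
    print('{:3d} days: {:.4f}'.format(age_days, calculate_score(age_days, 72)))
# 30 days: 0.0980
# 90 days: 0.0132
# 180 days: 0.0036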
start_date = date(year=begin_year, month=1, day=1)
end_date = date.today()
delta_days = np.array([(end_date - d).days for d in period_range(start_date, end_date)])
fig, ax = plt.subplots(figsize=(12, 8))
for readers in [5, 25, 100]:
ax.plot(delta_days, [calculate_score(days, readers) for days in delta_days], label='{} readers'.format(readers))
ax.legend()
ax.grid()
ax.set_xlim([0, np.max(delta_days)])
ax.set_ylim([0, 1])
ax.set_title('Score decay by time')
ax.set_xlabel('Age in days')
_ = ax.set_ylabel('Score')
data['Score'] = data.apply(calculate_elem_score, axis=1)
pd.set_option('display.max_colwidth', 100000)
data_out = data.sort_values('Score', ascending=False)[['Score', 'Date', 'Readers', 'Authors', 'Title', 'arXiv', 'Mendeley']]
data_out['Date'] = [parse(d).date() for d in data_out['Date']]
data_out['arXiv'] = ['<a href="{}">{}</a>'.format(link, link.rsplit('/')[-1]) for link in data_out['arXiv']]
data_out['Mendeley'] = ['<a href="{}">{}</a>'.format(link, 'link') for link in data_out['Mendeley']]
data_out['Authors'] = [', '.join([author['name'].rsplit()[-1] for author in elem]) for elem in data_out['Authors']]
data_out.set_index('Score', inplace=True)
with open('Papers.html', 'w') as f:
f.write(data_out.to_html(escape=False))
HTML(data_out.head(n=30).to_html(escape=False))
Score | Date | Readers | Authors | Title | arXiv | Mendeley
---|---|---|---|---|---|---
0.092487 | 2015-08-05 | 72 | Zhang, Wang | Relation Classification via Recurrent Neural Network | 1508.01006v1 | link |
0.011518 | 2015-07-03 | 34 | Wood, Meent, Mansinghka | A New Approach to Probabilistic Programming Inference | 1507.00996v2 | link |
0.009359 | 2015-05-19 | 73 | Kato, Harada | Image Reconstruction from Bag-of-Visual-Words | 1505.05190v1 | link |
0.009236 | 2015-06-02 | 56 | Ipeirotis, Gabrilovich | Quizz: Targeted crowdsourcing with a billion (potential) users | 1506.01062v1 | link |
0.008141 | 2015-07-05 | 23 | Mirowski, Vlachos | Dependency Recurrent Neural Language Models for Sentence Completion | 1507.01193v1 | link |
0.005800 | 2015-05-28 | 39 | Karaletsos, Rätsch | Automatic Relevance Determination For Deep Generative Models | 1505.07765v3 | link |
0.004865 | 2015-07-08 | 13 | Steeg, Galstyan | The Information Sieve | 1507.02284v1 | link |
0.004676 | 2014-04-30 | 617 | Schmidhuber | Deep Learning in Neural Networks: An Overview | 1404.7828v4 | link |
0.004497 | 2015-05-15 | 38 | Ramdas, Peña | Margins, Kernels and Non-linear Smoothed Perceptrons | 1505.04123v1 | link |
0.004333 | 2015-04-27 | 48 | Hu, Zhu, Xu, Zhang | Fast Sampling for Bayesian Max-Margin Models | 1504.07107v4 | link |
0.004306 | 2015-03-11 | 84 | Hu, Lu, Li, Chen | Convolutional Neural Network Architectures for Matching Natural Language Sentences | 1503.03244v1 | link |
0.003273 | 2015-04-30 | 35 | Aodha, Campbell, Kautz, Brostow | Hierarchical Subquery Evaluation for Active Learning on a Graph | 1504.08219v1 | link |
0.003243 | 2015-07-08 | 9 | Landesa-Vázquez, Alba-Castro | Double-Base Asymmetric AdaBoost | 1507.02154v1 | link |
0.003203 | 2015-02-10 | 83 | Deisenroth, Fox, Rasmussen | Gaussian Processes for Data-Efficient Learning in Robotics and Control | 1502.02860v1 | link |
0.003058 | 2014-03-12 | 483 | Brigadir, Greene, Cunningham | Adaptive Representations for Tracking Breaking News on Twitter | 1403.2923v3 | link |
0.002651 | 2015-04-24 | 31 | Neelakantan, Collins | Learning Dictionaries for Named Entity Recognition using Minimal Supervision | 1504.06650v1 | link |
0.002585 | 2015-06-01 | 17 | Roudi, Taylor | Learning with hidden variables | 1506.00354v2 | link |
0.002513 | 2015-05-20 | 20 | Sartakhti, Ghadiri, Afrabandpey | Fuzzy Least Squares Twin Support Vector Machines | 1505.05451v1 | link |
0.002120 | 2014-12-23 | 82 | Firat, Aksan, Oztekin, Vural | Learning Deep Temporal Representations for Brain Decoding | 1412.7522v4 | link |
0.001799 | 2014-08-25 | 143 | Kim | Convolutional Neural Networks for Sentence Classification | 1408.5882v2 | link |
0.001673 | 2015-07-09 | 5 | Yu, Li | Parameter Sensitivity Analysis of Social Spider Algorithm | 1507.02491v1 | link |
0.001649 | 2015-07-02 | 6 | Zhan, Taylor | Online Transfer Learning in Reinforcement Learning Domains | 1507.00436v2 | link |
0.001473 | 2015-03-09 | 30 | Hinton, Vinyals, Dean | Distilling the Knowledge in a Neural Network | 1503.02531v1 | link |
0.001445 | 2015-07-24 | 3 | Krijthe, Loog | Implicitly Constrained Semi-Supervised Least Squares Classification | 1507.06802v1 | link |
0.001445 | 2015-07-24 | 3 | Doncieux, Liénard, Girard, Hamdaoui, Chaskalovic | Multi-objective analysis of computational models | 1507.06877v1 | link |
0.001419 | 2015-06-18 | 7 | Runge, Donner, Kurths | Optimal model-free prediction from multivariate time series | 1506.05822v1 | link |
0.001402 | 2014-12-12 | 59 | Abraham, Pedregosa, Eickenberg, Gervais, Muller, Kossaifi, Gramfort, Thirion, Varoquaux | Machine Learning for Neuroimaging with Scikit-Learn | 1412.3919v1 | link |
0.001355 | 2015-06-17 | 7 | Liang, Ganesh, Raman, Czarnecki | SAT-based Analysis of Large Real-world Feature Models is Easy | 1506.05198v3 | link |
0.001289 | 2014-08-13 | 109 | Le, Sarlos, Smola | Fastfood: Approximate Kernel Expansions in Loglinear Time | 1408.3060v1 | link |
0.001239 | 2015-06-12 | 7 | Andreas, Rabinovich, Klein, Jordan | On the accuracy of self-normalized log-linear models | 1506.04147v2 | link |