#!/usr/bin/env python
# coding: utf-8

# # Popularity scoring for arXiv publications

# **TLDR: I combined recent arXiv machine learning publications with Mendeley reader data to get a popularity scoring similar to Hacker News or Reddit, but for publications. It works, but there are apparently not enough Mendeley users for actually useful results. Skip to the end of the notebook if you just want to see the results.**

# I had the idea of using some kind of popularity metric to keep up to date with recent publications without much manual effort. This weekend I hacked together a proof-of-concept implementation. There are a number of possible metrics that could be used for such a scoring:
#
# * arXiv page view / download statistics: Unfortunately, arXiv doesn't provide these (see their [FAQ](http://arxiv.org/help/faq/statfaq) on the subject).
# * citations: Citations have a significant delay. Furthermore, to the best of my knowledge, there's no decent public API for this kind of data. Google Scholar has good results, but its terms of service specifically forbid scraping. Microsoft Academic Search apparently provides an API on request, but they never replied when I asked for one several months ago.
# * crowd sourcing: This is a) out of scope for this proof of concept and b) would most likely just result in a bad clone of Reddit's MachineLearning subreddit or Hacker News.
# * third-party metrics: I finally settled on Mendeley reader stats. They have a good API and provide access to their statistics data. Unfortunately, there are apparently not enough Mendeley users (at least in the machine learning community) for really meaningful results. **I would love to get suggestions for better data sources.**

# In[143]:


import feedparser
import matplotlib.pyplot as plt
import mendeley
import numpy as np
import os.path
import pandas as pd
import pickle
import urllib.request
import time

from datetime import datetime, timedelta, date
from dateutil.parser import parse
from dateutil.tz import tzlocal
from IPython.display import HTML
from IPython.display import clear_output
from ipy_progressbar import ProgressBar
from itertools import tee
from mendeley import Mendeley
from pytz import timezone

get_ipython().run_line_magic('matplotlib', 'notebook')


# In[144]:


# adapted from: http://stackoverflow.com/a/1060330
# yields one date every `period` days, from start_date (inclusive) up to end_date (exclusive)
def period_range(start_date, end_date, period=1):
    for n in range(int((end_date - start_date).days / period)):
        yield start_date + timedelta(n * period)
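# In[ ]:


# Quick illustration (with arbitrary example dates) of what `period_range` yields:
# one date every `period` days, starting at start_date and excluding end_date.
list(period_range(date(2014, 1, 1), date(2014, 1, 10), period=3))
# -> [datetime.date(2014, 1, 1), datetime.date(2014, 1, 4), datetime.date(2014, 1, 7)]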
# # Recent arXiv publications

# In[145]:


# see http://arxiv.org/help/api/user-manual#subject_classifications for categories
arxiv_categories = ['stat.ML', 'cs.NE', 'cs.AI']
search_term = '+OR+'.join(['cat:{}'.format(category) for category in arxiv_categories])


# In[146]:


arxiv_base_url = 'http://export.arxiv.org/api/query?'
search_query = '{}&sortBy=lastUpdatedDate&sortOrder=descending'.format(search_term)
start = 0
total_results = 100000
results_per_iteration = 1000
begin_year = 2014
wait_time = 3  # seconds to wait between consecutive arXiv API requests
arxiv_file_name = 'arxiv-publications.pickle'


# In[147]:


if os.path.isfile(arxiv_file_name):
    # use the cached results if they exist
    with open(arxiv_file_name, 'rb') as f:
        entries = pickle.load(f)
else:
    print('Searching arXiv for %s' % search_query)
    entries = []
    for i in range(start, total_results, results_per_iteration):
        print("Parsing results %i - %i" % (i, i + results_per_iteration))
        query = 'search_query=%s&start=%i&max_results=%i' % (search_query, i, results_per_iteration)
        response = urllib.request.urlopen(arxiv_base_url + query).read()
        feed = feedparser.parse(response)
        if len(feed.entries) == 0:
            print('No more entries')
            break
        done = False
        for entry in feed.entries:
            # results are sorted by update date, so we can stop once we are past begin_year
            dt_updated = parse(entry.updated)
            if dt_updated.year < begin_year:
                done = True
                break
            dt_published = parse(entry.published)
            if dt_published.year >= begin_year:
                entries.append(entry)
        if done:
            print('No more entries for selected date range')
            break
        print('Sleeping for %i seconds' % wait_time)
        time.sleep(wait_time)
    with open(arxiv_file_name, 'wb') as f:
        pickle.dump(entries, f, pickle.HIGHEST_PROTOCOL)


# In[148]:


fig, ax = plt.subplots(figsize=(12, 8))
df = pd.DataFrame([parse(entry['published']).date() for entry in entries], columns=['date'])
df.date = pd.to_datetime(df.date)
_ = df.groupby([df.date.dt.year, df.date.dt.month]).count().plot(kind="bar", ax=ax, title='Publications by month')


# # Match with Mendeley data

# In[149]:


# if you want to run this for yourself, you have to get an API key from Mendeley
# http://dev.mendeley.com/
client_id = 'redacted'
client_key = 'redacted'
client_uri = 'redacted'
matched_file_name = 'matched_publications.pickle'

mendeley = Mendeley(client_id, client_secret=client_key, redirect_uri=client_uri)
auth = mendeley.start_client_credentials_flow()
session = auth.authenticate()


# In[150]:


if os.path.isfile(matched_file_name):
    with open(matched_file_name, 'rb') as f:
        matched_entries = pickle.load(f)
else:
    matched_entries = []
    progress = ProgressBar(entries)
    for entry in progress:
        # try to match the most recent arXiv version; if there's no match, fall back to the next older one
        arxiv_id = 'arXiv:{}'.format(str(entry.id).split('/')[-1])
        base_id, last_version = arxiv_id.rsplit('v', 1)
        for version in range(int(last_version), 0, -1):
            current_id = '{}v{}'.format(base_id, version)
            try:
                document = session.catalog.by_identifier(arxiv=current_id, view='stats')
                matched_entries.append((entry, document))
                break
            except Exception:
                pass
    clear_output()
    print('Matched {0} out of {1} articles ({2:.1f}%)'.format(len(matched_entries),
                                                              len(entries),
                                                              100 * len(matched_entries) / len(entries)))
    with open(matched_file_name, 'wb') as f:
        pickle.dump(matched_entries, f, pickle.HIGHEST_PROTOCOL)


# In[151]:


data = pd.DataFrame([(ent.published,
                      ent.title,
                      ent.authors,
                      doc.reader_count,
                      ent.link,
                      doc.link) for ent, doc in matched_entries],
                    columns=['Date', 'Title', 'Authors', 'Readers', 'arXiv', 'Mendeley'])


# In[152]:


fig, ax = plt.subplots(figsize=(12, 8))
data.Readers.hist(ax=ax, bins=50)
ax.set_yscale('log')
ax.set_xlabel('Readers')
ax.set_ylabel('Bin Count (Log)')
_ = ax.set_title('Mendeley Readers/Article Histogram')


# # Scoring

# In[153]:


def get_age(elem):
    # TODO: automatically determine correct TZ
    published = parse(elem['Date']).replace(tzinfo=timezone('EST5EDT'))
    return datetime.now().replace(tzinfo=tzlocal()) - published


# In[154]:


# see http://amix.dk/blog/post/19574 for further details
def calculate_score(age, readers, gravity=1.9):
    return (readers - 1) / pow((age + 2), gravity)

def calculate_elem_score(elem, gravity=1.9):
    return calculate_score(get_age(elem).days, elem['Readers'], gravity)
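# The score is the Hacker News style "gravity" ranking described in the post linked above, with Mendeley reader counts taking the role of votes:
#
# $$\text{score} = \frac{\text{readers} - 1}{(\text{age in days} + 2)^{\text{gravity}}}$$
#
# With the default gravity of 1.9, a paper with 25 readers scores roughly $24 / 7^{1.9} \approx 0.6$ at 5 days old and roughly $24 / 32^{1.9} \approx 0.03$ at 30 days old, so older papers need considerably more readers to stay near the top.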
# In[155]:


start_date = date(year=begin_year, month=1, day=1)
end_date = date.today()
delta_days = np.array([(date.today() - d).days for d in list(period_range(start_date, end_date))])

fig, ax = plt.subplots(figsize=(12, 8))
for readers in [5, 25, 100]:
    ax.plot(delta_days, [calculate_score(days, readers) for days in delta_days], label='{} readers'.format(readers))
ax.legend()
ax.grid()
ax.set_xlim([0, np.max(delta_days)])
ax.set_ylim([0, 1])
ax.set_title('Score decay by time')
ax.set_xlabel('Age in days')
_ = ax.set_ylabel('Score')


# In[156]:


data['Score'] = [calculate_elem_score(row) for _, row in data.iterrows()]


# # Results

# In[157]:


pd.set_option('display.max_colwidth', 100000)
data_out = data.sort_values('Score', ascending=False)[['Score', 'Date', 'Readers', 'Authors', 'Title', 'arXiv', 'Mendeley']]
data_out['Date'] = [parse(d).replace(tzinfo=timezone('EST5EDT')).date() for d in data_out['Date']]
# render the arXiv and Mendeley URLs as HTML links for the output table
data_out['arXiv'] = ['<a href="{0}">{1}</a>'.format(link, link.rsplit('/')[-1]) for link in data_out['arXiv']]
data_out['Mendeley'] = ['<a href="{0}">{1}</a>'.format(link, 'link') for link in data_out['Mendeley']]
data_out['Authors'] = [', '.join([author['name'].rsplit()[-1] for author in elem]) for elem in data_out['Authors']]
data_out.set_index('Score', inplace=True)

with open('Papers.html', 'w') as f:
    f.write(data_out.to_html(escape=False))

HTML(data_out.head(n=30).to_html(escape=False))
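# In[ ]:


# Illustrative variation (not part of the analysis above): the same table can be restricted
# to recent papers by filtering on the Date column, e.g. only papers from the last 90 days.
recent_cutoff = date.today() - timedelta(days=90)
HTML(data_out[data_out['Date'] >= recent_cutoff].head(n=30).to_html(escape=False))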