from __future__ import division, print_function

try:
    import ConfigParser as configparser     # Python 2
except ImportError:
    import configparser                     # Python 3
import copy
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import requests
import scipy.spatial
import time
import tweepy
try:
    from urllib import quote                # Python 2
except ImportError:
    from urllib.parse import quote          # Python 3

# Read configuration file for request user-agent and Twitter tokens
config = configparser.ConfigParser()
config.read(os.path.expanduser('~/etc/python.cfg'))

headers = {
    'User-Agent': config.get('requests', 'user_agent'),
    'From': config.get('requests', 'from'),
}

# Fetch names of Brede Wiki pages with researchers on Google Scholar and Twitter
url_gst = ('http://neuro.compute.dtu.dk/w/api.php?'
           'action=query&format=json&list=categorymembers&'
           'cmtitle=Category:Researchers in Google Scholar and Twitter')
response = requests.get(url_gst, headers=headers).json()
pagetitles = [page['title'] for page in response['query']['categorymembers']]
while 'query-continue' in response:
    cmcontinue = response['query-continue']['categorymembers']['cmcontinue']
    url_continue = url_gst + '&cmcontinue=' + quote(cmcontinue.encode('utf-8'))
    response = requests.get(url_continue, headers=headers).json()
    pagetitles.extend([page['title']
                       for page in response['query']['categorymembers']])
print(pagetitles)

# Get researcher data from the {{Researcher}} templates on the Brede Wiki pages
url_pages = 'http://neuro.compute.dtu.dk/w/index.php?action=raw&title='
pattern_researcher = re.compile(r'{{Researcher(\s*?\|.*?)}}',
                                re.DOTALL | re.IGNORECASE | re.UNICODE)
pattern_fields = re.compile(r'\s*\|\s*(\w+)\s*=\s*([^\|]*\w)',
                            re.DOTALL | re.UNICODE)
researchers = []
for pagetitle in pagetitles:
    response = requests.get(url_pages + quote(pagetitle.encode('utf-8')),
                            headers=headers)
    print(pagetitle)
    researcher = pattern_researcher.findall(response.text)
    if researcher:
        researchers.append(dict(pattern_fields.findall(researcher[0])))
    else:
        print('Problem with ' + pagetitle)

# Inspect one of the scraped researchers
researchers[14]

# Scrape citation statistics from Google Scholar profile pages
url_gs = 'http://scholar.google.com/citations?user='
pattern_gscount = re.compile(r'(\d+)')

def get_google_scholar_counts(google_scholar_id):
    response = requests.get(url_gs + google_scholar_id, headers=headers)
    # zip truncates at six keys, so this picks up the first six integers
    # in the profile page source, i.e., the citation statistics table
    counts = dict(zip(['citations', 'citations5',
                       'h-index', 'h-index5',
                       'i10-index', 'i10-index5'],
                      map(int, pattern_gscount.findall(response.text))))
    return counts

# Check with Yong-Yeol Ahn's profile
get_google_scholar_counts('US7OSNgAAAAJ')

# Get data from Google Scholar
for researcher in researchers:
    if 'citations' not in researcher:
        print(researcher['name'])
        researcher.update(
            get_google_scholar_counts(researcher['googlescholar']))
        time.sleep(5)

# Save just in case
with open('researchers.json', 'w') as fid:
    json.dump(researchers, fid)

# Twitter authentication
auth = tweepy.OAuthHandler(config.get('twitter', 'consumer_key'),
                           config.get('twitter', 'consumer_secret'))
auth.set_access_token(config.get('twitter', 'access_token'),
                      config.get('twitter', 'access_secret'))
api = tweepy.API(auth)

# Function to download data from Twitter profiles
def get_twitter_count(twitter_id):
    try:
        user = api.get_user(screen_name=twitter_id)
        counts = {
            'Followers count': user.followers_count,
            'Friends count': user.friends_count,
            'Statuses count': user.statuses_count,
        }
        return counts
    except Exception:
        print('Problem with ' + twitter_id)
        return {}

# Test with Finn Aarup Nielsen (fnielsen)
get_twitter_count('fnielsen')

# Download data from Twitter
for researcher in researchers:
    researcher.update(get_twitter_count(researcher['twitter']))
    print(researcher['name'])
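# Hedged aside, not part of the original run: on longer researcher lists
# the loop above can exhaust Twitter's API rate limit, and the bare
# `except Exception` in get_twitter_count silently swallows that error.
# A minimal sketch of a more patient variant, assuming a tweepy version
# that supports the wait_on_rate_limit flag (tweepy 3.2 and later):
api_patient = tweepy.API(auth, wait_on_rate_limit=True)

def get_twitter_count_patient(twitter_id):
    # Same fields as get_twitter_count above, but lets tweepy sleep
    # through rate-limit windows instead of failing the lookup.
    user = api_patient.get_user(screen_name=twitter_id)
    return {
        'Followers count': user.followers_count,
        'Friends count': user.friends_count,
        'Statuses count': user.statuses_count,
    }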
# Save just in case
with open('researchers.json', 'w') as fid:
    json.dump(researchers, fid)

researchers = json.load(open('researchers.json'))
researchers[0]

# Pandas!
df = pd.DataFrame(researchers)

%matplotlib inline

# isnan: Houston, we've had a problem. Keep only rows with a nonzero,
# non-missing citation count
indices = (~np.isnan(df['citations'])) & (df['citations'] != 0)
reverse_index = indices[indices].index.values

# Plot the data, with marker size scaled by the number of tweets
matplotlib.rc('font', family='DejaVu Sans')
ax = df.plot(x='citations', y='Followers count', kind='scatter',
             figsize=(15, 10), marker='*', s=df['Statuses count'] / 10,
             linewidth=2, color=(0.8, 0.8, 0.8))
ax.set_xscale('log')
ax.set_yscale('log')
plt.xlabel('Google Scholar citations')
plt.ylabel('Twitter followers count')
plt.title('Kardashian index for Brede Wiki researchers '
          'on Google Scholar and Twitter')

# Power-law fit, i.e., a linear fit in log-log space
p = np.polyfit(np.log(df.loc[indices, 'citations']),
               np.log(df.loc[indices, 'Followers count']), 1)
powerlaw = np.frompyfunc(lambda x: np.exp(p[1]) * x ** p[0], 1, 1)
plt.plot([1, 200000], powerlaw([1, 200000]), linewidth=5, color=(0.5, 1, 0.5))
plt.text(10, 5000, '{:.3} x citations^{:0.2}'.format(np.exp(p[1]), p[0]),
         fontsize=20)

# Annotate some of the points with researcher names; the convex hull
# picks out the extreme researchers
hull = scipy.spatial.ConvexHull(
    df.loc[indices, ['citations', 'Followers count']].values)
for index in hull.vertices:
    x, y, name = df.loc[reverse_index[index],
                        ['citations', 'Followers count', 'name']].values
    try:
        plt.text(x, y, name,
                 horizontalalignment='center', verticalalignment='center')
    except Exception:
        pass

# Annotate myself, Ryota Tomioka, et al.
family_names = ['Nielsen', 'Tomioka', 'Willighagen']
for family_name in family_names:
    x, y, name = df.loc[df['family_name'] == family_name,
                        ['citations', 'Followers count',
                         'name']].values.flatten()
    plt.text(x, y, name,
             horizontalalignment='center', verticalalignment='center')
dummy = plt.axis((1, 200000, 1, 20000))
plt.show()

df.describe()

# Kardashian index relative to the fitted power law: the ratio between the
# actual followers count and the count predicted from the citation count
df['K-index'] = df['Followers count'] / powerlaw(df['citations'])

# Identify the 'scientific Kardashians'
high_score = df[indices].sort_values(by='K-index', ascending=False)[
    ['name', 'K-index', 'Statuses count']]
high_score

# Not all rows are shown above. The code below gives the full list:
# https://stackoverflow.com/questions/23388810/ipython-notebook-output-cell-is-truncating-contents-of-my-list
from IPython.display import HTML
HTML(high_score.to_html())
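# Hedged aside, not in the original analysis: Hall's paper ("The
# Kardashian index: a measure of discrepant social media profile for
# scientists", Genome Biology, 2014) defines the expected followers count
# directly as F = 43.3 * C ** 0.32 rather than fitting a power law to the
# sample, as done above. A minimal sketch of the canonical variant, for
# comparison:

def hall_k_index(followers, citations):
    """Kardashian index with Hall's published coefficients."""
    expected_followers = 43.3 * citations ** 0.32
    return followers / expected_followers

# Works elementwise on the pandas columns; restricting to `indices`
# avoids division-by-zero for researchers without citation data
df['K-index (Hall)'] = hall_k_index(df.loc[indices, 'Followers count'],
                                    df.loc[indices, 'citations'])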