import pandas as pd, numpy as np, json
# Download the ARWU ranking page for every year from 2003 through 2014.
data = {}
for year in range(2003, 2015):
    url = 'http://www.shanghairanking.com/ARWU' + str(year) + '.html'
    # read_html yields a list of tables; keep the first one, with its
    # columns sorted by label.
    tables = pd.read_html(url, infer_types=False, header=0)
    data[year] = tables[0].sort_index(axis=1)
    print(year)  # progress indicator
# save input data for later editing
# Each yearly table is converted to its JSON string form and the whole
# year -> JSON-string mapping is written to data.json.
DATA = {}
for year in range(2003, 2015):
    DATA[year] = data[year].to_json()
# Serialise before opening the file so that a serialisation error does not
# leave a truncated data.json behind, and use a context manager so the
# handle is flushed and closed deterministically.
_payload = json.dumps(DATA)
with open('data.json', 'w') as _out:
    _out.write(_payload)
# load data if already saved
# The context manager closes data.json as soon as it has been parsed.
with open('data.json') as _src:
    DATA = json.loads(_src.read())
# Rebuild the year -> DataFrame mapping. The saved mapping is keyed by the
# string form of each year, so the lookup converts the integer year first.
data = {}
for year in range(2003, 2015):
    data[year] = pd.read_json(DATA[str(year)]).sort_index(axis=1)
# Build two views of the rankings:
#   u: year -> list of {'name': ..., 'rank': ...} entries for that year
#   v: university name -> {'years': [...], 'ranks': [...]} across all years
u = {}
v = {}
# The institution-name column is labelled differently depending on the year;
# locs[n] is the label used by the years listed in years[n].
locs = ['Institution', u'Institution*']
years = [[2003, 2004, 2011], [2005, 2006, 2007, 2008, 2009, 2010, 2012, 2013, 2014]]
for name_column, year_group in zip(locs, years):
    for year in year_group:
        table = data[year]
        entries = []
        # Walk row labels 0 .. (number of non-null 'World Rank' cells) - 1.
        for row_label in range(table['World Rank'].count()):
            name = table.loc[row_label][name_column]
            history = v.setdefault(name, {'years': [], 'ranks': []})
            history['years'].append(year)
            rank = table.loc[row_label]['World Rank']
            history['ranks'].append(rank)
            entries.append({'name': name, 'rank': rank})
        u[year] = entries
# Names for which every geocoding service failed.
exceptions = []
# geocode uni names and create list with unis
from geopy.geocoders import Bing
from geopy.geocoders import GoogleV3
from geopy.geocoders import OpenMapQuest
from geopy.geocoders import Nominatim
geolocator_n = Nominatim()
geolocator_q = OpenMapQuest()
geolocator_g = GoogleV3()
# NOTE(review): this API key is hard-coded in the source; consider loading it
# from configuration instead of committing it.
geolocator_b = Bing('AiQdfYGfIiDP0FXKQ3yQ3NXHOZBPuSVZJzpJzu1641ffd9GkzBbS_yblwqPym2WR')
# Services in the order they are tried; the first one that yields a location
# wins and the remaining ones are skipped.
_geocoder_chain = [
    ('MapQuest', geolocator_q),
    ('Google', geolocator_g),
    ('Nominatim', geolocator_n),
    ('Bing', geolocator_b),
]
counter = 0
for k in v:
    counter += 1
    for _service, _geolocator in _geocoder_chain:
        try:
            location = _geolocator.geocode(k)
        except Exception:
            # Any service error means "try the next service". Catching
            # Exception rather than using a bare except lets Ctrl-C and
            # SystemExit stop the run instead of being swallowed.
            continue
        if location is None:
            # The service answered but found no match for this name.
            continue
        v[k]["coord"] = (location.latitude, location.longitude)
        print('OK %s %s' % (_service, counter))
        break
    else:
        # No service resolved this name: record it and show it with its index.
        exceptions.append(k)
        print('%s %s' % (k, counter))
# Hard-coded fallback coordinates (latitude, longitude), keyed by university
# name, applied to entries that still have no 'coord' value.
geohelper = {
    'The Imperial College of Science, Technology and Medicine': (51.500229, -0.178940),
    'University of Manchester Institute of Science and Technology': (53.470741, -2.235570),
    'University of the Mediterranean (Aix-Marseille 2)': (43.299916, 5.374818),
}
for k in v:
    # Only fill in names the table knows about. An unconditional lookup would
    # raise KeyError on any other unresolved name and the report loop below
    # would never get to list it.
    if 'coord' not in v[k] and k in geohelper:
        v[k]['coord'] = geohelper[k]
# Report every university that is still missing coordinates.
for k in v:
    if 'coord' not in v[k]:
        print(k)
# db ok
# save results
# Each structure is serialised before its file is opened, so a serialisation
# error cannot leave a truncated output file, and the context manager closes
# the handle deterministically.
_rankings_json = json.dumps(u)  # rankings
with open('u.json', 'w') as _out:
    _out.write(_rankings_json)
_universities_json = json.dumps(v)  # universities
with open('v.json', 'w') as _out:
    _out.write(_universities_json)