#Grab the data file from a copied URL #!curl http://results.ref.ac.uk/\(S\(jjedtxoydmmvwidxuktryu15\)\)/DownloadFile/AllResults/xlsx > ref2014.xlsx import pandas as pd xls=pd.ExcelFile('ref2014.xlsx') #How many sheets are there? xls.sheet_names #Quick preview of the data df=pd.read_excel('ref2014.xlsx') df[:10] #We have some metadata rows so let's try again... df=pd.read_excel('ref2014.xlsx',header=6) df[:5] #What are the columns? for col in df.columns: print(col) df['Profile'].unique() #What are the units of assessment? df['Unit of assessment name'].unique() #How many institutions? df['Institution code (UKPRN)'].unique().size #How many institutions by unit of assessment, ordered? #Rather than mulitpl count, let's base this on the Overall profile results df[df['Profile']=='Overall'].groupby(['Unit of assessment name']).size().order(ascending=False) #Which institutions submitted to most Units of Assessment? df[df['Profile']=='Overall'].groupby(['Institution name']).size().order(ascending=False) #Can an institution have more than one submission to the same Units of Assessment? df[df['Profile']=='Overall'].groupby(['Institution name','Unit of assessment name']).size().order(ascending=False)[:10] dfo=df[df['Profile']=='Overall'] dfo[(dfo['Institution name']=='King\'s College London') & (dfo['Unit of assessment name']=='Allied Health Professions, Dentistry, Nursing and Pharmacy')] dfo[dfo['Unit of assessment name']=='Civil and Construction Engineering'].sort(['4*','3*','2*','1*','unclassified'], ascending=False)[:5] dfi=df[df['Profile']=='Impact'] dfi[dfi['Unit of assessment name']=='Civil and Construction Engineering'].sort(['4*','3*','2*','1*','unclassified'], ascending=False)[:5] def rankUoAbyProfile(uoa,profile): tmp=df[df['Profile']==profile] tmp=tmp[tmp['Unit of assessment name']==uoa].sort(['4*','3*','2*','1*','unclassified'],ascending=False) return tmp rankUoAbyProfile('Computer Science and Informatics','Environment')[:5] def rankUoAbySize(uoa,profile='Overall'): tmp=df[df['Profile']==profile] tmp=tmp[tmp['Unit of assessment name']==uoa].sort(['FTE Category A staff submitted','4*','3*','2*','1*','unclassified'],ascending=False) return tmp rankUoAbySize('History')[:5] def internalSize(hei): tmp=dfo[dfo['Institution name']==hei] tmp=tmp.sort(['FTE Category A staff submitted','4*','3*','2*','1*','unclassified'],ascending=False) return tmp internalSize("Open University") def internalRanking(hei): tmp=dfo[dfo['Institution name']==hei] tmp=tmp.sort(['4*','3*','2*','1*','unclassified'],ascending=False) return tmp internalRanking("Open University")