#Grab the data file from a copied URL
#!curl http://results.ref.ac.uk/\(S\(jjedtxoydmmvwidxuktryu15\)\)/DownloadFile/AllResults/xlsx > ref2014.xlsx

import pandas as pd
xls=pd.ExcelFile('ref2014.xlsx')

#How many sheets are there?
xls.sheet_names

#Quick preview of the data
df=pd.read_excel('ref2014.xlsx')
df[:10]

#We have some metadata rows so let's try again...
df=pd.read_excel('ref2014.xlsx',header=6)
df[:5]

#What are the columns?
for col in df.columns: print(col)

df['Profile'].unique()

#What are the units of assessment?
df['Unit of assessment name'].unique()

#How many institutions?
df['Institution code (UKPRN)'].unique().size

#How many institutions by unit of assessment, ordered?
#Rather than mulitpl count, let's base this on the Overall profile results
df[df['Profile']=='Overall'].groupby(['Unit of assessment name']).size().order(ascending=False)

#Which institutions submitted to most Units of Assessment?
df[df['Profile']=='Overall'].groupby(['Institution name']).size().order(ascending=False)

#Can an institution have more than one submission to the same Units of Assessment?
df[df['Profile']=='Overall'].groupby(['Institution name','Unit of assessment name']).size().order(ascending=False)[:10]

dfo=df[df['Profile']=='Overall']
dfo[(dfo['Institution name']=='King\'s College London') & (dfo['Unit of assessment name']=='Allied Health Professions, Dentistry, Nursing and Pharmacy')]

dfo[dfo['Unit of assessment name']=='Civil and Construction Engineering'].sort(['4*','3*','2*','1*','unclassified'],
                                                                               ascending=False)[:5]

dfi=df[df['Profile']=='Impact']
dfi[dfi['Unit of assessment name']=='Civil and Construction Engineering'].sort(['4*','3*','2*','1*','unclassified'],
                                                                               ascending=False)[:5]

def rankUoAbyProfile(uoa,profile):
    tmp=df[df['Profile']==profile]
    tmp=tmp[tmp['Unit of assessment name']==uoa].sort(['4*','3*','2*','1*','unclassified'],ascending=False)
    return tmp

rankUoAbyProfile('Computer Science and Informatics','Environment')[:5]

def rankUoAbySize(uoa,profile='Overall'):
    tmp=df[df['Profile']==profile]
    tmp=tmp[tmp['Unit of assessment name']==uoa].sort(['FTE Category A staff submitted','4*','3*','2*','1*','unclassified'],ascending=False)
    return tmp

rankUoAbySize('History')[:5]

def internalSize(hei):
    tmp=dfo[dfo['Institution name']==hei]
    tmp=tmp.sort(['FTE Category A staff submitted','4*','3*','2*','1*','unclassified'],ascending=False)
    return tmp

internalSize("Open University")

def internalRanking(hei):
    tmp=dfo[dfo['Institution name']==hei]
    tmp=tmp.sort(['4*','3*','2*','1*','unclassified'],ascending=False)
    return tmp

internalRanking("Open University")