#!/usr/bin/env python
# coding: utf-8

# # Marking Models
#
# Simple sketches showing different possible mark distributions.
#
# Note that this is a working notebook, off the top of my head, and I haven't
# necessarily checked things properly... That said, even if there are errors,
# much of the stuff contained in the notebook could be useful when it comes to
# doing things properly...

# In[1]:


# pandas is a package for working with 2d tabular data
import pandas as pd

# numpy is a package for doing things with numbers...
import numpy as np

# ipywidgets are interactive widgets
from ipywidgets import interact

# Plotting support
import matplotlib.pyplot as plt

# seaborn is a statistical charting package
import seaborn as sns

# Inline charting
get_ipython().run_line_magic('matplotlib', 'inline')


# ## Simple correlations
#
# Derive random distribution of OCAS vs OES with a known correlation:

# In[2]:


# Based on https://stackoverflow.com/a/18684433/454773
def simpleCorrelation(corr=1.0, lower_ocas=30, upper_ocas=95,
                      lower_oes=10, upper_oes=90, size=1000):
    """Sample correlated OCAS / OES marks from a bivariate normal.

    The mean of each score is the midpoint of its (lower, upper) pair and
    the standard deviation is one sixth of that range, so the two limits
    sit three standard deviations either side of the mean. They are not
    hard limits: individual samples can fall outside them.

    Parameters
    ----------
    corr : float
        Correlation coefficient between OCAS and OES.
    lower_ocas, upper_ocas : number
        Limits used to derive the OCAS mean and standard deviation.
    lower_oes, upper_oes : number
        Limits used to derive the OES mean and standard deviation.
    size : int
        Number of (OCAS, OES) pairs to draw.

    Returns
    -------
    pandas.DataFrame
        Columns ``OCAS`` and ``OES`` (truncated to integers) and ``Rank``,
        the unweighted mean of the two truncated scores.
    """
    ocas = np.array([lower_ocas, upper_ocas])
    oes = np.array([lower_oes, upper_oes])

    means = [ocas.mean(), oes.mean()]
    # The std of a two-point array is half its range; dividing by 3 puts
    # the limits at +/- 3 standard deviations.
    stds = [ocas.std() / 3, oes.std() / 3]
    cross = stds[0] * stds[1] * corr
    covs = [[stds[0] ** 2, cross],
            [cross, stds[1] ** 2]]

    samples = np.random.multivariate_normal(means, covs, size)
    df = pd.DataFrame(samples, columns=['OCAS', 'OES'])
    df['OCAS'] = df['OCAS'].astype(int)
    df['OES'] = df['OES'].astype(int)
    df['Rank'] = (df['OCAS'] + df['OES']) / 2
    return df


# Preview some of the data that is returned:

# In[3]:


df = simpleCorrelation(0.9)
df.head()


# We can plot the data, with distributions, easily enough:

# In[4]:


# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g = sns.jointplot(x="OCAS", y="OES", data=df,
                  kind="reg", truncate=False,
                  xlim=(0, 100), ylim=(0, 100),
                  color="blue", height=7)


# Do a thing to set boundaries:

# In[5]:


def get_spirit_of_rank(classification, ocas, oes, rank):
    """Generate spirit of rank lines.

    Grades based on:
    - the minimum final OCAS for that status;
    - the minimum final OES for that status;
    - the minimum Rank score for that status

    Rank defined for each student as the weighted average of their final
    OCAS and final OES.

    Rank > OCAS, Rank > OES

    To achieve a certain result status, a student must meet all three
    boundary scores.

    Parameters
    ----------
    classification : label stored in the ``Class`` column of every row.
    ocas, oes, rank : the three boundary scores for that classification.

    Returns
    -------
    pandas.DataFrame
        Four rows with columns ``Class``, ``x`` and ``y``: the vertices of
        the boundary polyline, with OCAS on x and OES on y.
    """
    # Where the line (x + y) / 2 == rank meets y == oes and x == ocas.
    modx = (2 * rank) - oes
    mody = (2 * rank) - ocas

    _df = pd.DataFrame({'Class': classification,
                        'x': [100, modx, ocas, ocas],
                        'y': [oes, oes, mody, 100]})
    return _df


sor = get_spirit_of_rank('class1', 85, 79, 87)
sor


# These can be overplotted on the distribution:

# In[6]:


jointplot = sns.jointplot(x="OCAS", y="OES", data=df,
                          kind="reg", truncate=False,
                          xlim=(0, 100), ylim=(0, 100),
                          color="blue", height=7)

sor1 = get_spirit_of_rank('class1', 85, 80, 87)
plt.plot(sor1['x'], sor1['y'], color='red', linewidth=2);


# Start to put together a thing to let us set grade boundaries (perhaps more
# convenient to make this interactive):

# In[7]:


# The numbers are set as they are to help debugging...
boundaries = pd.DataFrame([(1, 80, 85, 90),
                           (2, 65, 70, 75),
                           (3, 50, 55, 60),
                           (4, 35, 40, 45)],
                          columns=['class', 'oes', 'ocas', 'rank'])
boundaries


# We can then overplot the grade boundaries onto the distribution.
#
# If we also add drop lines, these make it easier to see "false grade" areas,
# eg where incorrect grades are assigned if we just look at OCAS scores.
# In[8]:


# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
jointplot = sns.jointplot(x="OCAS", y="OES", data=df,
                          kind="reg", truncate=False,
                          xlim=(0, 100), ylim=(0, 100),
                          color="blue", height=7)


def overplot(row):
    """Draw one grade boundary, plus its drop line, on the current axes.

    ``row`` is one row of ``boundaries``; its ``class``, ``ocas``, ``oes``
    and ``rank`` values are read.
    """
    _sor = get_spirit_of_rank(row['class'], row['ocas'],
                              row['oes'], row['rank'])
    plt.plot(_sor['x'], _sor['y'], color='red', linewidth=2)

    # The drop lines help us identify misclassifications
    drop_x = _sor['x'].iloc[2]
    plt.plot([drop_x, drop_x], [0, _sor['y'].iloc[2]], color='lightgrey')


boundaries.apply(overplot, axis=1);


# We can start to make a more interactive explorer, adding in a component
# that allows us to start to play with the distributions:

# In[9]:


@interact(corr=(0, 1, 0.05),
          lower_ocas=(0, 50, 1), upper_ocas=(50, 100, 1),
          lower_oes=(0, 50, 1), upper_oes=(50, 100, 1),
          )
def corrdata(corr=0.9, lower_ocas=30, upper_ocas=85,
             lower_oes=10, upper_oes=80):
    """Interactive correlation plot display."""
    df = simpleCorrelation(corr, lower_ocas, upper_ocas,
                           lower_oes, upper_oes)
    sns.jointplot(x="OCAS", y="OES", data=df,
                  kind="reg", truncate=False,
                  xlim=(0, 100), ylim=(0, 100),
                  color="blue", height=7)
    boundaries.apply(overplot, axis=1);


# ### Identifying misapplied grades
#
# We can annotate the original marks dataframe with awards based on grade
# boundaries or just OCAS marks:
#
# First, let's get a simple OCAS based grade:

# In[10]:


# Open-ended outer bins: sampled marks can fall outside 0..100, and a mark
# that lands in no bin becomes NaN, which cannot be cast to int.
ocas_bins = [-np.inf] + boundaries['ocas'].values[::-1].tolist() + [np.inf]

# right=False makes each bin [lower, upper), so a mark exactly on a boundary
# earns that grade, matching the >= comparisons used by grader().
df['ocas_grade'] = pd.cut(df['OCAS'], bins=ocas_bins, right=False,
                          labels=[5, 4, 3, 2, 1]).astype(int)
df


# The complete grade calculation is more complex (is there a more Pythonic
# way of doing this?):

# In[19]:


def grader(row, enforce_rank=True):
    """Return the overall grade for one row of marks.

    Boundaries are tried in the order they appear in ``boundaries``; the
    ``class`` of the first one satisfied is returned, or 5 if none is.

    A boundary is satisfied when either:
    - both the OES and OCAS boundary scores are met (and, if
      ``enforce_rank`` is true, ``row['Rank']`` also meets the rank
      boundary); or
    - the mean of OES and OCAS meets the rank boundary and at least one
      of the OES / OCAS boundary scores is met.

    NOTE(review): the indentation of this function was lost in the source,
    so the rank-based fallback is assumed to sit at loop level rather than
    inside the first test -- confirm against the original notebook. As
    written, a row can be graded without meeting all three boundary scores.
    """
    # Loop invariant, so computed once rather than per boundary.
    _rank = (row['OES'] + row['OCAS']) / 2

    for _, boundary in boundaries.iterrows():
        oes_ok = row['OES'] >= boundary['oes']
        ocas_ok = row['OCAS'] >= boundary['ocas']

        if oes_ok and ocas_ok:
            if not enforce_rank or row['Rank'] >= boundary['rank']:
                return boundary['class']

        if (oes_ok or ocas_ok) and _rank >= boundary['rank']:
            return boundary['class']

    return 5


# In[12]:


df['overall_grade'] = df.apply(grader, axis=1)
df


# In[13]:


# 1 where the OCAS-only grade is better (numerically lower) than the
# overall grade, 0 otherwise.
df['misgrade'] = (df['ocas_grade'] < df['overall_grade']).astype(int)
df


# Update the original chart with coloured misgrades:

# In[20]:


# NOTE(review): cmap is not passed to any plotting call visible here.
cmap = sns.cubehelix_palette(5, light=1, hue=1, as_cmap=True)


@interact(corr=(0, 1, 0.05),
          lower_ocas=(0, 50, 1), upper_ocas=(50, 100, 1),
          lower_oes=(0, 50, 1), upper_oes=(50, 100, 1),
          enforce_rank=True
          )
def corrdata(corr=0.9, lower_ocas=30, upper_ocas=85,
             lower_oes=10, upper_oes=80, enforce_rank=True):
    """Interactive correlation plot display with misgrades coloured."""
    df = simpleCorrelation(corr, lower_ocas, upper_ocas,
                           lower_oes, upper_oes)

    # Same labels as every other OCAS grading cell (lowest grade is 5).
    df['ocas_grade'] = pd.cut(df['OCAS'], bins=ocas_bins, right=False,
                              labels=[5, 4, 3, 2, 1]).astype(int)
    df['overall_grade'] = df.apply(grader, enforce_rank=enforce_rank, axis=1)
    df['misgrade'] = (df['ocas_grade'] < df['overall_grade']).astype(int)

    # scatter=False: the points are drawn separately below so that they can
    # be coloured by the misgrade flag.
    g = sns.jointplot(x="OCAS", y="OES", data=df,
                      kind="reg", truncate=False,
                      xlim=(0, 100), ylim=(0, 100),
                      color="blue", height=7, scatter=False)
    g.ax_joint.scatter("OCAS", "OES", c="misgrade", marker="o",
                       data=df, s=10)
    boundaries.apply(overplot, axis=1);


# We can also do a plot to show areas where grades are misapplied and by how
# many grades.
#
# First, create a grid of all grades, calculate the "correct" grade and the
# grade derived purely from OCAS scores, and compare them:

# In[15]:


# Generate every OCAS/OES combination
# Do we want to enforce the rank condition?
enforce_rank = True

# Every integer (OCAS, OES) pair from 1 to 99 inclusive.
all_scores = [(x, y) for x in range(1, 100) for y in range(1, 100)]

df_all = pd.DataFrame(all_scores, columns=['OCAS', 'OES'])
df_all['Rank'] = (df_all['OCAS'] + df_all['OES']) / 2

# right=False makes each bin [lower, upper), so a mark exactly on a boundary
# earns that grade, matching the >= comparisons used by grader().
df_all['ocas_grade'] = pd.cut(df_all['OCAS'], bins=ocas_bins, right=False,
                              labels=[5, 4, 3, 2, 1]).astype(int)
df_all['overall_grade'] = df_all.apply(grader, enforce_rank=enforce_rank,
                                       axis=1)

# Positive values mean the OCAS-only grade is better (numerically lower)
# than the overall grade.
df_all['misgrade_amount'] = df_all['overall_grade'] - df_all['ocas_grade']
df_all['misgrade'] = (df_all['misgrade_amount'] != 0)


# Chart the result:

# In[16]:


plt.rcParams["figure.figsize"] = (15, 15)

ax = sns.scatterplot(x="OCAS", y="OES", style='misgrade',
                     hue="misgrade_amount", data=df_all)
# Place the legend outside the plot area, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

boundaries.apply(overplot, axis=1);


# ## Interactive Charts
#
# Note that if we want interactive charts, they're easy enough to create, eg
# using the `plotly` package:

# In[17]:


import plotly.express as px

fig = px.scatter(x=df['OCAS'], y=df["OES"], color=df['misgrade'])
fig.show()


# In[ ]: