#!/usr/bin/env python
# coding: utf-8

# # Marking Models
# 
# Simple sketches showing different possible mark distributions.
# 
# Note that this is a working notebook, off the top of my head, and I haven't necessarily checked things properly... That said, even if there are errors, much of the stuff contained in the notebook could be useful when it comes to doing things properly... 

# In[1]:


# pandas is a package for working with 2d tabular data
import pandas as pd

# numpy is a package for doing things with numbers...
import numpy as np

# ipywidgets are interactive widgets
from ipywidgets import interact

# Plotting support
import matplotlib.pyplot as plt

# seaborn is a statistical charting package
import seaborn as sns

# Inline charting
get_ipython().run_line_magic('matplotlib', 'inline')


# ## Simple correlations
# 
# Derive random distribution of OCAS vs OES with a known correlation:

# In[2]:


# Based on https://stackoverflow.com/a/18684433/454773
def simpleCorrelation(corr=1.0,
                      lower_ocas=30, upper_ocas=95,
                      lower_oes=10, upper_oes=90,
                      n=1000, seed=None):
    """Sample correlated OCAS / OES marks from a bivariate normal distribution.

    For each score the lower/upper limits are placed three standard
    deviations either side of the mean: the mean is the midpoint of the two
    limits and the standard deviation is one sixth of their range. A small
    fraction of the samples can therefore still fall outside the limits.

    Args:
        corr: correlation coefficient used to build the covariance matrix.
        lower_ocas, upper_ocas: -3 sd / +3 sd limits for the OCAS marks.
        lower_oes, upper_oes: -3 sd / +3 sd limits for the OES marks.
        n: number of (OCAS, OES) pairs to draw.
        seed: optional seed for a private random generator; when None the
            global numpy random state is used, as before.

    Returns:
        A DataFrame with integer 'OCAS' and 'OES' columns and a 'Rank'
        column holding the mean of the two marks.
    """
    ocas = np.array([lower_ocas, upper_ocas])
    oes = np.array([lower_oes, upper_oes])

    means = [ocas.mean(), oes.mean()]
    # The std of a two-point array is half its range, so dividing by 3 puts
    # each limit three standard deviations from the mean.
    stds = [ocas.std() / 3, oes.std() / 3]
    covariance = stds[0] * stds[1] * corr
    covs = [[stds[0]**2, covariance],
            [covariance, stds[1]**2]]

    # Only build a dedicated generator when a seed is requested, so existing
    # callers keep drawing from numpy's global random state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    df = pd.DataFrame(rng.multivariate_normal(means, covs, n),
                      columns=['OCAS', 'OES'])

    # Marks are whole numbers; the cast truncates towards zero
    df['OCAS'] = df['OCAS'].astype(int)
    df['OES'] = df['OES'].astype(int)

    df['Rank'] = (df['OCAS'] + df['OES']) / 2
    return df


# Preview some of the data that is returned:

# In[3]:


# Sample marks with a 0.9 OCAS/OES correlation (default limits) and preview
# the first few rows
df = simpleCorrelation(0.9)
df.head()


# We can plot the data, with distributions, easily enough:

# In[4]:


# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and from 0.12 accepts only `data` positionally.
g = sns.jointplot(x="OCAS", y="OES", data=df,
                  kind="reg", truncate=False,
                  xlim=(0, 100), ylim=(0, 100),
                  color="blue", height=7)


# Do a thing to set boundaries:

# In[5]:


def get_spirit_of_rank(classification, ocas, oes, rank):
    """Build the four corner points of a grade boundary outline.

    A result status is described by three minimum scores:
      - `ocas`: the minimum final OCAS;
      - `oes`: the minimum final OES;
      - `rank`: the minimum Rank, where Rank is the average of a student's
        final OCAS and final OES.

    The boundaries are expected to satisfy Rank > OCAS and Rank > OES, and
    a student has to meet all three to achieve the status.

    The outline runs along the OES minimum from x=100, follows the rank
    line diagonally, then runs up the OCAS minimum to y=100.

    Returns a DataFrame with columns 'Class' (the `classification` value
    repeated on every row), 'x' (OCAS coordinates) and 'y' (OES coordinates).
    """
    # Rank is (OCAS + OES) / 2, so points on the rank line have
    # OCAS + OES == 2 * rank.
    doubled_rank = 2 * rank
    ocas_at_min_oes = doubled_rank - oes
    oes_at_min_ocas = doubled_rank - ocas

    xs = [100, ocas_at_min_oes, ocas, ocas]
    ys = [oes, oes, oes_at_min_ocas, 100]
    return pd.DataFrame({'Class': classification, 'x': xs, 'y': ys})

# Example outline: minimum OCAS 85, minimum OES 79, minimum rank 87
sor = get_spirit_of_rank('class1', 85, 79, 87)
sor


# These can be overplotted on the distribution:

# In[6]:


# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and from 0.12 accepts only `data` positionally.
jointplot = sns.jointplot(x="OCAS", y="OES", data=df,
                          kind="reg", truncate=False,
                          xlim=(0, 100), ylim=(0, 100),
                          color="blue", height=7)

# Overlay a single example boundary outline in red
sor1 = get_spirit_of_rank('class1', 85, 80, 87)
plt.plot(sor1['x'], sor1['y'], color='red', linewidth=2);


# Start to put together a thing to let us set grade boundaries (perhaps more convenient to make this interactive):

# In[7]:


# The numbers are set as they are to help debugging...
# One entry per class in each column; class 1 carries the highest minimums
boundaries = pd.DataFrame({'class': [1, 2, 3, 4],
                           'oes': [80, 65, 50, 35],
                           'ocas': [85, 70, 55, 40],
                           'rank': [90, 75, 60, 45]})
boundaries


# We can then overplot the grade boundaries onto the distribution.
# 
# If we also add drop lines, these make it easier to see "false grade" areas, eg where incorrect grades are assigned if we just look at OCAS scores.

# In[8]:


# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and from 0.12 accepts only `data` positionally.
jointplot = sns.jointplot(x="OCAS", y="OES", data=df,
                          kind="reg", truncate=False,
                          xlim=(0, 100), ylim=(0, 100),
                          color="blue", height=7)

def overplot(row):
    """Draw one grade boundary outline, plus its drop line, via pyplot.

    `row` must provide 'class', 'ocas', 'oes' and 'rank' entries, e.g. a
    row of the `boundaries` table.
    """
    outline = get_spirit_of_rank(row['class'], row['ocas'], row['oes'], row['rank'])
    xs, ys = outline['x'], outline['y']
    plt.plot(xs, ys, color='red', linewidth=2)

    # The drop lines help us identify misclassifications: a vertical grey
    # line from the x-axis up to the outline's third corner.
    drop_x, drop_top = xs.iloc[2], ys.iloc[2]
    plt.plot([drop_x, drop_x], [0, drop_top], color='lightgrey')
    
# Draw every boundary row; the trailing semicolon stops the notebook from
# echoing the result of apply()
boundaries.apply(overplot, axis=1);


# We can start to make a more interactive explorer, adding in a component that allows us to start to play with the distributions:

# In[9]:


@interact(corr=(0, 1, 0.05), 
          lower_ocas=(0, 50, 1),
          upper_ocas=(50, 100, 1),
          lower_oes=(0, 50, 1),
          upper_oes=(50, 100, 1),
         )
def corrdata(corr=0.9,
             lower_ocas=30, upper_ocas=85, 
             lower_oes=10, upper_oes=80):
    """Interactive correlation plot display.

    Each (min, max, step) tuple passed to `interact` becomes a slider.
    Whenever a slider changes, a fresh sample is drawn with
    `simpleCorrelation`, plotted, and overlaid with the grade boundaries.
    """
    df = simpleCorrelation(corr, lower_ocas, upper_ocas, lower_oes, upper_oes)
    # x and y are passed by keyword: seaborn deprecated positional x/y in
    # 0.11 and from 0.12 accepts only `data` positionally.
    sns.jointplot(x="OCAS", y="OES", data=df,
                  kind="reg", truncate=False,
                  xlim=(0, 100), ylim=(0, 100),
                  color="blue", height=7)
    boundaries.apply(overplot, axis=1);


# ### Identifying misapplied grades
# 
# We can annotate the original marks dataframe with awards based on grade boundaries or just OCAS marks:
# 
# First, let's get a simple OCAS based grade:

# In[10]:


# Bin edges are the OCAS minimums in ascending order. The outer edges are
# open-ended so that marks of 0, or sampled marks that stray outside 0-100,
# still receive a grade rather than NaN (which would break astype(int)).
ocas_bins = [-np.inf] + sorted(boundaries['ocas'].tolist()) + [np.inf]

# Boundaries are *minimum* marks, so bins must be closed on the left
# (right=False): a mark exactly on a boundary earns the higher class,
# matching the >= comparisons used when grading against all the boundaries.
df['ocas_grade'] = pd.cut(df['OCAS'], bins=ocas_bins, right=False,
                          labels=[5, 4, 3, 2, 1]).astype(int)
df


# The complete grade calculation is more complex (is there a more Pythonic way of doing this?):

# In[19]:


def grader(row, enforce_rank=True, grade_boundaries=None, fail_grade=5):
    """Return the overall class awarded for one row of marks.

    The boundary rows are checked in table order and the first class whose
    conditions are met is returned, so the table should list the highest
    class first.

    Args:
        row: mapping / Series with 'OES' and 'OCAS' entries; a 'Rank' entry
            is also read when `enforce_rank` is true and both component
            minimums are met.
        enforce_rank: when true, meeting both the OES and OCAS minimums only
            earns the class if row['Rank'] also meets the rank minimum.
        grade_boundaries: optional table with 'class', 'oes', 'ocas' and
            'rank' columns; defaults to the module-level `boundaries`.
        fail_grade: value returned when no class is achieved.

    Returns:
        The 'class' value of the first satisfied boundary row, otherwise
        `fail_grade`.
    """
    table = boundaries if grade_boundaries is None else grade_boundaries

    # The mean of the two marks does not depend on the boundary being
    # tested, so compute it once rather than on every iteration.
    mean_score = (row['OES'] + row['OCAS']) / 2

    for _, boundary in table.iterrows():
        meets_oes = row['OES'] >= boundary['oes']
        meets_ocas = row['OCAS'] >= boundary['ocas']

        if meets_oes and meets_ocas:
            if not enforce_rank or row['Rank'] >= boundary['rank']:
                return boundary['class']

        # NOTE(review): this fallback awards the class when the rank minimum
        # and only ONE of the component minimums are met. That looks at odds
        # with the "must meet all three boundary scores" rule described for
        # the boundary outlines; behaviour is kept as-is pending confirmation.
        if mean_score >= boundary['rank'] and (meets_oes or meets_ocas):
            return boundary['class']

    return fail_grade


# In[12]:


# Grade every row with grader() using its default enforce_rank=True
df['overall_grade'] = df.apply(grader, axis=1)
df


# In[13]:


# Flag (1/0) rows whose OCAS-only grade number is smaller than the overall
# grade number, i.e. OCAS alone would award a higher class (class 1 is top).
ocas_only_is_better = df['ocas_grade'] < df['overall_grade']
df['misgrade'] = ocas_only_is_better.astype(int)
df


# Update the original chart with coloured misgrades:

# In[20]:


# NOTE(review): cmap is created here but not passed to any plotting call in
# this cell - confirm whether the scatter below was meant to use it.
cmap = sns.cubehelix_palette(5, light=1, hue=1, as_cmap=True)
@interact(corr=(0, 1, 0.05), 
          lower_ocas=(0, 50, 1),
          upper_ocas=(50, 100, 1),
          lower_oes=(0, 50, 1),
          upper_oes=(50, 100, 1),
          enforce_rank = True
         )
def corrdata(corr=0.9,
             lower_ocas=30, upper_ocas=85, 
             lower_oes=10, upper_oes=80, enforce_rank=True):
    """Interactive correlation plot display with misgraded points coloured.

    A fresh sample is drawn for the current slider values, graded both by
    OCAS alone and by `grader`, and plotted with points coloured by whether
    the OCAS-only grade number is smaller than the overall grade number.
    """
    df = simpleCorrelation(corr, lower_ocas, upper_ocas, lower_oes, upper_oes)

    # Reuse the interior OCAS edges but leave the outer bins open-ended so
    # that marks of 0 or beyond 100 do not become NaN and break astype(int).
    grade_bins = [-np.inf] + list(ocas_bins[1:-1]) + [np.inf]
    # Boundaries are minimum marks, so bins are closed on the left: a mark
    # exactly on a boundary earns the higher class, as in grader(). The
    # lowest bin is labelled 5 to agree with the other cells and with
    # grader's fail grade.
    df['ocas_grade'] = pd.cut(df['OCAS'], bins=grade_bins, right=False,
                              labels=[5, 4, 3, 2, 1]).astype(int)

    df['overall_grade'] = df.apply(grader, enforce_rank=enforce_rank, axis=1)

    df['misgrade'] = df['ocas_grade'] < df['overall_grade']
    df['misgrade'] = df['misgrade'].astype(int)

    # x and y are passed by keyword: seaborn deprecated positional x/y in
    # 0.11 and from 0.12 accepts only `data` positionally. scatter=False
    # leaves the points to the coloured scatter drawn below.
    g = sns.jointplot(x="OCAS", y="OES", data=df,
                      kind="reg", truncate=False,
                      xlim=(0, 100), ylim=(0, 100),
                      color="blue",
                      height=7, scatter=False)

    g.ax_joint.scatter("OCAS", "OES", c="misgrade", marker="o", data=df,
                       s=10)
    boundaries.apply(overplot, axis=1);


# We can also do a plot to show areas where grades are misapplied and by how many grades.
# 
# First, create a grid of all grades, calculate the "correct" grade and the grade derived purely from OCAS scores, and compare them:

# In[15]:


#Generate every OCAS/OES combination

# Do we want to enforce the rank condition?
enforce_rank = True

# Marks run from 0 to 100 inclusive, so include both end points in the grid
all_scores = [(x, y) for x in range(0, 101) for y in range(0, 101)]
df_all = pd.DataFrame(all_scores, columns=['OCAS', 'OES'])
df_all['Rank'] = (df_all['OCAS'] + df_all["OES"]) / 2

# Reuse the interior OCAS edges with open-ended outer bins so that 0 and 100
# are graded too. Bins are closed on the left (right=False) because the
# boundaries are minimum marks: a mark exactly on a boundary earns the
# higher class, matching the >= comparisons in grader().
df_all['ocas_grade'] = pd.cut(df_all['OCAS'],
                              bins=[-np.inf] + list(ocas_bins[1:-1]) + [np.inf],
                              right=False,
                              labels=[5, 4, 3, 2, 1]).astype(int)

df_all['overall_grade'] = df_all.apply(grader, enforce_rank=enforce_rank, axis=1)

# Positive amounts mean the overall grade number is larger (a lower class)
# than the OCAS-only grade number; zero means the two agree.
df_all['misgrade_amount'] = df_all['overall_grade'] - df_all['ocas_grade']
df_all['misgrade'] = (df_all['misgrade_amount'] != 0)


# Chart the result:

# In[16]:


# Set the default figure size; rcParams is global, so this also applies to
# figures created after this cell
plt.rcParams["figure.figsize"] = (15, 15)

# One marker per OCAS/OES pair: colour encodes the size of the grade
# difference, marker style whether there is any difference at all
ax = sns.scatterplot(x="OCAS", y="OES", style='misgrade',
                hue="misgrade_amount", data=df_all)
# Anchor the legend's centre-left to the right-hand edge of the axes
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Overlay the grade boundary outlines and drop lines
boundaries.apply(overplot, axis=1);


# ## Interactive Charts
# 
# Note that if we want interactive charts, they're easy enough to create, eg using the `plotly` package:

# In[17]:


import plotly.express as px

# Scatter of the sampled marks coloured by the 0/1 misgrade flag, rendered
# by plotly rather than matplotlib
fig = px.scatter(x=df['OCAS'], y=df["OES"], color=df['misgrade'])
fig.show()


# In[ ]: