from __future__ import division

from functools import partial

import pandas as pd

# this enables the 'toggle input' button
from display import extended_styles
extended_styles(css=True)

# read in the data file that has "GOD" and "WORDSUM" variables
# data was gathered from: http://sda.berkeley.edu/sdaweb/analysis/?dataset=gss12
# DATA IS UNWEIGHTED
# na_values defines values of GOD and WORDSUM that are invalid codes
# invalid codes are defined as 0,8,9 for GOD (see GSS codebook)
#    note: I did find some 10's for GOD, and treated them as invalid/missing
# invalid codes are defined as -1,98,99 for WORDSUM

fname = "sub-data-by-year.txt"
# the codes listed in na_values are read in as missing (NaN) for that column
data = pd.read_csv(fname,na_values = {'GOD':[0,8,9,10],'WORDSUM':[-1,98,99]})

# create a data set that has only valid cases (rows with any missing value
# are dropped); take an explicit copy so that adding the IQ column below
# writes to an independent frame instead of a possible view of `data`
# (avoids pandas' SettingWithCopyWarning)
actData = data.dropna().copy()

# make the IQ variable
# (from http://aeolipera.wordpress.com/2013/02/13/conversion-table-from-wordsum-to-iq/)
# linear rescaling of WORDSUM; 5.965 and 2.148 are presumably the WORDSUM
# mean and standard deviation from the linked table -- TODO confirm
actData["IQ"] = 15*((actData.WORDSUM - 5.965)/2.148) + 100

# plot normalized histograms of WORDSUM for each GOD answer (1-6), one
# subplot per answer in a 3x2 grid
# Histograms for IQ will look the same, since the transformation from WORDSUM to IQ is linear
# Also, the discrete WORDSUM values will result in discrete IQ values, with the same bin counts
# This is over all years in the dataset
# NOTE(review): `normed` is kept to match the matplotlib release this script
# targets; newer releases spell it `density` -- confirm against the
# installed version
fig = pylab.figure(figsize=(11,8))
for i in range(0,6):
    ax = fig.add_subplot(3,2,i+1)
    d = actData.loc[actData.GOD == i+1,"WORDSUM"]
    # The x axis below is laid out for the integer scores 0..10, i.e. 11
    # distinct values, so 12 bin edges (0..11) are needed to give each score
    # its own bin; with only edges 0..10 the last bin is closed on both sides
    # and lumps scores 9 and 10 together.  align="left" centres each bar on
    # its left edge, so the bar for score k is drawn at x = k.
    ax.hist(d,bins=range(0,12),normed=True,align="left")
    ax.set_ylim(0,.30)
    ax.set_xlim(-0.5,10.5)
    ax.set_xticks(range(0,11))
    ax.set_title("GOD = %d"%(i+1))
    ax.text(0,.225,"Number of data points = %d"%len(d))
    # only the left-hand column (i = 0, 2, 4) carries y tick labels
    if i not in [0,2,4]:
        ax.set_yticklabels([])
    else:
        ax.set_ylabel("Frequency")
    # only the bottom row (i = 4, 5) carries x tick labels
    if i not in [4,5]:
        ax.set_xticklabels([])
    else:
        ax.set_xlabel("WORDSUM")
fig.tight_layout()

# build, for each GOD answer 1..6, the percentage of respondents at each
# IQ value (all survey years pooled); iqVals[k] holds the series for GOD = k+1
iqVals = []
print("Total Number of Data Points: %d" % len(actData))
for god in range(1, 7):
    counts = actData.loc[actData.GOD == god, "IQ"].value_counts()
    print("Number of data points for God = %d : %d" % (god, sum(counts)))
    # counts -> fraction of the group -> percentage
    pct = counts / counts.sum() * 100
    # ordered by IQ value so the curves can be drawn left to right
    iqVals.append(pct.sort_index())

# overlay the IQ percentage curves of all six GOD answers on one set of axes
fig = pylab.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1)
for god, pct in enumerate(iqVals[:6], 1):
    ax.plot(pct.index, pct, label="GOD = %d" % god)
ax.set_xlabel("IQ")
ax.set_ylabel("Percentage")
ax.set_ylim(0, 30)
ax.legend(loc="best")
pylab.title("Comparison of IQ Values\nGSS 1972-2012")

# compare only the first and last GOD answers (1 and 6), with point markers
fig = pylab.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1)
for label, pct in (("GOD = 1", iqVals[0]), ("GOD = 6", iqVals[-1])):
    ax.plot(pct.index, pct, marker='o', label=label)
ax.set_xlabel("IQ")
ax.set_ylabel("Percentage")
ax.set_ylim(0, 30)
ax.legend(loc="best")
pylab.title("Comparison of IQ Values\nGSS 1972-2012")

# repeat, but restrict to 2012
# This data will have very few data points, so it's not very reliable
is2012 = actData.YEAR == 2012
print("Total Number of Data Points: %d" % sum(actData.loc[is2012, "IQ"].value_counts()))
# get IQ data for plotting; iqValsByYear[k] holds the series for GOD = k+1
iqValsByYear = [0]*6
for i in range(1,7):
    counts = actData.loc[(actData.GOD == i) & is2012, "IQ"].value_counts()
    print("Number of data points for God = %d : %d" % (i, sum(counts)))
    # convert to frequency, then percentage, using the 2012 counts themselves
    # (the all-years iqVals must not be used here, or the 2012 plots would
    # just repeat the all-years curves)
    pct = counts / counts.sum() * 100
    # sort for easy plotting
    iqValsByYear[i-1] = pct.sort_index()

# overlay the six curves stored in iqValsByYear on one set of axes
fig = pylab.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1)
for god, pct in enumerate(iqValsByYear[:6], 1):
    ax.plot(pct.index, pct, label="GOD = %d" % god)
ax.set_xlabel("IQ")
ax.set_ylabel("Percentage")
ax.set_ylim(0, 30)
ax.legend(loc="best")
pylab.title("Comparison of IQ Values\nGSS 2012")

# compare only the first and last entries of iqValsByYear (GOD = 1 and 6),
# with point markers
fig = pylab.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1)
for label, pct in (("GOD = 1", iqValsByYear[0]), ("GOD = 6", iqValsByYear[-1])):
    ax.plot(pct.index, pct, marker='o', label=label)
ax.set_xlabel("IQ")
ax.set_ylabel("Percentage")
ax.set_ylim(0, 30)
ax.legend(loc="best")
pylab.title("Comparison of IQ Values\nGSS 2012")

# this enables the nice CSS styling
from IPython.core.display import HTML
def css_styling():
    """Read the custom stylesheet and return it wrapped for display.

    Reads ``styles/custom.css`` (resolved relative to the current working
    directory) and returns its contents wrapped in an IPython ``HTML``
    object.
    """
    # context manager so the file handle is closed as soon as it is read
    with open("styles/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()