Notebook

iScatter Configurator¶

Routines for generating draft iScatter charts from a pandas dataframe.

The iScatter library is a d3.js powered Javascript library for generating interactive scatterplots that can be used to support simple visual exploratory statistical analysis of a small, grouped dataset.

iScatter parses datasets described according to Stevens' NOIR model (nominal, ordinal, interval and ratio levels of measurement). Scale definitions are then used as a basis for supporting different statistical operations. The statistics supported are as follows (items higher up the stack inherit statistics from lower down the stack).

        case "ratio":
        case "interval":
            i = i.concat(["mean", "range", "midrange", "stddev"]);
        case "ordinal":
            i = i.concat(["min", "lq", "median", "uq", "max"]);
        case "nominal":
            i = i.concat(["count", "mode"])

To get a feel for the operations supported by iScatter charts, the chart embeds a tour, accessed via the ? button on the chart as in the example found on the iScatter homepage.

This notebook provides a routine for automatically drafting the configuration files (the data schema, a chart configuration file, a CSV data file) and publishing a quick draft iScatter chart for a dataset provided as a pandas dataframe.

iScatter is copyright © 2013–2014 by The Open University, UK. It was implemented by Michel Wermelinger with contributions from Sam Leicester and Callum Lester, and is written in Javascript, using d3.js, jQuery, Guiders-JS and Glyphicons.

In [1]:

#Simple HTML webpage template used to publish a chart.
#The only customisation point is the {0} placeholder, filled in with str.format():
#it selects the chart configuration script, which must be named SLUG_config.js
#and must be located in the data/ folder (see the data/{0}_config.js script tag below).
#All other resources (doc/, lib/, src/ and the two CDN scripts) are referenced by fixed paths.
iscatterTemplate='''
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8"/>
    <title></title>

    <!-- Include the iScatter and the help tour style sheets. Modify them to suit your web site style. -->
    <link type="text/css" rel="stylesheet" href="doc/iscatter.css"/>
    <link type="text/css" rel="stylesheet" href="lib/guiders.css"/>

</head>
<body>

<svg id="iScatterChart" height="400" width=700></svg>


<!-- include all the generic Javascript code -->
<script type="text/javascript" src="http://code.jquery.com/jquery-1.10.2.min.js"></script>
<script type="text/javascript" src="http://cdnjs.cloudflare.com/ajax/libs/d3/3.3.10/d3.min.js"></script>
<script type="text/javascript" src="lib/guiders.js"></script>
<script type="text/javascript" src="src/iscatter.js"></script>

<!-- include the code to create and configure the chart --> 
<script type="text/javascript" src="data/{0}_config.js"></script>
<script type="text/javascript" src="doc/iscatter_base.js"></script>

</body>
</html>
'''
#Usage: iscatterTemplate.format(SLUG) returns the page for the chart configured by data/SLUG_config.js

In [2]:

#Minimal function to produce a dummy schema file based on the contents of a pandas dataframe

#Inspired by http://stackoverflow.com/a/25039627/454773
#If we have a hierarchical column definition, this could be brought in to the description?
#TO DO: improve the logic
#TO DO: either here or in main function, support a way of overriding estimated values eg of scale type.
import pandas as pd
def iscatter_dataProcess(dfi):
    '''Draft an iScatter schema from the column dtypes of a pandas dataframe.

    One schema row is produced per column of dfi. The column label is used
    unchanged as the row's id, name and description, and unit is left empty.
    Numeric columns are typed 'number' with level 'ratio'; every other
    column is typed 'string' with level 'nominal'. Only object-dtype columns
    are additionally reported as automatically detected nominals.

    Returns a tuple (schema, auto) where schema is a DataFrame with the
    columns id, name, description, unit, type, level and auto is a dict with
    the keys 'C_AUTO_NOMINALS' (list of column ids) and 'C_AUTO_INTS'
    (currently always an empty list).
    '''
    #Imported locally so the function does not rely on numpy having been
    #injected into the notebook namespace under the name np
    import numpy as np

    schemaCols=['id','name','description','unit','type','level']
    auto_nominals=[]
    auto_ints=[] #never populated; kept so the returned dict keeps its shape

    rows=[]
    for col,dtype in dfi.dtypes.to_dict().items():
        scalar_type=np.dtype(dtype).type
        row={'id':col,
             'name':col,
             'description':col,
             'unit':'',
             'type':'string',
             'level':'nominal'}
        if issubclass(scalar_type, np.number):
            row['type']='number'
            row['level']='ratio' #ratio is the default guess for a numeric column
        elif scalar_type == np.object_:
            #Strings are represented as object columns; offer them as subsets
            auto_nominals.append(col)
        rows.append(row)

    #Build the frame once rather than concatenating a one-row frame per column
    schema=pd.DataFrame(rows,columns=schemaCols)
    return schema,{'C_AUTO_NOMINALS':auto_nominals,'C_AUTO_INTS':auto_ints}

In [6]:

#This function generates a draft iScatter chart from a pandas dataframe
#The function estimates settings for the schema file from the dataframe
#TO DO: generate config and schema files from an extended datapackage.json file that includes schema information
#TO DO: generate a draft extended datapackage.json file from a pandas dataframe
def iScatterGen(tmp,STUB,name,desc,xinit,yinit,ainit='',colour='',xattr=None,yattr=None,xscale='',yscale='',yrange=''):
    '''Generate a draft iScatter chart from a pandas dataframe.

    Four files are written relative to the current directory:
      data/STUB.txt         the dataframe as CSV (without its index)
      data/STUB_schema.txt  the schema drafted by iscatter_dataProcess
      data/STUB_config.js   the chart configuration, as "CONFIG=<json>"
      STUB.html             the wrapper page built from iscatterTemplate

    The configuration is also printed.

    Parameters:
      tmp           -- dataframe holding the data to chart
      STUB          -- file name stem shared by all generated files
      name, desc    -- stored as C_NAME and C_DESC
      xinit, yinit  -- stored as C_XINIT and C_YINIT; also the keys of C_STATS
      ainit         -- stored as C_AINIT
      colour        -- stored as C_COLOURGROUP
      xattr, yattr  -- stored as C_XATTR and C_YATTR (default: empty lists)
      xscale, yscale, yrange -- stored as C_XSCALE, C_YSCALE and C_YRANGE

    Returns None.
    '''
    import json
    import os

    #None stands in for "no attributes" so that no list is shared between calls
    xattr=[] if xattr is None else xattr
    yattr=[] if yattr is None else yattr

    PATH='./'
    data_dir=PATH+'data/'
    data_url=data_dir+STUB+'.txt'
    schema_url=data_dir+STUB+'_schema.txt'

    #The data folder is created on demand instead of being assumed to exist
    os.makedirs(data_dir, exist_ok=True)

    #Save the data file, then read it back so the schema is estimated from
    #the dtypes pandas infers for the CSV that is actually published
    tmp.to_csv(data_url, index=False)
    dfp=pd.read_csv(data_url)

    #Draft and save the schema file
    schema,auto=iscatter_dataProcess(dfp)
    schema.to_csv(schema_url, index=False)

    #-- PLAN IS NOT TO NEED ANYTHING BELOW HERE
    standard_stats=['median', 'mean', 'stddev', 'range']

    config={
        'C_SCHEMA':schema_url,
        'C_DATA':data_url,
        'C_NAME':name,
        'C_DESC':desc,

        #Optional ID for chart block element
        'C_CHARTID':'', #default is 'iScatterChart'

        #To what extent can we automatically estimate these, or derive sensible defaults from schema?
        'C_XATTR':xattr,
        'C_YATTR':yattr,

        'C_XINIT':xinit,
        'C_YINIT':yinit,
        'C_AINIT':ainit,

        'C_XSCALE':xscale,
        'C_YSCALE':yscale,

        'C_YRANGE':yrange,

        #Can we automagically guess these or derive sensible defaults from schema?
        #Draft default: no statistics on the x variable, the standard set on the y variable.
        #If xinit and yinit name the same column, the y entry is the one kept.
        'C_STATS':{
                xinit:[],
                yinit:standard_stats
            },

        'C_REFERENCES':{},

        #Can we automagically guess these or derive sensible defaults from schema? For example, from nominals
        'C_SUBSETS':auto['C_AUTO_NOMINALS'],

        'C_COLOURGROUP':colour,

        #Can we automagically guess these from schema?
        'C_INTVARS':auto['C_AUTO_INTS']
    }
    #Serialise once; the same text is written to the config file and printed
    config_json=json.dumps(config, indent=4,sort_keys=True)

    #Save the config file
    with open(data_dir+STUB+"_config.js", "w", encoding="utf-8") as f:
        f.write('CONFIG='+config_json)

    #Save the html wrapper page (the template declares charset utf-8)
    with open(PATH+STUB+".html", "w", encoding="utf-8") as f:
        f.write(iscatterTemplate.format(STUB))

    print('CONFIG=',config_json)

Example¶

Simple example...

In [8]:

#Generate some dummy data
#numpy is imported here so the cell does not depend on the notebook namespace providing it
import numpy

incd=pd.DataFrame(
    numpy.random.randint(low=5000, high=10000, size=5).tolist() +
    numpy.random.randint(low=10000, high=20000, size=4).tolist() +
    numpy.random.randint(low=20000, high=40000, size=3).tolist() +
    numpy.random.randint(low=40000, high=80000, size=2).tolist() +
    numpy.random.randint(low=80000, high=160000, size=1).tolist(),columns=['income'])
#DataFrame.sort() has been removed from pandas; sort_values() is its replacement.
#The first reset_index drops the pre-sort index, the second turns the new
#0..n-1 index into the 'index' column that is used as the x variable below.
incd=incd.sort_values('income').reset_index(drop=True).reset_index()


name='Dummy pop'
desc='Dummy data'
xinit='index'
yinit='income'
ainit=''
xattr=[]#['income']
yattr=[]#['index']
colour=''
STUB='demo'

#Note: I extended the minified version of the iScatter js to accept axis range setting
iScatterGen(incd,STUB,name,desc,xinit,yinit,ainit,xattr=xattr,yattr=yattr,yrange=[0,175000])

CONFIG= {
    "C_AINIT": "",
    "C_CHARTID": "",
    "C_COLOURGROUP": "",
    "C_DATA": "./data/demo.txt",
    "C_DESC": "Dummy data",
    "C_INTVARS": [],
    "C_NAME": "Dummy pop",
    "C_REFERENCES": {},
    "C_SCHEMA": "./data/demo_schema.txt",
    "C_STATS": {
        "income": [
            "median",
            "mean",
            "stddev",
            "range"
        ],
        "index": []
    },
    "C_SUBSETS": [],
    "C_XATTR": [],
    "C_XINIT": "index",
    "C_XSCALE": "",
    "C_YATTR": [],
    "C_YINIT": "income",
    "C_YRANGE": [
        0,
        175000
    ],
    "C_YSCALE": "log"
}

In [18]:

#Dummy data spanning several orders of magnitude, charted on a log y scale
#numpy is imported here so the cell does not depend on the notebook namespace providing it
import numpy

incd=pd.DataFrame(
    numpy.random.randint(low=5000, high=10000, size=5).tolist() +
    numpy.random.randint(low=100000, high=500000, size=5).tolist() +
    numpy.random.randint(low=1000000, high=5000000, size=5).tolist() +
    numpy.random.randint(low=10000000, high=50000000, size=5).tolist(),columns=['income'])
#DataFrame.sort() has been removed from pandas; sort_values() is its replacement
incd=incd.sort_values('income').reset_index(drop=True).reset_index()

#name, desc, xinit, yinit, ainit, xattr and yattr are reused from the previous example cell
yscale='log'
STUB='demo2'
iScatterGen(incd,STUB,name,desc,xinit,yinit,ainit,xattr=xattr,yattr=yattr,yscale=yscale)

CONFIG= {
    "C_AINIT": "",
    "C_CHARTID": "",
    "C_COLOURGROUP": "",
    "C_DATA": "./data/demo2.txt",
    "C_DESC": "Dummy data",
    "C_INTVARS": [],
    "C_NAME": "Dummy pop",
    "C_REFERENCES": {},
    "C_SCHEMA": "./data/demo2_schema.txt",
    "C_STATS": {
        "income": [
            "median",
            "mean",
            "stddev",
            "range"
        ],
        "index": []
    },
    "C_SUBSETS": [],
    "C_XATTR": [],
    "C_XINIT": "index",
    "C_XSCALE": "",
    "C_YATTR": [],
    "C_YINIT": "income",
    "C_YRANGE": "",
    "C_YSCALE": "log"
}

In [ ]: