Routines for generating draft iScatter charts from a pandas dataframe.
The iScatter library is a d3.js powered Javascript library for generating interactive scatterplots that can be used to support simple visual exploratory statistical analysis of a small, grouped dataset.
iScatter parses datasets described according to Stevens' NOIR model (nominal, ordinal, interval, ratio levels of measurement). Scale definitions are then used as a basis for supporting different statistical operations. The statistics supported are as follows (items higher up the stack inherit statistics from lower down the stack).
case "ratio":
case "interval":
i = i.concat(["mean", "range", "midrange", "stddev"]);
case "ordinal":
i = i.concat(["min", "lq", "median", "uq", "max"]);
case "nominal":
i = i.concat(["count", "mode"])
To get a feel for the operations supported by iScatter charts, the chart embeds a tour, accessed via the ? button on the chart, as in the example found on the iScatter homepage.
This notebook provides a routine for automatically drafting configuration and files (the data schema, a chart configuration file, a CSV data file) and publishing a quick draft iScatter chart for a dataset provided as a pandas dataframe.
iScatter is copyright © 2013–2014 by The Open University, UK. It was implemented by Michel Wermelinger with contributions from Sam Leicester and Callum Lester, and is written in Javascript, using d3.js, jQuery, Guiders-JS and Glyphicons.
#Simple HTML page template used to publish a chart.
#The template is filled in with str.format(): its single placeholder, {0}, receives the
#SLUG of the chart, so the page loads its chart configuration from data/SLUG_config.js
#(i.e. the configuration file must be named SLUG_config.js and live in the data/ folder).
#NOTE: because str.format() is used, any literal braces added to the markup must be doubled.
#The page references its style sheets and scripts in doc/, lib/ and src/ by relative path.
iscatterTemplate='''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title></title>
<!-- Include the iScatter and the help tour style sheets. Modify them to suit your web site style. -->
<link type="text/css" rel="stylesheet" href="doc/iscatter.css"/>
<link type="text/css" rel="stylesheet" href="lib/guiders.css"/>
</head>
<body>
<svg id="iScatterChart" height="400" width=700></svg>
<!-- include all the generic Javascript code -->
<script type="text/javascript" src="http://code.jquery.com/jquery-1.10.2.min.js"></script>
<script type="text/javascript" src="http://cdnjs.cloudflare.com/ajax/libs/d3/3.3.10/d3.min.js"></script>
<script type="text/javascript" src="lib/guiders.js"></script>
<script type="text/javascript" src="src/iscatter.js"></script>
<!-- include the code to create and configure the chart -->
<script type="text/javascript" src="data/{0}_config.js"></script>
<script type="text/javascript" src="doc/iscatter_base.js"></script>
</body>
</html>
'''
#Usage: iscatterTemplate.format(SLUG) returns the finished page markup as a string
#Minimal function to produce a dummy schema file based on the contents of a pandas dataframe
#Inspired by http://stackoverflow.com/a/25039627/454773
#If we have a hierarchical column definition, this could be brought in to the description?
#TO DO: improve the logic
#TO DO: either here or in main function, support a way of overriding estimated values eg of scale type.
import json
import os

import numpy as np
import pandas as pd
def iscatter_dataProcess(dfi):
    """Draft an iScatter schema table from the column dtypes of a dataframe.

    One schema row is produced per column of ``dfi``. The column label is used
    as the ``id``, ``name`` and ``description``; ``unit`` is left blank.
    The ``type`` and ``level`` are estimated from the column dtype:

    - numeric columns (booleans excluded) -> type ``number``, level ``ratio``
    - object / string columns -> type ``string``, level ``nominal``; these
      columns are also reported in ``C_AUTO_NOMINALS``
    - anything else (e.g. booleans, datetimes) keeps the default
      ``string`` / ``nominal`` description but is not reported as a nominal

    Parameters:
        dfi: the pandas dataframe to describe.

    Returns:
        A ``(schema, auto)`` tuple. ``schema`` is a dataframe with the columns
        ``id, name, description, unit, type, level``. ``auto`` is a dict with
        the keys ``C_AUTO_NOMINALS`` (list of column labels estimated to be
        nominal) and ``C_AUTO_INTS`` (currently always an empty list).
    """
    schemaCols = ['id', 'name', 'description', 'unit', 'type', 'level']
    # pandas' own dtype predicates are used rather than np.dtype(), which
    # cannot interpret pandas extension dtypes (e.g. the dedicated string dtype)
    dtypes = pd.api.types

    C_AUTO_NOMINALS = []
    C_AUTO_INTS = []  # placeholder: no columns are detected as integer variables yet

    rows = []
    for col, dtype in dfi.dtypes.items():
        row = {
            'id': col,
            'name': col,
            'description': col,
            'unit': '',
            # Default description, kept for dtypes that match neither test below
            'type': 'string',
            'level': 'nominal',
        }
        # is_numeric_dtype() accepts booleans; they are not treated as numbers here
        if dtypes.is_numeric_dtype(dtype) and not dtypes.is_bool_dtype(dtype):
            row['type'] = 'number'
            row['level'] = 'ratio'  # ratio is the default guess for anything numeric
        elif dtypes.is_object_dtype(dtype) or dtypes.is_string_dtype(dtype):
            # Strings are typically held in object (or dedicated string) columns
            C_AUTO_NOMINALS.append(col)
        rows.append(row)

    # Build the table in one go rather than concatenating row by row
    schema = pd.DataFrame(rows, columns=schemaCols)
    return schema, {'C_AUTO_NOMINALS': C_AUTO_NOMINALS, 'C_AUTO_INTS': C_AUTO_INTS}
#This function generates a draft iScatter chart from a pandas dataframe
#The function estimates settings for the schema file from the dataframe
#TO DO: generate config and schema files from an extended datapackage.json file that includes schema information
#TO DO: generate a draft extended datapackage.json file from a pandas dataframe
def iScatterGen(tmp,STUB,name,desc,xinit,yinit,ainit='',colour='',xattr=None,yattr=None,xscale='',yscale='',yrange=''):
    """Generate a draft iScatter chart from a pandas dataframe.

    Four files are written relative to the current working directory
    (the ``data/`` folder is created if it does not already exist):

    - ``data/STUB.txt``        the data, as CSV (without the dataframe index)
    - ``data/STUB_schema.txt`` the schema estimated by iscatter_dataProcess()
    - ``data/STUB_config.js``  the chart configuration, as ``CONFIG=<json>``
    - ``STUB.html``            the wrapper page built from iscatterTemplate

    The configuration is also printed.

    Parameters:
        tmp: the pandas dataframe to chart. Only its columns are saved, so any
            index that should appear in the chart must be a column already.
        STUB: filename stem shared by all the generated files.
        name, desc: stored as C_NAME and C_DESC.
        xinit, yinit: stored as C_XINIT and C_YINIT, and used as the keys of
            C_STATS (yinit gets the standard statistics, xinit gets none).
        ainit: stored as C_AINIT.
        colour: stored as C_COLOURGROUP.
        xattr, yattr: stored as C_XATTR and C_YATTR; None means an empty list.
        xscale, yscale, yrange: stored as C_XSCALE, C_YSCALE and C_YRANGE.

    Returns:
        None.
    """
    PATH = './'
    dataDir = PATH + 'data/'
    C_DATA_URL = dataDir + STUB + '.txt'
    C_SCHEMA_URL = dataDir + STUB + '_schema.txt'

    # The output folder is not guaranteed to exist on a first run
    os.makedirs(dataDir, exist_ok=True)

    # Save the data file, then read it back so that the schema is estimated
    # from the dtypes the CSV parser infers rather than from the in-memory ones
    tmp.to_csv(C_DATA_URL, index=False)
    dfp = pd.read_csv(C_DATA_URL)

    # Save the schema file
    sc, ac = iscatter_dataProcess(dfp)
    sc.to_csv(C_SCHEMA_URL, index=False)

    C_STANDARD_STATS = ['median', 'mean', 'stddev', 'range']

    config = {
        'C_SCHEMA': C_SCHEMA_URL,
        'C_DATA': C_DATA_URL,
        'C_NAME': name,
        'C_DESC': desc,
        # Optional ID for chart block element
        'C_CHARTID': '',  # default is 'iScatterChart'
        # None stands in for "no attributes" to avoid shared mutable defaults
        'C_XATTR': [] if xattr is None else xattr,
        'C_YATTR': [] if yattr is None else yattr,
        'C_XINIT': xinit,
        'C_YINIT': yinit,
        'C_AINIT': ainit,
        'C_XSCALE': xscale,
        'C_YSCALE': yscale,
        'C_YRANGE': yrange,
        # No statistics for the initial x attribute, the standard set for the
        # initial y attribute; if both name the same column the y entry wins
        'C_STATS': {
            xinit: [],
            yinit: C_STANDARD_STATS,
        },
        'C_REFERENCES': {},
        # Estimated from the schema
        'C_SUBSETS': ac['C_AUTO_NOMINALS'],
        'C_COLOURGROUP': colour,
        'C_INTVARS': ac['C_AUTO_INTS'],
    }
    configJSON = json.dumps(config, indent=4, sort_keys=True)

    # Save the config file
    with open(dataDir + STUB + "_config.js", "w") as f:
        f.write('CONFIG=' + configJSON)

    # Save the html wrapper page
    with open(PATH + STUB + ".html", "w") as f:
        f.write(iscatterTemplate.format(STUB))

    print('CONFIG=', configJSON)
Simple example...
#Generate some dummy data: 15 random incomes drawn from five widening bands
#(randint excludes the upper bound of each band)
incd=pd.DataFrame(
    np.random.randint(low=5000, high=10000, size=5).tolist() +
    np.random.randint(low=10000, high=20000, size=4).tolist() +
    np.random.randint(low=20000, high=40000, size=3).tolist() +
    np.random.randint(low=40000, high=80000, size=2).tolist() +
    np.random.randint(low=80000, high=160000, size=1).tolist(),columns=['income'])
#Sort by income, renumber the rows, then expose that rank as an 'index' column for the x-axis
#(DataFrame.sort() has been removed from pandas; sort_values() is its replacement)
incd=incd.sort_values('income').reset_index(drop=True).reset_index()

name='Dummy pop'
desc='Dummy data'
xinit='index'
yinit='income'
ainit=''
xattr=[]#['income']
yattr=[]#['index']
colour=''
STUB='demo'
#Note: I extended the minified version of the iScatter js to accept axis range setting
iScatterGen(incd,STUB,name,desc,xinit,yinit,ainit,xattr=xattr,yattr=yattr,yrange=[0,175000])
CONFIG= { "C_AINIT": "", "C_CHARTID": "", "C_COLOURGROUP": "", "C_DATA": "./data/demo.txt", "C_DESC": "Dummy data", "C_INTVARS": [], "C_NAME": "Dummy pop", "C_REFERENCES": {}, "C_SCHEMA": "./data/demo_schema.txt", "C_STATS": { "income": [ "median", "mean", "stddev", "range" ], "index": [] }, "C_SUBSETS": [], "C_XATTR": [], "C_XINIT": "index", "C_XSCALE": "", "C_YATTR": [], "C_YINIT": "income", "C_YRANGE": [ 0, 175000 ], "C_YSCALE": "log" }
#Second example: 20 random incomes spanning several orders of magnitude, charted on a log y-axis
#(randint excludes the upper bound of each band)
incd=pd.DataFrame(
    np.random.randint(low=5000, high=10000, size=5).tolist() +
    np.random.randint(low=100000, high=500000, size=5).tolist() +
    np.random.randint(low=1000000, high=5000000, size=5).tolist() +
    np.random.randint(low=10000000, high=50000000, size=5).tolist(),columns=['income'])
#Sort by income, renumber the rows, then expose that rank as an 'index' column for the x-axis
#(DataFrame.sort() has been removed from pandas; sort_values() is its replacement)
incd=incd.sort_values('income').reset_index(drop=True).reset_index()

yscale='log'
STUB='demo2'
#The remaining settings (name, desc, xinit, yinit, ainit, xattr, yattr) are reused from the first example
iScatterGen(incd,STUB,name,desc,xinit,yinit,ainit,xattr=xattr,yattr=yattr,yscale=yscale)
CONFIG= { "C_AINIT": "", "C_CHARTID": "", "C_COLOURGROUP": "", "C_DATA": "./data/demo2.txt", "C_DESC": "Dummy data", "C_INTVARS": [], "C_NAME": "Dummy pop", "C_REFERENCES": {}, "C_SCHEMA": "./data/demo2_schema.txt", "C_STATS": { "income": [ "median", "mean", "stddev", "range" ], "index": [] }, "C_SUBSETS": [], "C_XATTR": [], "C_XINIT": "index", "C_XSCALE": "", "C_YATTR": [], "C_YINIT": "income", "C_YRANGE": "", "C_YSCALE": "log" }