#!/usr/bin/env python
# coding: utf-8
# ## Pecan Cookies
#
# ### Conditions for making maps based on the content in a column
#
# 1. **Category** -- any type of geometry
#
# + Is data type boolean (`true`, `false`, `null`) or string?
# + If no, it's not a **category** map
# + If yes, is x% of the data non-`null`?
# + If yes, is x% of the data in the first 10 columns?
# + Column name content scoring
# + Histogram analysis (distribution shape)
# + Check later if needed: Sparse versus dense (points per pixels threshold)
#
# 2. **Torque** -- point type geometry
#
# + If there's a date type column, make a basic torque map
# + If there's a date type column and a category with a high `w`, make a torque category map
# + If there's a number column where the numbers are increasing monotonically and there is no date column, make a torque map off of this column
#
# 3. **Choropleth** (any type of geometry) or **Bubble** (point type)
#
# + If number of distinct values is greater than three, then make a choropleth
# + If distribution is an L or J, use a gradual color ramp and head/tails quantification
# + If distribution is an A or U, use a divergent color ramp and jenks quantification
# + If data is an F, use a gradual color ramp and equal interval or quantile quantification
#
# ![in a diagram!](http://i.imgur.com/lSOJGLk.png)
#
# #### Methods
# 1. Look at the data as a histogram in seven bins of equal interval
# 2. Classify distributions according to [Galtung's Classification](http://en.wikipedia.org/wiki/Multimodal_distribution#Galtung.27s_classification)
#
# #### Ideas
#
# 1. Based on distribution in equal interval, guess the best quantification method
# 2. Generate different distributions (binormal, Poisson, gamma, pareto, powerlaw)
# 3. Classify distributions of data into:
# 1. Unimodal (A)
# 2. Decay left (J)
# 3. Bimodal (U)
# 4. Multimodal (S)
# 5. Decay right (L)
# 6. Flat (F)
# 4. Explore how kurtosis affects the quantification methods -- is this taken care of by looking at thresholds as small as 5%?
# 5. Machine learning classification: classify columns by hand as suitable or unsuitable for visualization and add this as an additional column in the metrics data set. Use this as a training set to build up a decision model for test sets...
# 6. Normalization: looking at the columns, find one that is likely a population column, despite variations in spelling
# 7. Create polygon choropleths that are normalized against: the area of the polygon or some other parameters associated with that row (such as autodetecting a population column?)
# + What units? $m^2$, $km^2$?
# + How do you know when it is ok to normalize and not normalize? [You don't want to reflect a population map if your intention is highlighting another variable](http://en.wikipedia.org/wiki/Choropleth_map).
# - Case 1: Census District Data -- most columns should be normalized by population, but normalizing by area may not make sense
# - Case 2: Map Pluto -- it doesn't make sense to normalize any of it by the area of the building footprint
# In[25]:
# Notebook magic: %pylab inline pulls numpy and matplotlib.pyplot names
# (np, plt, figure, hist, ...) into the global namespace.
get_ipython().run_line_magic('pylab', 'inline')
import pandas as pd
from urllib.request import urlopen
from scipy.stats import describe as desc
from pandas import DataFrame, Series
import json
debug = False  # set True to print request URLs before they are fetched
def getRowInfo(acct, tableName):
    """Build a CartoDB SQL API URL that returns only the column metadata
    (a LIMIT 0 query, JSON format) for `tableName` in account `acct`."""
    template = ('http://{account}.cartodb.com/api/v2/sql'
                '?q=SELECT%20*%20FROM%20{table}%20LIMIT%200&format=json')
    return template.format(account=acct, table=tableName)
def mkUrl(acct, tableName, cols='*', limit='', fileType='csv'):
    """
    Build a CartoDB SQL API request URL.

    acct: cartodb account name
    tableName: name of table in account
    cols: columns of data to pull (optional, default: * (all columns))
    limit: number of rows requested, e.g. 'LIMIT%200' (optional, default: all rows)
    fileType: format of returned data (optional, default: csv)
    """
    endpoint = 'http://{0}.cartodb.com/api/v2/sql?q='.format(acct)
    sql = 'SELECT%20{0}%20FROM%20{1}%20{2}'.format(cols, tableName, limit)
    suffix = '&format={0}&filename={1}'.format(fileType, tableName)
    return endpoint + sql + suffix
def normalizeCols(cols):
    """Rewrite a comma-separated column list so every column is divided
    by `a.area` (for use with a WITH-clause that computes polygon area).

    'mag,depth' -> 'mag/a.area,depth/a.area'
    """
    return '/a.area,'.join(cols.split(',')) + '/a.area'
def mkGeoUrl(acct, tableName):
    """URL asking for the geometry type of the first non-null geometry
    in `tableName` (JSON response with one `geometry_type` row)."""
    query = ('SELECT%20ST_GeometryType(the_geom)%20geometry_type%20FROM%20'
             + tableName
             + '%20WHERE%20the_geom%20IS%20NOT%20NULL%20LIMIT%201')
    return 'http://' + acct + '.cartodb.com/api/v2/sql?q=' + query + '&format=json'
def simplifyType(g):
    """Collapse a PostGIS geometry type name (e.g. 'ST_MultiPolygon',
    case-insensitive) to one of 'polygon', 'line', or 'point'.

    Raises KeyError for unrecognized type names.
    """
    lookup = {}
    for simple, variants in (('polygon', ('st_polygon', 'st_multipolygon')),
                             ('line', ('st_linestring', 'st_multilinestring')),
                             ('point', ('st_point', 'st_multipoint'))):
        for variant in variants:
            lookup[variant] = simple
    return lookup[g.lower()]
# In[26]:
# Example request (evaluated for display in the notebook)
mkUrl('eschbacher','all_month',cols='mag,depth,place',fileType='json')
# In[27]:
# Hand-written expected URLs used as smoke tests for the helpers above
testUrl1 = 'http://eschbacher.cartodb.com/api/v2/sql?q=SELECT%20mag,depth,place%20FROM%20all_month%20&format=json&filename=all_month'
testUrl2 = 'http://eschbacher.cartodb.com/api/v2/sql?q=WITH%20a%20AS%20(SELECT%201e6*ST_Area(the_geom::geography)%20area%20FROM%20all_month)%20SELECT%20mag/a.area,depth/a.area,place/a.area%20FROM%20all_month,a%20&format=csv&filename=all_month'
assert mkUrl('eschbacher','all_month',cols='mag,depth,place',fileType='json') == testUrl1
assert normalizeCols('mag,depth,place') == 'mag/a.area,depth/a.area,place/a.area'
# ## Category Maps -- Map Pluto Manhattan
# ### Grab all strings and bools
# In[28]:
acctName = 'common-data'
tableName = 'mnmappluto'
fileName = 'mnbuildings'
# Grab column information (LIMIT 0 query returns schema only)
url = getRowInfo(acctName, tableName)
if debug:
    print(url)
# BUG FIX: the response object from urlopen() has no readall() method in
# Python 3 -- read() returns the whole body.
response = urlopen(url).read().decode('utf-8')
column_data = json.loads(response)
# Keep only the string and boolean columns: candidates for category maps
str_columns = []
for c in column_data['fields']:
    if column_data['fields'][c]['type'] in ('string', 'boolean'):
        str_columns.append(c)
print("Columns (" + str(len(str_columns)) + ") of type 'string' or 'boolean':\n", str_columns)
# In[29]:
# Create API request for only the candidate (string/boolean) columns
a = mkUrl(acctName,tableName,cols=','.join(str_columns))
print(a)
# In[37]:
# Retrieve only approved columns (downloads the table as CSV)
ad = pd.read_csv(a)
ad.head()
# In[38]:
# Per-column metrics: distinct-value count and fraction of nulls, indexed
# by column name so they line up with later boolean filters.
metrics = DataFrame({'uniques': [len(ad[c].unique()) for c in ad],
                     'perc_nulls': [ad[c].isnull().sum() / len(ad) for c in ad]
                     },
                    index=ad.columns)
# BUG FIX: DataFrame.sort() was removed in pandas 0.20 -- use sort_values()
metrics.sort_values('uniques')
# ### Making the cut
#
# Making the cut means:
# 1. The number of `nan`/`null`s is smaller than 80%
# 2. Number of unique entries is greater than one and less than eleven
# In[43]:
# Columns passing both filters become candidate category maps
decision = (metrics['uniques'] >= 2) & (metrics['uniques'] <= 10) & (metrics['perc_nulls'] < 0.8)
winners = list(metrics[decision].index)
# BUG FIX: DataFrame.sort() was removed in pandas 0.20 -- use sort_values()
metrics[decision].sort_values('uniques')
# In[44]:
print(sorted(winners))
# These Maps are here:
#
#
# ## Who shouldn't have made it?
#
# 1. `plutomapid`: 2 uniques and no nulls.
#
# **Solution:** We should next filter out data that is very strongly in one column. 95% threshold?
#
# ## Who should've made it?
#
# 1. `lottype`: failed because of 11 uniques -- visually interesting and the rest of the data can be accessed through infowindows
#
# **Solution:** See if a large proportion is in the first seven modes instead of too strongly weighting the number of unique entries. Maybe 85-90% in the first seven modes?
# ### How many are in each column?
# In[69]:
ad_win = ad[winners]
# For every column, report what share (in %) of rows falls into its
# (up to) 10 most frequent values, NaN included.
# NOTE(review): this loop iterates over `ad` (all candidate columns), not
# the `ad_win` frame created just above -- confirm that inspecting every
# column is intentional.
maxval = 0
for c in ad:
    temp = 100.0 * ad[c].value_counts(dropna=False) / len(ad[c])
    maxval = min(len(temp), 10)  # look at no more than the top 10 values
    sval = sum(temp.head(maxval))
    print(c, sval, 'in', maxval, 'bins')
    print(temp)
    # print(100.0 * temp / len(ad[c]))
# In[64]:
# Peek at the top-10 value shares of the last column processed above
temp.head(10)
# In[48]:
## Number patterns: what fraction of each winning column's rows lies in
## its first seven modes?
# BUG FIX: the original line read `for c in ad` -- a syntax error (missing
# colon), and `ad` contains columns that are absent from `ad_win`, which
# the body indexes. Iterate the winners frame instead.
for c in ad_win:
    sumvals = 0
    for i, r in enumerate(ad_win[c].value_counts(dropna=False), start=1):
        print('i', i)
        print('sum', sumvals)
        if i > 7:
            break
        print(r)
        sumvals = sumvals + r
    print(sumvals / len(ad_win))
# ### What to do if distribution is _very_ lopsided to one category?
#
# **Options**
# 1. If a large percentage (95%?) is in one category, drop this column
# 2.
# ## Choropleth Maps -- 2010 Census Data
# In[17]:
acctName = 'common-data'
tableName = 'tract_2010census_dp1' # 'all_day'
fileName = 'mnbuildings'
# Grab column metadata (LIMIT 0 -> schema only)
url = mkUrl(acctName,tableName,limit='LIMIT%200',fileType='json')
# BUG FIX: the response object from urlopen() has no readall() method in
# Python 3 -- read() returns the whole body.
response = urlopen(url).read().decode('utf-8')
column_data = json.loads(response)
# Keep the numeric columns, excluding coordinate/bookkeeping ones
num_columns = []
banned = ['longitude','lat','latitude','long','lon','cartodb_id']
for c in column_data['fields']:
    # `and` rather than bitwise `&`: both operands are plain bools
    if column_data['fields'][c]['type'] == 'number' and c not in banned:
        num_columns.append(c)
print("Columns (" + str(len(num_columns)) + ") of type 'number':\n", num_columns)
# In[19]:
# Create API request for only the numeric columns
num_url = mkUrl(acctName,tableName,cols=','.join(num_columns))
print(num_url)
# In[20]:
# Retrieve only approved columns (downloads the table as CSV)
choro_df = pd.read_csv(num_url)
choro_df.head()
# ## Build up some metrics on each column
#
# Notes about metrics:
#
# * **kurtosis**: a measure of the 'peakedness' of a distribution. A large value corresponds to a sharper peak, a value of zero is the normal distribution, and a negative value is flatter than a normal distribution
# * **skewness**: a measure of the asymmetry of a distribution. A positive value means a longer right tail (mass concentrated at small values), while a negative value means a longer left tail. For instance, `blddepth` peaks at small values and gets smaller for larger values on its histogram. It has a strong positive skewness.
# In[21]:
# Descriptive statistics for every numeric column, indexed by column name.
num_metrics = DataFrame({'uniques': [len(choro_df[c].unique()) for c in choro_df],
                         'skew': [choro_df[x].skew() for x in choro_df],
                         'kurtosis': [choro_df[x].kurtosis() for x in choro_df],
                         'mean': [choro_df[x].mean() for x in choro_df],
                         'median': [choro_df[x].median() for x in choro_df],
                         'stddev': [choro_df[x].std() for x in choro_df]
                         },
                        index=choro_df.columns)
l = len(choro_df)
cols = []
vals = []
perc_nulls = []
## Does 98% of the data lie in a single histogram bin (of 7 equal-interval bins)?
for c in choro_df:
    cols.append(c)
    perc_nulls.append(choro_df[c].isnull().sum())
    v, b = np.histogram(choro_df[c], bins=7)
    # True when any one bin holds more than 98% of the rows
    vals.append(bool((v / l > 0.98).any()))
## Are there outliers?
## Modality classes: L,J,A,U,F, etc. from:
# http://en.wikipedia.org/wiki/Multimodal_distribution#Galtung.27s_classification
num_metrics['num_98'] = Series(vals, index=cols)
num_metrics['perc_nulls'] = Series(perc_nulls, index=cols)
num_metrics['skew_abs'] = abs(num_metrics['skew'])
# BUG FIX: DataFrame.sort() was removed in pandas 0.20 -- use sort_values()
num_metrics.sort_values('uniques')
# In[23]:
# Manual labels ('y' = the column makes a good choropleth, 'n' = it does
# not), collected once by running the interactive loop below and then
# hard-coded so the notebook is reproducible without prompting.
# answers = {}
# for c in cols:
# msg = "do you like " + c + "'s map? "
# ans = input(msg)
# print('you answered',ans)
# answers[c] = ans
answers = {'aland10': 'y',
'awater10': 'y',
'dp0010001': 'y',
'dp0010002': 'y',
'dp0010003': 'y',
'dp0010004': 'n',
'dp0010005': 'y',
'dp0010006': 'y',
'dp0010007': 'y',
'dp0010008': 'y',
'dp0010009': 'y',
'dp0010010': 'y',
'dp0010011': 'y',
'dp0010012': 'y',
'dp0010013': 'y',
'dp0010014': 'y',
'dp0010015': 'y',
'dp0010016': 'y',
'dp0010017': 'y',
'dp0010018': 'y',
'dp0010019': 'n',
'dp0010020': 'y',
'dp0010021': 'y',
'dp0010022': 'y',
'dp0010023': 'y',
'dp0010024': 'y',
'dp0010025': 'y',
'dp0010026': 'y',
'dp0010027': 'y',
'dp0010028': 'y',
'dp0010029': 'y',
'dp0010030': 'y',
'dp0010031': 'y',
'dp0010032': 'y',
'dp0010033': 'y',
'dp0010034': 'y',
'dp0010035': 'n',
'dp0010036': 'y',
'dp0010037': 'n',
'dp0010038': 'n',
'dp0010039': 'y',
'dp0010040': 'y',
'dp0010041': 'y',
'dp0010042': 'y',
'dp0010043': 'y',
'dp0010044': 'n',
'dp0010045': 'y',
'dp0010046': 'y',
'dp0010047': 'y',
'dp0010048': 'y',
'dp0010049': 'y',
'dp0010050': 'y',
'dp0010051': 'y',
'dp0010052': 'y',
'dp0010053': 'y',
'dp0010054': 'y',
'dp0010055': 'y',
'dp0010056': 'y',
'dp0010057': 'y',
'dp0020001': 'y',
'dp0020002': 'y',
'dp0020003': 'y',
'dp0030001': 'y',
'dp0030002': 'y',
'dp0030003': 'y',
'dp0040001': 'y',
'dp0040002': 'y',
'dp0040003': 'y',
'dp0050001': 'y',
'dp0050002': 'y',
'dp0050003': 'y',
'dp0060001': 'y',
'dp0060002': 'y',
'dp0060003': 'y',
'dp0070001': 'y',
'dp0070002': 'y',
'dp0070003': 'y',
'dp0080001': 'y',
'dp0080002': 'y',
'dp0080003': 'y',
'dp0080004': 'y',
'dp0080005': 'y',
'dp0080006': 'y',
'dp0080007': 'n',
'dp0080008': 'n',
'dp0080009': 'n',
'dp0080010': 'n',
'dp0080011': 'n',
'dp0080012': 'n',
'dp0080013': 'n',
'dp0080014': 'n',
'dp0080015': 'n',
'dp0080016': 'n',
'dp0080017': 'n',
'dp0080018': 'n',
'dp0080019': 'n',
'dp0080020': 'n',
'dp0080021': 'y',
'dp0080022': 'n',
'dp0080023': 'n',
'dp0080024': 'n',
'dp0090001': 'y',
'dp0090002': 'y',
'dp0090003': 'y',
'dp0090004': 'n',
'dp0090005': 'n',
'dp0090006': 'y',
'dp0100001': 'y',
'dp0100002': 'y',
'dp0100003': 'y',
'dp0100004': 'n',
'dp0100005': 'n',
'dp0100006': 'n',
'dp0100007': 'y',
'dp0110001': 'y',
'dp0110002': 'y',
'dp0110003': 'y',
'dp0110004': 'n',
'dp0110005': 'y',
'dp0110006': 'n',
'dp0110007': 'n',
'dp0110008': 'y',
'dp0110009': 'n',
'dp0110010': 'y',
'dp0110011': 'y',
'dp0110012': 'y',
'dp0110013': 'y',
'dp0110014': 'n',
'dp0110015': 'n',
'dp0110016': 'n',
'dp0110017': 'y',
'dp0120001': 'y',
'dp0120002': 'y',
'dp0120003': 'y',
'dp0120004': 'y',
'dp0120005': 'y',
'dp0120006': 'y',
'dp0120007': 'y',
'dp0120008': 'y',
'dp0120009': 'y',
'dp0120010': 'y',
'dp0120011': 'y',
'dp0120012': 'y',
'dp0120013': 'y',
'dp0120014': 'y',
'dp0120015': 'y',
'dp0120016': 'y',
'dp0120017': 'y',
'dp0120018': 'n',
'dp0120019': 'n',
'dp0120020': 'n',
'dp0130001': 'y',
'dp0130002': 'y',
'dp0130003': 'y',
'dp0130004': 'y',
'dp0130005': 'y',
'dp0130006': 'n',
'dp0130007': 'y',
'dp0130008': 'y',
'dp0130009': 'y',
'dp0130010': 'y',
'dp0130011': 'y',
'dp0130012': 'y',
'dp0130013': 'y',
'dp0130014': 'y',
'dp0130015': 'y',
'dp0140001': 'n',
'dp0150001': 'y',
'dp0160001': 'y',
'dp0170001': 'y',
'dp0180001': 'y',
'dp0180002': 'y',
'dp0180003': 'y',
'dp0180004': 'n',
'dp0180005': 'n',
'dp0180006': 'y',
'dp0180007': 'y',
'dp0180008': 'y',
'dp0180009': 'y',
'dp0190001': 'n',
'dp0200001': 'y',
'dp0210001': 'y',
'dp0210002': 'y',
'dp0210003': 'y',
'dp0220001': 'y',
'dp0220002': 'y',
'dp0230001': 'y',
'dp0230002': 'y',
'shape_area': 'y',
'shape_leng': 'y'}
# In[26]:
# Attach the manual approvals to the metrics table (aligned by index)
num_metrics['approve'] = Series(answers)
len(num_metrics['approve'])
# In[29]:
# Color-code approved (blue) vs rejected (red) columns for the scatter
colors = {'y': 'b', 'n': 'r'}
df = num_metrics.copy()
df["Color"] = df['approve'].apply(lambda x: colors[x])
df
# NOTE(review): logx=True on kurtosis, which can be negative -- points
# with non-positive kurtosis may be dropped from the plot; confirm intended
df.plot(x='kurtosis',y='skew',kind='scatter',c=df.Color,figsize=(12,12),logx=True)
# In[32]:
df.groupby('approve')['kurtosis'].mean()
# ### Collecting columns by kurtosis
# In[33]:
# Bin columns into kurtosis ranges of width 20 and list the members of
# each non-empty bin.
k_max = num_metrics['kurtosis'].max() - 0.05 * abs(num_metrics['kurtosis'].max())  ## make it a little smaller
k_min = 1.05 * num_metrics['kurtosis'].min()  ## make it a little larger
t = num_metrics['kurtosis'].groupby(pd.cut(num_metrics["kurtosis"], np.arange(k_min, k_max, 20)))
print('kurtosis ranges\n-------- ------')
# Iterate the groups directly instead of reaching into the internal
# (and since-removed) `t.grouper.levels` attribute; skip empty bins,
# matching the original `a in t.indices` check.
for interval, members in t:
    if len(members):
        print(interval, ': ', ', '.join(members.index.values), '\n')
# In[185]:
# Histogram every metric column: 7 equal-interval bins, log-scaled counts
df.plot(subplots=True,bins=7,layout=(9,5),figsize=(21,21),kind='hist',logy=True,legend=False);
# In[354]:
# Fetch the schema (LIMIT 0) and keep non-banned, non-string columns
column_names = mkUrl(acctName,tableName,limit='LIMIT%200')
print(column_names)
t = pd.read_csv(column_names)
banned = ['the_geom','the_geom_webmercator','cartodb_id','created_at','updated_at']
cols = []
for c in t:
    # `and` rather than bitwise `&`: both operands are plain bools
    if c not in banned and t[c].dtype != 'object':
        cols.append(c)
print(','.join(cols))
# Get more indepth stats using [scipy.stats.desc](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.describe.html)
# In[88]:
## Give back the stats of each column (via scipy.stats.describe):
## describe() returns (nobs, minmax, mean, variance, skewness, kurtosis)
print('\t'.join(['clmn','num','minmax','mean','var','skew','kurtosis']))
# NOTE(review): `df` also carries the non-numeric 'approve' and 'Color'
# columns added earlier -- describe() on those will likely fail; confirm
# whether this cell is meant to run on the numeric frame instead.
for c in df:
    # drop infinities and NaNs so describe() sees only finite values
    n, mm, mu, s, sk, k = desc(df[c].replace([np.inf, -np.inf], np.nan).dropna())
    print(c, '\t', '\t'.join(str(x) for x in [n, mm, mu, s, sk, k]))
# ## Poisson Distribution (with seven bins)
# More [here](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.poisson.html)
# In[57]:
# BUG FIX throughout this section: plt.hist's `normed=` keyword was
# deprecated and then removed (Matplotlib 3.1) -- `density=True` is the
# equivalent replacement.
s = np.random.poisson(20, 100000)
count, bins, ignored = plt.hist(s, bins=44, density=True)
plt.show()
# ## Pareto Distribution
# More [here](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.pareto.html)
# In[60]:
a, m = 3., 1. # shape and mode
s = np.random.pareto(a, 100000) + m
count, bins, ignored = plt.hist(s, 100, density=True)
# ## Power
# More [here]()
# In[62]:
a = 5. # shape
samples = 100000
s = np.random.power(a, samples)
count, bins, ignored = plt.hist(s, bins=75)
# ## Normal Dist
# More [here]()
# In[65]:
mu, sigma = 0, 0.1 # mean and standard deviation
s = np.random.normal(mu, sigma, 100000)
count, bins, ignored = plt.hist(np.pi + s, bins=100, density=True)
# ## Zipf/power law
# In[66]:
a = 2. # parameter
s = np.random.zipf(a, 100000)
count, bins, ignored = plt.hist(s[s < 50], bins=75, density=True)
# ## Classify Data by Distribution
#
# How to identify distributions of data?
#
# In[ ]: