#!/usr/bin/env python
# coding: utf-8

# ## Pecan Cookies
#
# ### Conditions for making maps based on the content in a column
#
# 1. **Category** -- any type of geometry
#
#    + Is the data type boolean (`true`, `false`, `null`) or string?
#      + If no, it's not a **category** map
#      + If yes, is x% of the data non-`null`?
#        + If yes, is x% of the data in the 10 most frequent values?
#    + Column name content scoring
#    + Histogram analysis (distribution shape)
#    + Check later if needed: sparse versus dense (points-per-pixel threshold)
#
# 2. **Torque** -- point type geometry
#
#    + If there's a date type column, make a basic torque map
#    + If there's a date type column and a category with a high `w`, make a torque category map
#    + If there's a number column where the numbers increase monotonically and there is no date column, make a torque map off of this column
#
# 3. **Choropleth** (any type of geometry) or **Bubble** (point type)
#
#    + If the number of distinct values is greater than three, make a choropleth
#    + If the distribution is an L or a J, use a gradual color ramp and heads/tails quantification
#    + If the distribution is an A or a U, use a divergent color ramp and Jenks quantification
#    + If the distribution is an F, use a gradual color ramp and equal-interval or quantile quantification
#
# (Rough code sketches: branches 1 and 2 right after the Ideas list below; branch 3 at the end of the notebook.)
#
# ![in a diagram!](http://i.imgur.com/lSOJGLk.png)
#
# #### Methods
# 1. Look at the data as a histogram in seven bins of equal interval
# 2. Classify distributions according to [Galtung's Classification](http://en.wikipedia.org/wiki/Multimodal_distribution#Galtung.27s_classification)
#
# #### Ideas
#
# 1. Based on the distribution in equal-interval bins, guess the best quantification method
# 2. Generate different distributions (binormal, Poisson, gamma, Pareto, power law)
# 3. Classify distributions of data into:
#    1. Unimodal (A)
#    2. Decay left (J)
#    3. Bimodal (U)
#    4. Multimodal (S)
#    5. Decay right (L)
#    6. Flat (F)
# 4. Explore how kurtosis affects the quantification methods -- is this taken care of by looking at thresholds as small as 5%?
# 5. Machine learning classification: classify columns by hand as suitable or unsuitable for visualization and add this as an additional column in the metrics data set. Use this as a training set to build a decision model for test sets...
# 6. Normalization: looking at the column names, find one that may be a population column from variations in its spelling
# 7. Create polygon choropleths that are normalized against the area of the polygon or some other parameter associated with that row (such as an autodetected population column?)
#    + What units? $m^2$, $km^2$?
#    + How do you know when it is OK to normalize and when it is not? [You don't want to reflect a population map if your intention is highlighting another variable](http://en.wikipedia.org/wiki/Choropleth_map).
#      - Case 1: Census District Data -- most columns should be normalized by population, but normalizing by area may not make sense
#      - Case 2: Map Pluto -- it doesn't make sense to normalize any of it by the area of the building footprint
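# #### Sketch: category and torque branches
#
# A minimal sketch of the basic checks in branches 1 and 2 above, assuming one pandas `Series` per column. The threshold values stand in for the unspecified "x%" cut-offs, the function names (`is_category_candidate`, `torque_column`) are placeholders rather than part of any existing pipeline, and the type checks use the modern `pandas.api.types` helpers.

# In[ ]:


import pandas as pd

def is_category_candidate(col, max_null_frac=0.8, top_n=10, min_top_n_coverage=0.9):
    """Branch 1 sketch: a boolean/string column whose most frequent values cover most rows.
    Thresholds are placeholders for the 'x%' values in the outline above."""
    if col.dtype.name not in ('object', 'bool'):
        return False                                    # not boolean/string -> not a category map
    if col.isnull().mean() > max_null_frac:
        return False                                    # too many nulls
    counts = col.value_counts(dropna=True)
    if counts.empty:
        return False
    return counts.head(top_n).sum() / counts.sum() >= min_top_n_coverage

def torque_column(df):
    """Branch 2 sketch: prefer a date column for a basic torque map; otherwise fall back
    to a numeric column whose non-null values increase monotonically."""
    for c in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[c]):
            return c
    for c in df.columns:
        s = df[c].dropna()
        if pd.api.types.is_numeric_dtype(s) and len(s) > 1 and s.is_monotonic_increasing:
            return c
    return None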
# In[25]:


get_ipython().run_line_magic('pylab', 'inline')

import pandas as pd
from urllib.request import urlopen
from scipy.stats import describe as desc
from pandas import DataFrame, Series
import json

debug = False

def getRowInfo(acct, tableName):
    return 'http://' + acct + '.cartodb.com/api/v2/sql?q=SELECT%20*%20FROM%20' + tableName + '%20LIMIT%200&format=json'

def mkUrl(acct, tableName, cols='*', limit='', fileType='csv'):
    """
    acct:      cartodb account name
    tableName: name of table in account
    cols:      columns of data to pull (optional, default: * (all columns))
    limit:     number of rows requested (optional, default: all rows)
    fileType:  format of returned data (optional, default: csv)
    """
    stem = 'http://' + acct + '.cartodb.com/api/v2/sql?q='
    params = '&format=' + fileType + \
             '&filename=' + tableName
    query = 'SELECT%20' + cols + '%20FROM%20' + tableName + '%20' + limit
    url = stem + query + params
    return url

def normalizeCols(cols):
    # append '/a.area' to every column in a comma-separated list
    m = {ord(','): '/a.area,'}
    updatedCols = cols.translate(m) + '/a.area'
    return updatedCols

def mkGeoUrl(acct, tableName):
    return 'http://' + acct + '.cartodb.com/api/v2/sql' \
           '?q=SELECT%20ST_GeometryType(the_geom)%20geometry_type%20FROM%20' + tableName + '%20WHERE%20the_geom%20IS%20NOT%20NULL%20LIMIT%201' \
           '&format=json'

def simplifyType(g):
    return {
        'st_multipolygon': 'polygon',
        'st_polygon': 'polygon',
        'st_multilinestring': 'line',
        'st_linestring': 'line',
        'st_multipoint': 'point',
        'st_point': 'point'
    }[g.lower()]


# In[26]:


mkUrl('eschbacher', 'all_month', cols='mag,depth,place', fileType='json')


# In[27]:


testUrl1 = 'http://eschbacher.cartodb.com/api/v2/sql?q=SELECT%20mag,depth,place%20FROM%20all_month%20&format=json&filename=all_month'
testUrl2 = 'http://eschbacher.cartodb.com/api/v2/sql?q=WITH%20a%20AS%20(SELECT%201e6*ST_Area(the_geom::geography)%20area%20FROM%20all_month)%20SELECT%20mag/a.area,depth/a.area,place/a.area%20FROM%20all_month,a%20&format=csv&filename=all_month'

assert mkUrl('eschbacher', 'all_month', cols='mag,depth,place', fileType='json') == testUrl1
assert normalizeCols('mag,depth,place') == 'mag/a.area,depth/a.area,place/a.area'


# ## Category Maps -- Map Pluto Manhattan
# ### Grab all strings and bools

# In[28]:


acctName = 'common-data'
tableName = 'mnmappluto'
fileName = 'mnbuildings'

# Grab column information
url = getRowInfo(acctName, tableName)
if debug:
    print(url)

response = urlopen(url).read().decode('utf-8')
column_data = json.loads(response)

# Filter out the string and boolean columns
str_columns = []
for c in column_data['fields']:
    if column_data['fields'][c]['type'] in ('string', 'boolean'):
        str_columns.append(c)

print("Columns (" + str(len(str_columns)) + ") of type 'string' or 'boolean':\n", str_columns)


# In[29]:


# Create API request
a = mkUrl(acctName, tableName, cols=','.join(str_columns))
print(a)


# In[37]:


# Retrieve only approved columns
ad = pd.read_csv(a)
ad.head()


# In[38]:


metrics = DataFrame({'uniques': [len(ad[c].unique()) for c in ad],
                     'perc_nulls': [ad[c].isnull().sum() / len(ad) for c in ad]},
                    index=ad.columns)
metrics.sort_values('uniques')


# ### Making the cut
#
# Making the cut means:
# 1. The fraction of `nan`/`null` values is smaller than 80%
# 2. The number of unique entries is greater than one and less than eleven

# In[43]:


decision = (metrics['uniques'] >= 2) & (metrics['uniques'] <= 10) & (metrics['perc_nulls'] < 0.8)
winners = [c for c in metrics[decision].index]
metrics[decision].sort_values('uniques')


# In[44]:


print(sorted(winners))


# These maps are here: (embedded map iframes not preserved in this export)

# ## Who shouldn't have made it?
#
# 1. `plutomapid`: 2 uniques and no nulls.
#
# **Solution:** Next, filter out columns where the data is concentrated very strongly in one value. A 95% threshold?
#
# ## Who should've made it?
#
# 1. `lottype`: failed because of 11 uniques -- visually interesting, and the rest of the data can be accessed through infowindows
#
# **Solution:** Check whether a large proportion of the rows fall in the first seven modes instead of weighting the number of unique entries so strongly. Maybe 85-90% in the first seven modes? (A rough sketch of both checks follows.)
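# A rough sketch of the two refinements proposed above, assuming the `ad` DataFrame of string/boolean columns loaded earlier: reject a column when a single value dominates it (the ~95% idea) and otherwise accept it when its first seven modes cover most rows (the ~85-90% idea). `passes_mode_checks` is a hypothetical helper, and the thresholds are the rough numbers floated above, not tuned values.

# In[ ]:


def passes_mode_checks(col, dominance_thresh=0.95, top_modes=7, coverage_thresh=0.85):
    """Keep a column only if no single value dominates it and its top modes cover most rows."""
    freqs = col.value_counts(dropna=False, normalize=True)   # sorted, most frequent first
    if freqs.empty or freqs.iloc[0] >= dominance_thresh:
        return False                                         # would catch columns like plutomapid
    return freqs.head(top_modes).sum() >= coverage_thresh    # would admit columns like lottype

refined = [c for c in ad if passes_mode_checks(ad[c])]
print(sorted(refined))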
# ### How many are in each column?

# In[69]:


ad_win = ad[winners]

maxval = 0
for c in ad:
    temp = 100.0 * ad[c].value_counts(dropna=False) / len(ad[c])
    if len(temp) < 10:
        maxval = len(temp)
    else:
        maxval = 10
    sval = sum(temp.head(maxval))
    print(c, sval, 'in', maxval, 'bins')
    print(temp)
    # print(100.0 * temp / len(ad[c]))


# In[64]:


temp.head(10)


# In[48]:


## Number patterns
for c in ad_win:
    i = 1
    sumvals = 0
    for r in ad_win[c].value_counts(dropna=False):
        print('i', i)
        print('sum', sumvals)
        if i > 7:
            break
        print(r)
        sumvals = sumvals + r
        i = i + 1
    print(sumvals / len(ad_win))


# ### What to do if the distribution is _very_ lopsided toward one category?
#
# **Options**
# 1. If a large percentage (95%?) is in one category, drop this column
# 2.

# ## Choropleth Maps -- 2010 Census Data

# In[17]:


acctName = 'common-data'
tableName = 'tract_2010census_dp1'  # 'all_day'
fileName = 'mnbuildings'

# Grab column data
url = mkUrl(acctName, tableName, limit='LIMIT%200', fileType='json')
response = urlopen(url).read().decode('utf-8')
column_data = json.loads(response)

# Filter out the good ones, exclude bookkeeping ones
num_columns = []
banned = ['longitude', 'lat', 'latitude', 'long', 'lon', 'cartodb_id']
for c in column_data['fields']:
    if (column_data['fields'][c]['type'] == 'number') and (c not in banned):
        num_columns.append(c)

print("Columns (" + str(len(num_columns)) + ") of type 'number':\n", num_columns)


# In[19]:


# Create API request
num_url = mkUrl(acctName, tableName, cols=','.join(num_columns))
print(num_url)


# In[20]:


# Retrieve only approved columns
choro_df = pd.read_csv(num_url)
choro_df.head()


# ## Build up some metrics on each column
#
# Notes about metrics:
#
# * **kurtosis**: a measure of the 'peakedness' of a distribution. A large value corresponds to a sharper peak, a value of zero matches the normal distribution, and a negative value is flatter than a normal distribution.
# * **skewness**: a measure of the asymmetry of a distribution. A positive value means the bulk of the data sits at low values with a long tail to the right; a negative value is the mirror image. For instance, `blddepth` peaks at small values and decays toward larger values on its histogram, so it has a strong positive skewness. (A quick synthetic check follows.)
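# A quick synthetic check of the sign conventions above (illustrative only, not part of the census workflow): an exponential sample piles up at low values with a long right tail, so both its skewness and its excess kurtosis come out positive.

# In[ ]:


# synthetic sample with a long right tail
demo = pd.Series(np.random.exponential(scale=1.0, size=100000))
print('skew    :', demo.skew())        # > 0: bulk at low values, tail to the right
print('kurtosis:', demo.kurtosis())    # > 0: sharper peak / heavier tails than the normal distribution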
# In[21]:


num_metrics = DataFrame({'uniques': [len(choro_df[c].unique()) for c in choro_df],
                         'skew': [choro_df[x].skew() for x in choro_df],
                         'kurtosis': [choro_df[x].kurtosis() for x in choro_df],
                         'mean': [choro_df[x].mean() for x in choro_df],
                         'median': [choro_df[x].median() for x in choro_df],
                         'stddev': [choro_df[x].std() for x in choro_df]},
                        index=choro_df.columns)

l = len(choro_df)
cols = []
vals = []
perc_nulls = []

## Does 98% of the data lie in one bin (of seven)?
for c in choro_df:
    cols.append(c)
    perc_nulls.append(choro_df[c].isnull().sum() / l)   # fraction of nulls
    v, b = np.histogram(choro_df[c].dropna(), bins=7)
    if (v / l > 0.98).any():
        vals.append(True)
    else:
        vals.append(False)

## Are there outliers?
## Modality classes: L, J, A, U, F, etc., from:
## http://en.wikipedia.org/wiki/Multimodal_distribution#Galtung.27s_classification

num_metrics['num_98'] = Series(vals, index=cols)
num_metrics['perc_nulls'] = Series(perc_nulls, index=cols)
num_metrics['skew_abs'] = abs(num_metrics['skew'])

num_metrics.sort_values('uniques')


# In[23]:


# answers = {}
# for c in cols:
#     msg = "do you like " + c + "'s map? "
#     ans = input(msg)
#     print('you answered', ans)
#     answers[c] = ans

answers = {'aland10': 'y', 'awater10': 'y',
           'dp0010001': 'y', 'dp0010002': 'y', 'dp0010003': 'y', 'dp0010004': 'n', 'dp0010005': 'y',
           'dp0010006': 'y', 'dp0010007': 'y', 'dp0010008': 'y', 'dp0010009': 'y', 'dp0010010': 'y',
           'dp0010011': 'y', 'dp0010012': 'y', 'dp0010013': 'y', 'dp0010014': 'y', 'dp0010015': 'y',
           'dp0010016': 'y', 'dp0010017': 'y', 'dp0010018': 'y', 'dp0010019': 'n', 'dp0010020': 'y',
           'dp0010021': 'y', 'dp0010022': 'y', 'dp0010023': 'y', 'dp0010024': 'y', 'dp0010025': 'y',
           'dp0010026': 'y', 'dp0010027': 'y', 'dp0010028': 'y', 'dp0010029': 'y', 'dp0010030': 'y',
           'dp0010031': 'y', 'dp0010032': 'y', 'dp0010033': 'y', 'dp0010034': 'y', 'dp0010035': 'n',
           'dp0010036': 'y', 'dp0010037': 'n', 'dp0010038': 'n', 'dp0010039': 'y', 'dp0010040': 'y',
           'dp0010041': 'y', 'dp0010042': 'y', 'dp0010043': 'y', 'dp0010044': 'n', 'dp0010045': 'y',
           'dp0010046': 'y', 'dp0010047': 'y', 'dp0010048': 'y', 'dp0010049': 'y', 'dp0010050': 'y',
           'dp0010051': 'y', 'dp0010052': 'y', 'dp0010053': 'y', 'dp0010054': 'y', 'dp0010055': 'y',
           'dp0010056': 'y', 'dp0010057': 'y',
           'dp0020001': 'y', 'dp0020002': 'y', 'dp0020003': 'y',
           'dp0030001': 'y', 'dp0030002': 'y', 'dp0030003': 'y',
           'dp0040001': 'y', 'dp0040002': 'y', 'dp0040003': 'y',
           'dp0050001': 'y', 'dp0050002': 'y', 'dp0050003': 'y',
           'dp0060001': 'y', 'dp0060002': 'y', 'dp0060003': 'y',
           'dp0070001': 'y', 'dp0070002': 'y', 'dp0070003': 'y',
           'dp0080001': 'y', 'dp0080002': 'y', 'dp0080003': 'y', 'dp0080004': 'y', 'dp0080005': 'y',
           'dp0080006': 'y', 'dp0080007': 'n', 'dp0080008': 'n', 'dp0080009': 'n', 'dp0080010': 'n',
           'dp0080011': 'n', 'dp0080012': 'n', 'dp0080013': 'n', 'dp0080014': 'n', 'dp0080015': 'n',
           'dp0080016': 'n', 'dp0080017': 'n', 'dp0080018': 'n', 'dp0080019': 'n', 'dp0080020': 'n',
           'dp0080021': 'y', 'dp0080022': 'n', 'dp0080023': 'n', 'dp0080024': 'n',
           'dp0090001': 'y', 'dp0090002': 'y', 'dp0090003': 'y', 'dp0090004': 'n', 'dp0090005': 'n',
           'dp0090006': 'y',
           'dp0100001': 'y', 'dp0100002': 'y', 'dp0100003': 'y', 'dp0100004': 'n', 'dp0100005': 'n',
           'dp0100006': 'n', 'dp0100007': 'y',
           'dp0110001': 'y', 'dp0110002': 'y', 'dp0110003': 'y', 'dp0110004': 'n', 'dp0110005': 'y',
           'dp0110006': 'n', 'dp0110007': 'n', 'dp0110008': 'y', 'dp0110009': 'n', 'dp0110010': 'y',
           'dp0110011': 'y', 'dp0110012': 'y', 'dp0110013': 'y', 'dp0110014': 'n', 'dp0110015': 'n',
           'dp0110016': 'n', 'dp0110017': 'y',
           'dp0120001': 'y', 'dp0120002': 'y', 'dp0120003': 'y', 'dp0120004': 'y', 'dp0120005': 'y',
           'dp0120006': 'y', 'dp0120007': 'y', 'dp0120008': 'y', 'dp0120009': 'y', 'dp0120010': 'y',
           'dp0120011': 'y', 'dp0120012': 'y', 'dp0120013': 'y', 'dp0120014': 'y', 'dp0120015': 'y',
           'dp0120016': 'y', 'dp0120017': 'y', 'dp0120018': 'n', 'dp0120019': 'n', 'dp0120020': 'n',
           'dp0130001': 'y', 'dp0130002': 'y', 'dp0130003': 'y', 'dp0130004': 'y', 'dp0130005': 'y',
           'dp0130006': 'n', 'dp0130007': 'y', 'dp0130008': 'y', 'dp0130009': 'y', 'dp0130010': 'y',
           'dp0130011': 'y', 'dp0130012': 'y', 'dp0130013': 'y', 'dp0130014': 'y', 'dp0130015': 'y',
           'dp0140001': 'n', 'dp0150001': 'y', 'dp0160001': 'y', 'dp0170001': 'y',
           'dp0180001': 'y', 'dp0180002': 'y', 'dp0180003': 'y', 'dp0180004': 'n', 'dp0180005': 'n',
           'dp0180006': 'y', 'dp0180007': 'y', 'dp0180008': 'y', 'dp0180009': 'y',
           'dp0190001': 'n', 'dp0200001': 'y',
           'dp0210001': 'y', 'dp0210002': 'y', 'dp0210003': 'y',
           'dp0220001': 'y', 'dp0220002': 'y',
           'dp0230001': 'y', 'dp0230002': 'y',
           'shape_area': 'y', 'shape_leng': 'y'}
# In[26]:


num_metrics['approve'] = Series(answers)
len(num_metrics['approve'])


# In[29]:


colors = {'y': 'b', 'n': 'r'}
df = num_metrics.copy()
df["Color"] = df['approve'].apply(lambda x: colors[x])
df
df.plot(x='kurtosis', y='skew', kind='scatter', c=df.Color, figsize=(12, 12), logx=True)


# In[32]:


df.groupby('approve')['kurtosis'].mean()


# ### Collecting columns by kurtosis

# In[33]:


k_max = num_metrics['kurtosis'].max() - 0.05 * abs(num_metrics['kurtosis'].max())  ## make it a little smaller
k_min = 1.05 * num_metrics['kurtosis'].min()  ## make it a little larger

t = num_metrics['kurtosis'].groupby(pd.cut(num_metrics["kurtosis"], np.arange(k_min, k_max, 20)))

print('kurtosis ranges\n-------- ------')
for a in t.grouper.levels[0].values:
    if a in t.indices:
        print(a, ': ', ', '.join(t.get_group(a).index.values), '\n')


# In[185]:


df.plot(subplots=True, bins=7, layout=(9, 5), figsize=(21, 21), kind='hist', logy=True, legend=False);


# In[354]:


column_names = mkUrl(acctName, tableName, limit='LIMIT%200')
print(column_names)
t = pd.read_csv(column_names)

banned = ['the_geom', 'the_geom_webmercator', 'cartodb_id', 'created_at', 'updated_at']
cols = []
for c in t:
    if (c not in banned) and (t[c].dtype != 'object'):
        cols.append(c)

print(','.join(cols))


# Get more in-depth stats using [scipy.stats.describe](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.describe.html)

# In[88]:


## Give back the stats of each column (via scipy.stats)
print('\t'.join(['clmn', 'num', 'minmax', 'mean', 'var', 'skew', 'kurtosis']))
for c in df.select_dtypes(include=[np.number]):   # numeric columns only (skips 'approve' and 'Color')
    n, mm, mu, s, sk, k = desc(df[c].replace([np.inf, -np.inf], np.nan).dropna())
    print(c, '\t', '\t'.join(str(x) for x in [n, mm, mu, s, sk, k]))


# ## Poisson Distribution (with seven bins)
# More [here](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.poisson.html)

# In[57]:


s = np.random.poisson(20, 100000)
count, bins, ignored = plt.hist(s, bins=44, density=True)
plt.show()


# ## Pareto Distribution
# More [here](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.pareto.html)

# In[60]:


a, m = 3., 1.  # shape and mode
s = np.random.pareto(a, 100000) + m
count, bins, ignored = plt.hist(s, 100, density=True)


# ## Power
# More [here]()

# In[62]:


a = 5.  # shape
samples = 100000
s = np.random.power(a, samples)
count, bins, ignored = plt.hist(s, bins=75)


# ## Normal Dist
# More [here]()

# In[65]:


mu, sigma = 0, 0.1  # mean and standard deviation
s = np.random.normal(mu, sigma, 100000)
count, bins, ignored = plt.hist(np.pi + s, bins=100, density=True)


# ## Zipf/power law

# In[66]:


a = 2.  # parameter
s = np.random.zipf(a, 100000)
count, bins, ignored = plt.hist(s[s < 50], bins=75, density=True)


# ## Classify Data by Distribution
#
# How to identify distributions of data?

# In[ ]:
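# One possible answer, tying the Methods section at the top to branch 3 of the outline: bin a column
# into seven equal-interval bins and classify the resulting histogram into rough Galtung-style shapes
# (A, J, L, U, F, S), then map the shape onto the ramp/quantification choices listed in branch 3.
# This is a minimal sketch under those assumptions -- the peak detection, the flatness tolerance, and
# the `suggest_style` mapping are placeholders, not a settled heuristic.

import numpy as np

def classify_shape(values, bins=7, flat_tol=0.5):
    """Label a 1-D sample as 'A', 'J', 'L', 'U', 'F', or 'S' from its equal-interval histogram."""
    v = np.asarray(values, dtype=float)
    v = v[np.isfinite(v)]
    if v.size == 0:
        return 'F'
    counts, _ = np.histogram(v, bins=bins)
    rel = counts / counts.sum()
    # non-strict local maxima of the binned distribution
    peaks = [i for i in range(bins)
             if (i == 0 or rel[i] >= rel[i - 1]) and (i == bins - 1 or rel[i] >= rel[i + 1])]
    if (rel.max() - rel.min()) / rel.max() < flat_tol:
        return 'F'                      # roughly flat
    if len(peaks) == 1:
        if peaks[0] == 0:
            return 'L'                  # peak at the low end, decaying to the right
        if peaks[0] == bins - 1:
            return 'J'                  # peak at the high end, decaying to the left
        return 'A'                      # single interior peak
    if len(peaks) == 2 and 0 in peaks and (bins - 1) in peaks:
        return 'U'                      # peaks at both ends
    return 'S'                          # multimodal / anything else

def suggest_style(shape):
    """Map a shape label onto the ramp/quantification pairings from branch 3 above."""
    return {
        'L': ('gradual ramp', 'heads/tails'),
        'J': ('gradual ramp', 'heads/tails'),
        'A': ('divergent ramp', 'jenks'),
        'U': ('divergent ramp', 'jenks'),
        'F': ('gradual ramp', 'equal interval or quantile'),
    }.get(shape, ('gradual ramp', 'quantile'))

# e.g. label the census columns pulled earlier
shapes = {c: classify_shape(choro_df[c].dropna()) for c in choro_df}
print({c: (s, suggest_style(s)) for c, s in list(shapes.items())[:10]})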