#!/usr/bin/env python
# coding: utf-8

# ## Pecan Cookies
#
# ### Conditions for making maps based on the content in a column
#
# 1. **Category** -- any type of geometry
#
#    + Is the data type boolean (`true`, `false`, `null`) or string?
#      + If no, it's not a **category** map
#      + If yes, is x% of the data non-`null`?
#        + If yes, is x% of the data in the 10 most frequent values?
#    + Column name content scoring
#    + Histogram analysis (distribution shape)
#    + Check later if needed: sparse versus dense (points-per-pixel threshold)
#
# 2. **Torque** -- point type geometry
#
#    + If there's a date type column, make a basic torque map
#    + If there's a date type column and a category with a high `w`, make a torque category map
#    + If there's a number column where the numbers increase monotonically and there is no date column, make a torque map off of this column
#
# 3. **Choropleth** (any type of geometry) or **Bubble** (point type)
#
#    + If the number of distinct values is greater than three, make a choropleth
#    + If the distribution is an L or a J, use a gradual color ramp and heads/tails quantification
#    + If the distribution is an A or a U, use a divergent color ramp and Jenks quantification
#    + If the distribution is an F, use a gradual color ramp and equal-interval or quantile quantification
#
# (Rough code sketches: branches 1 and 2 right after the Ideas list below; branch 3 at the end of the notebook.)
#
# ![in a diagram!](http://i.imgur.com/lSOJGLk.png)
#
# #### Methods
# 1. Look at the data as a histogram in seven bins of equal interval
# 2. Classify distributions according to [Galtung's Classification](http://en.wikipedia.org/wiki/Multimodal_distribution#Galtung.27s_classification)
#
# #### Ideas
#
# 1. Based on the distribution in equal-interval bins, guess the best quantification method
# 2. Generate different distributions (binormal, Poisson, gamma, Pareto, power law)
# 3. Classify distributions of data into:
#    1. Unimodal (A)
#    2. Decay left (J)
#    3. Bimodal (U)
#    4. Multimodal (S)
#    5. Decay right (L)
#    6. Flat (F)
# 4. Explore how kurtosis affects the quantification methods -- is this taken care of by looking at thresholds as small as 5%?
# 5. Machine learning classification: classify columns by hand as suitable or unsuitable for visualization and add this as an additional column in the metrics data set. Use this as a training set to build a decision model for test sets...
# 6. Normalization: looking at the column names, find one that may be a population column from variations in its spelling
# 7. Create polygon choropleths that are normalized against the area of the polygon or some other parameter associated with that row (such as an autodetected population column?)
#    + What units? $m^2$, $km^2$?
#    + How do you know when it is OK to normalize and when it is not? [You don't want to reflect a population map if your intention is highlighting another variable](http://en.wikipedia.org/wiki/Choropleth_map).
#      - Case 1: Census District Data -- most columns should be normalized by population, but normalizing by area may not make sense
#      - Case 2: Map Pluto -- it doesn't make sense to normalize any of it by the area of the building footprint
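# #### Sketch: category and torque branches
#
# A minimal sketch of the basic checks in branches 1 and 2 above, assuming one pandas `Series` per column. The threshold values stand in for the unspecified "x%" cut-offs, the function names (`is_category_candidate`, `torque_column`) are placeholders rather than part of any existing pipeline, and the type checks use the modern `pandas.api.types` helpers.

# In[ ]:


import pandas as pd

def is_category_candidate(col, max_null_frac=0.8, top_n=10, min_top_n_coverage=0.9):
    """Branch 1 sketch: a boolean/string column whose most frequent values cover most rows.
    Thresholds are placeholders for the 'x%' values in the outline above."""
    if col.dtype.name not in ('object', 'bool'):
        return False                                    # not boolean/string -> not a category map
    if col.isnull().mean() > max_null_frac:
        return False                                    # too many nulls
    counts = col.value_counts(dropna=True)
    if counts.empty:
        return False
    return counts.head(top_n).sum() / counts.sum() >= min_top_n_coverage

def torque_column(df):
    """Branch 2 sketch: prefer a date column for a basic torque map; otherwise fall back
    to a numeric column whose non-null values increase monotonically."""
    for c in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[c]):
            return c
    for c in df.columns:
        s = df[c].dropna()
        if pd.api.types.is_numeric_dtype(s) and len(s) > 1 and s.is_monotonic_increasing:
            return c
    return None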
# In[25]:


get_ipython().run_line_magic('pylab', 'inline')

import pandas as pd
from urllib.request import urlopen
from scipy.stats import describe as desc
from pandas import DataFrame, Series
import json

debug = False

def getRowInfo(acct, tableName):
    return 'http://' + acct + '.cartodb.com/api/v2/sql?q=SELECT%20*%20FROM%20' + tableName + '%20LIMIT%200&format=json'

def mkUrl(acct, tableName, cols='*', limit='', fileType='csv'):
    """
    acct:      cartodb account name
    tableName: name of table in account
    cols:      columns of data to pull (optional, default: * (all columns))
    limit:     number of rows requested (optional, default: all rows)
    fileType:  format of returned data (optional, default: csv)
    """
    stem = 'http://' + acct + '.cartodb.com/api/v2/sql?q='
    params = '&format=' + fileType + \
             '&filename=' + tableName
    query = 'SELECT%20' + cols + '%20FROM%20' + tableName + '%20' + limit
    url = stem + query + params
    return url

def normalizeCols(cols):
    # append '/a.area' to every column in a comma-separated list
    m = {ord(','): '/a.area,'}
    updatedCols = cols.translate(m) + '/a.area'
    return updatedCols

def mkGeoUrl(acct, tableName):
    return 'http://' + acct + '.cartodb.com/api/v2/sql' \
           '?q=SELECT%20ST_GeometryType(the_geom)%20geometry_type%20FROM%20' + tableName + '%20WHERE%20the_geom%20IS%20NOT%20NULL%20LIMIT%201' \
           '&format=json'

def simplifyType(g):
    return {
        'st_multipolygon': 'polygon',
        'st_polygon': 'polygon',
        'st_multilinestring': 'line',
        'st_linestring': 'line',
        'st_multipoint': 'point',
        'st_point': 'point'
    }[g.lower()]


# In[26]:


mkUrl('eschbacher', 'all_month', cols='mag,depth,place', fileType='json')


# In[27]:


testUrl1 = 'http://eschbacher.cartodb.com/api/v2/sql?q=SELECT%20mag,depth,place%20FROM%20all_month%20&format=json&filename=all_month'
testUrl2 = 'http://eschbacher.cartodb.com/api/v2/sql?q=WITH%20a%20AS%20(SELECT%201e6*ST_Area(the_geom::geography)%20area%20FROM%20all_month)%20SELECT%20mag/a.area,depth/a.area,place/a.area%20FROM%20all_month,a%20&format=csv&filename=all_month'

assert mkUrl('eschbacher', 'all_month', cols='mag,depth,place', fileType='json') == testUrl1
assert normalizeCols('mag,depth,place') == 'mag/a.area,depth/a.area,place/a.area'


# ## Category Maps -- Map Pluto Manhattan
# ### Grab all strings and bools

# In[28]:


acctName = 'common-data'
tableName = 'mnmappluto'
fileName = 'mnbuildings'

# Grab column information
url = getRowInfo(acctName, tableName)
if debug:
    print(url)

response = urlopen(url).read().decode('utf-8')
column_data = json.loads(response)

# Filter out the string and boolean columns
str_columns = []
for c in column_data['fields']:
    if column_data['fields'][c]['type'] in ('string', 'boolean'):
        str_columns.append(c)

print("Columns (" + str(len(str_columns)) + ") of type 'string' or 'boolean':\n", str_columns)


# In[29]:


# Create API request
a = mkUrl(acctName, tableName, cols=','.join(str_columns))
print(a)


# In[37]:


# Retrieve only approved columns
ad = pd.read_csv(a)
ad.head()


# In[38]:


metrics = DataFrame({'uniques': [len(ad[c].unique()) for c in ad],
                     'perc_nulls': [ad[c].isnull().sum() / len(ad) for c in ad]},
                    index=ad.columns)
metrics.sort_values('uniques')


# ### Making the cut
#
# Making the cut means:
# 1. The fraction of `nan`/`null` values is smaller than 80%
# 2. The number of unique entries is greater than one and less than eleven

# In[43]:


decision = (metrics['uniques'] >= 2) & (metrics['uniques'] <= 10) & (metrics['perc_nulls'] < 0.8)
winners = [c for c in metrics[decision].index]
metrics[decision].sort_values('uniques')


# In[44]:


print(sorted(winners))


# These maps are here: (embedded map iframes not preserved in this export)

# ## Who shouldn't have made it?
#
# 1. `plutomapid`: 2 uniques and no nulls.
#
# **Solution:** Next, filter out columns where the data is concentrated very strongly in one value. A 95% threshold?
#
# ## Who should've made it?
#
# 1. `lottype`: failed because of 11 uniques -- visually interesting, and the rest of the data can be accessed through infowindows
#
# **Solution:** Check whether a large proportion of the rows fall in the first seven modes instead of weighting the number of unique entries so strongly. Maybe 85-90% in the first seven modes? (A rough sketch of both checks follows.)
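# A rough sketch of the two refinements proposed above, assuming the `ad` DataFrame of string/boolean columns loaded earlier: reject a column when a single value dominates it (the ~95% idea) and otherwise accept it when its first seven modes cover most rows (the ~85-90% idea). `passes_mode_checks` is a hypothetical helper, and the thresholds are the rough numbers floated above, not tuned values.

# In[ ]:


def passes_mode_checks(col, dominance_thresh=0.95, top_modes=7, coverage_thresh=0.85):
    """Keep a column only if no single value dominates it and its top modes cover most rows."""
    freqs = col.value_counts(dropna=False, normalize=True)   # sorted, most frequent first
    if freqs.empty or freqs.iloc[0] >= dominance_thresh:
        return False                                         # would catch columns like plutomapid
    return freqs.head(top_modes).sum() >= coverage_thresh    # would admit columns like lottype

refined = [c for c in ad if passes_mode_checks(ad[c])]
print(sorted(refined))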
# ### How many are in each column?

# In[69]:


ad_win = ad[winners]

maxval = 0
for c in ad:
    temp = 100.0 * ad[c].value_counts(dropna=False) / len(ad[c])
    if len(temp) < 10:
        maxval = len(temp)
    else:
        maxval = 10
    sval = sum(temp.head(maxval))
    print(c, sval, 'in', maxval, 'bins')
    print(temp)
    # print(100.0 * temp / len(ad[c]))


# In[64]:


temp.head(10)


# In[48]:


## Number patterns
for c in ad_win:
    i = 1
    sumvals = 0
    for r in ad_win[c].value_counts(dropna=False):
        print('i', i)
        print('sum', sumvals)
        if i > 7:
            break
        print(r)
        sumvals = sumvals + r
        i = i + 1
    print(sumvals / len(ad_win))


# ### What to do if the distribution is _very_ lopsided toward one category?
#
# **Options**
# 1. If a large percentage (95%?) is in one category, drop this column
# 2.

# ## Choropleth Maps -- 2010 Census Data

# In[17]:


acctName = 'common-data'
tableName = 'tract_2010census_dp1'  # 'all_day'
fileName = 'mnbuildings'

# Grab column data
url = mkUrl(acctName, tableName, limit='LIMIT%200', fileType='json')
response = urlopen(url).read().decode('utf-8')
column_data = json.loads(response)

# Filter out the good ones, exclude bookkeeping ones
num_columns = []
banned = ['longitude', 'lat', 'latitude', 'long', 'lon', 'cartodb_id']
for c in column_data['fields']:
    if (column_data['fields'][c]['type'] == 'number') and (c not in banned):
        num_columns.append(c)

print("Columns (" + str(len(num_columns)) + ") of type 'number':\n", num_columns)


# In[19]:


# Create API request
num_url = mkUrl(acctName, tableName, cols=','.join(num_columns))
print(num_url)


# In[20]:


# Retrieve only approved columns
choro_df = pd.read_csv(num_url)
choro_df.head()


# ## Build up some metrics on each column
#
# Notes about metrics:
#
# * **kurtosis**: a measure of the 'peakedness' of a distribution. A large value corresponds to a sharper peak, a value of zero matches the normal distribution, and a negative value is flatter than a normal distribution.
# * **skewness**: a measure of the asymmetry of a distribution. A positive value means the bulk of the data sits at low values with a long tail to the right; a negative value is the mirror image. For instance, `blddepth` peaks at small values and decays toward larger values on its histogram, so it has a strong positive skewness. (A quick synthetic check follows.)
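# A quick synthetic check of the sign conventions above (illustrative only, not part of the census workflow): an exponential sample piles up at low values with a long right tail, so both its skewness and its excess kurtosis come out positive.

# In[ ]:


# synthetic sample with a long right tail
demo = pd.Series(np.random.exponential(scale=1.0, size=100000))
print('skew    :', demo.skew())        # > 0: bulk at low values, tail to the right
print('kurtosis:', demo.kurtosis())    # > 0: sharper peak / heavier tails than the normal distribution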
# In[21]:


num_metrics = DataFrame({'uniques': [len(choro_df[c].unique()) for c in choro_df],
                         'skew': [choro_df[x].skew() for x in choro_df],
                         'kurtosis': [choro_df[x].kurtosis() for x in choro_df],
                         'mean': [choro_df[x].mean() for x in choro_df],
                         'median': [choro_df[x].median() for x in choro_df],
                         'stddev': [choro_df[x].std() for x in choro_df]},
                        index=choro_df.columns)

l = len(choro_df)
cols = []
vals = []
perc_nulls = []

## Does 98% of the data lie in one bin (of seven)?
for c in choro_df:
    cols.append(c)
    perc_nulls.append(choro_df[c].isnull().sum() / l)   # fraction of nulls
    v, b = np.histogram(choro_df[c].dropna(), bins=7)
    if (v / l > 0.98).any():
        vals.append(True)
    else:
        vals.append(False)

## Are there outliers?
## Modality classes: L, J, A, U, F, etc., from:
## http://en.wikipedia.org/wiki/Multimodal_distribution#Galtung.27s_classification

num_metrics['num_98'] = Series(vals, index=cols)
num_metrics['perc_nulls'] = Series(perc_nulls, index=cols)
num_metrics['skew_abs'] = abs(num_metrics['skew'])

num_metrics.sort_values('uniques')


# In[23]:


# answers = {}
# for c in cols:
#     msg = "do you like " + c + "'s map? "
#     ans = input(msg)
#     print('you answered', ans)
#     answers[c] = ans

answers = {'aland10': 'y', 'awater10': 'y',
           'dp0010001': 'y', 'dp0010002': 'y', 'dp0010003': 'y', 'dp0010004': 'n', 'dp0010005': 'y',
           'dp0010006': 'y', 'dp0010007': 'y', 'dp0010008': 'y', 'dp0010009': 'y', 'dp0010010': 'y',
           'dp0010011': 'y', 'dp0010012': 'y', 'dp0010013': 'y', 'dp0010014': 'y', 'dp0010015': 'y',
           'dp0010016': 'y', 'dp0010017': 'y', 'dp0010018': 'y', 'dp0010019': 'n', 'dp0010020': 'y',
           'dp0010021': 'y', 'dp0010022': 'y', 'dp0010023': 'y', 'dp0010024': 'y', 'dp0010025': 'y',
           'dp0010026': 'y', 'dp0010027': 'y', 'dp0010028': 'y', 'dp0010029': 'y', 'dp0010030': 'y',
           'dp0010031': 'y', 'dp0010032': 'y', 'dp0010033': 'y', 'dp0010034': 'y', 'dp0010035': 'n',
           'dp0010036': 'y', 'dp0010037': 'n', 'dp0010038': 'n', 'dp0010039': 'y', 'dp0010040': 'y',
           'dp0010041': 'y', 'dp0010042': 'y', 'dp0010043': 'y', 'dp0010044': 'n', 'dp0010045': 'y',
           'dp0010046': 'y', 'dp0010047': 'y', 'dp0010048': 'y', 'dp0010049': 'y', 'dp0010050': 'y',
           'dp0010051': 'y', 'dp0010052': 'y', 'dp0010053': 'y', 'dp0010054': 'y', 'dp0010055': 'y',
           'dp0010056': 'y', 'dp0010057': 'y',
           'dp0020001': 'y', 'dp0020002': 'y', 'dp0020003': 'y',
           'dp0030001': 'y', 'dp0030002': 'y', 'dp0030003': 'y',
           'dp0040001': 'y', 'dp0040002': 'y', 'dp0040003': 'y',
           'dp0050001': 'y', 'dp0050002': 'y', 'dp0050003': 'y',
           'dp0060001': 'y', 'dp0060002': 'y', 'dp0060003': 'y',
           'dp0070001': 'y', 'dp0070002': 'y', 'dp0070003': 'y',
           'dp0080001': 'y', 'dp0080002': 'y', 'dp0080003': 'y', 'dp0080004': 'y', 'dp0080005': 'y',
           'dp0080006': 'y', 'dp0080007': 'n', 'dp0080008': 'n', 'dp0080009': 'n', 'dp0080010': 'n',
           'dp0080011': 'n', 'dp0080012': 'n', 'dp0080013': 'n', 'dp0080014': 'n', 'dp0080015': 'n',
           'dp0080016': 'n', 'dp0080017': 'n', 'dp0080018': 'n', 'dp0080019': 'n', 'dp0080020': 'n',
           'dp0080021': 'y', 'dp0080022': 'n', 'dp0080023': 'n', 'dp0080024': 'n',
           'dp0090001': 'y', 'dp0090002': 'y', 'dp0090003': 'y', 'dp0090004': 'n', 'dp0090005': 'n',
           'dp0090006': 'y',
           'dp0100001': 'y', 'dp0100002': 'y', 'dp0100003': 'y', 'dp0100004': 'n', 'dp0100005': 'n',
           'dp0100006': 'n', 'dp0100007': 'y',
           'dp0110001': 'y', 'dp0110002': 'y', 'dp0110003': 'y', 'dp0110004': 'n', 'dp0110005': 'y',
           'dp0110006': 'n', 'dp0110007': 'n', 'dp0110008': 'y', 'dp0110009': 'n', 'dp0110010': 'y',
           'dp0110011': 'y', 'dp0110012': 'y', 'dp0110013': 'y', 'dp0110014': 'n', 'dp0110015': 'n',
           'dp0110016': 'n', 'dp0110017': 'y',
           'dp0120001': 'y', 'dp0120002': 'y', 'dp0120003': 'y', 'dp0120004': 'y', 'dp0120005': 'y',
           'dp0120006': 'y', 'dp0120007': 'y', 'dp0120008': 'y', 'dp0120009': 'y', 'dp0120010': 'y',
           'dp0120011': 'y', 'dp0120012': 'y', 'dp0120013': 'y', 'dp0120014': 'y', 'dp0120015': 'y',
           'dp0120016': 'y', 'dp0120017': 'y', 'dp0120018': 'n', 'dp0120019': 'n', 'dp0120020': 'n',
           'dp0130001': 'y', 'dp0130002': 'y', 'dp0130003': 'y', 'dp0130004': 'y', 'dp0130005': 'y',
           'dp0130006': 'n', 'dp0130007': 'y', 'dp0130008': 'y', 'dp0130009': 'y', 'dp0130010': 'y',
           'dp0130011': 'y', 'dp0130012': 'y', 'dp0130013': 'y', 'dp0130014': 'y', 'dp0130015': 'y',
           'dp0140001': 'n', 'dp0150001': 'y', 'dp0160001': 'y', 'dp0170001': 'y',
           'dp0180001': 'y', 'dp0180002': 'y', 'dp0180003': 'y', 'dp0180004': 'n', 'dp0180005': 'n',
           'dp0180006': 'y', 'dp0180007': 'y', 'dp0180008': 'y', 'dp0180009': 'y',
           'dp0190001': 'n', 'dp0200001': 'y',
           'dp0210001': 'y', 'dp0210002': 'y', 'dp0210003': 'y',
           'dp0220001': 'y', 'dp0220002': 'y',
           'dp0230001': 'y', 'dp0230002': 'y',
           'shape_area': 'y', 'shape_leng': 'y'}
# In[26]:


num_metrics['approve'] = Series(answers)
len(num_metrics['approve'])


# In[29]:


colors = {'y': 'b', 'n': 'r'}
df = num_metrics.copy()
df["Color"] = df['approve'].apply(lambda x: colors[x])
df
df.plot(x='kurtosis', y='skew', kind='scatter', c=df.Color, figsize=(12, 12), logx=True)


# In[32]:


df.groupby('approve')['kurtosis'].mean()


# ### Collecting columns by kurtosis

# In[33]:


k_max = num_metrics['kurtosis'].max() - 0.05 * abs(num_metrics['kurtosis'].max())  ## make it a little smaller
k_min = 1.05 * num_metrics['kurtosis'].min()  ## make it a little larger

t = num_metrics['kurtosis'].groupby(pd.cut(num_metrics["kurtosis"], np.arange(k_min, k_max, 20)))

print('kurtosis ranges\n-------- ------')
for a in t.grouper.levels[0].values:
    if a in t.indices:
        print(a, ': ', ', '.join(t.get_group(a).index.values), '\n')


# In[185]:


df.plot(subplots=True, bins=7, layout=(9, 5), figsize=(21, 21), kind='hist', logy=True, legend=False);


# In[354]:


column_names = mkUrl(acctName, tableName, limit='LIMIT%200')
print(column_names)
t = pd.read_csv(column_names)

banned = ['the_geom', 'the_geom_webmercator', 'cartodb_id', 'created_at', 'updated_at']
cols = []
for c in t:
    if (c not in banned) and (t[c].dtype != 'object'):
        cols.append(c)

print(','.join(cols))


# Get more in-depth stats using [scipy.stats.describe](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.describe.html)

# In[88]:


## Give back the stats of each column (via scipy.stats)
print('\t'.join(['clmn', 'num', 'minmax', 'mean', 'var', 'skew', 'kurtosis']))
for c in df.select_dtypes(include=[np.number]):   # numeric columns only (skips 'approve' and 'Color')
    n, mm, mu, s, sk, k = desc(df[c].replace([np.inf, -np.inf], np.nan).dropna())
    print(c, '\t', '\t'.join(str(x) for x in [n, mm, mu, s, sk, k]))


# ## Poisson Distribution (with seven bins)
# More [here](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.poisson.html)

# In[57]:


s = np.random.poisson(20, 100000)
count, bins, ignored = plt.hist(s, bins=44, density=True)
plt.show()


# ## Pareto Distribution
# More [here](http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.pareto.html)

# In[60]:


a, m = 3., 1.  # shape and mode
s = np.random.pareto(a, 100000) + m
count, bins, ignored = plt.hist(s, 100, density=True)


# ## Power
# More [here]()

# In[62]:


a = 5.  # shape
samples = 100000
s = np.random.power(a, samples)
count, bins, ignored = plt.hist(s, bins=75)


# ## Normal Dist
# More [here]()

# In[65]:


mu, sigma = 0, 0.1  # mean and standard deviation
s = np.random.normal(mu, sigma, 100000)
count, bins, ignored = plt.hist(np.pi + s, bins=100, density=True)


# ## Zipf/power law

# In[66]:


a = 2.  # parameter
s = np.random.zipf(a, 100000)
count, bins, ignored = plt.hist(s[s < 50], bins=75, density=True)


# ## Classify Data by Distribution
#
# How to identify distributions of data?

# In[ ]:
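# One possible answer, tying the Methods section at the top to branch 3 of the outline: bin a column
# into seven equal-interval bins and classify the resulting histogram into rough Galtung-style shapes
# (A, J, L, U, F, S), then map the shape onto the ramp/quantification choices listed in branch 3.
# This is a minimal sketch under those assumptions -- the peak detection, the flatness tolerance, and
# the `suggest_style` mapping are placeholders, not a settled heuristic.

import numpy as np

def classify_shape(values, bins=7, flat_tol=0.5):
    """Label a 1-D sample as 'A', 'J', 'L', 'U', 'F', or 'S' from its equal-interval histogram."""
    v = np.asarray(values, dtype=float)
    v = v[np.isfinite(v)]
    if v.size == 0:
        return 'F'
    counts, _ = np.histogram(v, bins=bins)
    rel = counts / counts.sum()
    # non-strict local maxima of the binned distribution
    peaks = [i for i in range(bins)
             if (i == 0 or rel[i] >= rel[i - 1]) and (i == bins - 1 or rel[i] >= rel[i + 1])]
    if (rel.max() - rel.min()) / rel.max() < flat_tol:
        return 'F'                      # roughly flat
    if len(peaks) == 1:
        if peaks[0] == 0:
            return 'L'                  # peak at the low end, decaying to the right
        if peaks[0] == bins - 1:
            return 'J'                  # peak at the high end, decaying to the left
        return 'A'                      # single interior peak
    if len(peaks) == 2 and 0 in peaks and (bins - 1) in peaks:
        return 'U'                      # peaks at both ends
    return 'S'                          # multimodal / anything else

def suggest_style(shape):
    """Map a shape label onto the ramp/quantification pairings from branch 3 above."""
    return {
        'L': ('gradual ramp', 'heads/tails'),
        'J': ('gradual ramp', 'heads/tails'),
        'A': ('divergent ramp', 'jenks'),
        'U': ('divergent ramp', 'jenks'),
        'F': ('gradual ramp', 'equal interval or quantile'),
    }.get(shape, ('gradual ramp', 'quantile'))

# e.g. label the census columns pulled earlier
shapes = {c: classify_shape(choro_df[c].dropna()) for c in choro_df}
print({c: (s, suggest_style(s)) for c, s in list(shapes.items())[:10]})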