To parse text files with fixed columns such as Census DataDict.txt
Reading off the columns
[u'Data_Item', u'Item_Description', u'Unit', u'Decimal', u'US_Total', u'Minimum', u'Maximum', u'Source']
Hint: if you use requests to read the file, you may need to turn verify off for requests.get: http://stackoverflow.com/questions/10667960/python-requests-throwing-up-sslerror
from itertools import islice
import requests
import StringIO
import os
cafile = os.path.join(os.pardir, "data/cacert.pem")
datadict_url = "https://raw.github.com/rdhyee/working-open-data/5ef3932b4ff7cadf1f06ca01eb852ad71361894a/data/census/DataDict.txt"
r = requests.get(datadict_url, verify=cafile)
f = StringIO.StringIO(r.content.decode("iso-8859-1"))
r.text
u"Data_Item Item_Description UnitDecimal US_Total Minimum Maximum Source\nSTATECOU FIPS State and County code\nPST045212 Resident total population estimate (July 1) 2012 ABS 0 313914040 576412 313914040 CENSUS\nPST045211 Resident total population estimate (July 1) 2011 ABS 0 311587816 90 311587816 CENSUS\nPST040210 Resident total population, estimates base (April 1) 2010 ABS 0 308747508 82 308747508 CENSUS\nPST120212 Resident total population, percent change - April 1, 2010 to July 1, 2012 PCT 1 1.7 -0.2 5.1 CENSUS\nPST120211 Resident total population, percent change - April 1, 2010 to July 1, 2011 PCT 1 0.9 -18.1 14.6 CENSUS\nPOP010210 Resident population (April 1 - complete count) 2010 ABS 0 308745538 82 308745538 CENSUS\nAGE135211 Resident population under 5 years, percent, 2011 PCT 1 6.5 0.0 13.3 CENSUS\nAGE295211 Resident population under 18 years, percent, 2011 PCT 1 23.7 0.0 41.0 CENSUS\nAGE775211 Resident population 65 years and over, percent, 2011 PCT 1 13.3 3.7 45.5 CENSUS\nSEX255211 Resident population: total females, percent, 2011 PCT 1 50.8 28.7 56.8 CENSUS\nRHI125211 Resident population: White alone, percent, 2011 PCT 1 78.1 3.9 99.7 CENSUS\nRHI225211 Resident population: Black alone, percent, 2011 PCT 1 13.1 0.0 84.7 CENSUS\nRHI325211 Resident population: American Indian and Alaska Native alone, percent, 2011 PCT 1 1.2 0.0 93.3 CENSUS\nRHI425211 Resident population: Asian alone, percent, 2011 PCT 1 5.0 0.0 43.6 CENSUS\nRHI525211 Resident population: Native Hawaiian and Other Pacific Islander alone, percent, 2011 PCT 1 0.2 0.0 48.9 CENSUS\nRHI625211 Resident population: Two or more races, percent, 2011 PCT 1 2.3 0.0 29.2 CENSUS\nRHI725211 Resident population: Hispanic or Latino Origin, percent, 2011 PCT 1 16.7 0.0 95.6 CENSUS\nRHI825211 Resident population: Not Hispanic, White alone, percent, 2011 PCT 1 63.4 3.2 98.9 CENSUS\nPOP715211 Population 1 year and over by residence - same house, one year ago, percent, 2007-2011 PCT 1 84.6 49.0 100.0 CENSUS\nPOP645211 Place of birth, foreign born, percent, 2007-2011 PCT 1 12.8 0.0 63.4 CENSUS\nPOP815211 Population 5 years and over, percent speaking language other than English at home, 2007-2011 PCT 1 20.3 0.0 95.9 CENSUS\nEDU635211 Educational attainment - persons 25 years and over - percent high school graduate or higher, 2007-2011 PCT 1 85.4 46.3 98.6 CENSUS\nEDU685211 Educational attainment - persons 25 years and over - percent bachelor's degree or higher, 2007-2011 PCT 1 28.2 4.2 72.0 CENSUS\nVET605211 Veterans - total, 2007-2011 ABS 0 22215303 0 22215303 CENSUS\nLFE305211 Average travel time to work for workers 16 years and over not working at home, 2007-2011 MIN 1 25.4 4.3 42.5 CENSUS\nHSG010211 Housing unit estimates, 2011 ABS 0 132312404 48 132312404 CENSUS\nHSG445211 Owner-occupied housing units - percent of total occupied housing units, 2007-2011 PCT 1 66.1 0.0 93.7 CENSUS\nHSG096211 Housing units by units in structure - multi-dwelling structure, percent, 2007-2011 PCT 1 25.9 0.0 98.4 CENSUS\nHSG495211 Median value of specified owner-occupied housing units, 2007-2011 DOL 0 186200 0 993900 CENSUS\nHSD410211 Households, 2007-2011 ABS 0 114761359 27 114761359 CENSUS\nHSD310211 Average household size, 2007-2011 AVG 2 2.60 1.20 4.77 CENSUS\nINC910211 Per capita income in the past 12 months (in 2011 inflation-adjusted dollars), 2007-2011 DOL 0 27915 7887 61290 CENSUS\nINC110211 Median household income, 2007-2011 DOL 0 52762 19344 120332 CENSUS\nPVY020211 People of all ages in poverty, percent, 2007-2011 PCT 1 14.3 0.0 53.5 CENSUS\nBZA010210 Private nonfarm establishments, 2010 ABS 0 7396628 0 7396628 CENSUS\nBZA110210 Private nonfarm employment for pay period including March 12, 2010 ABS 0 111970095 0 111970095 CENSUS\nBZA115210 Private nonfarm employment for pay period including March 12, 2010, percent change, 2000-2010 PCT 1 -1.8 -82.7 385.8 CENSUS\nNES010210 Nonemployer: total (NAICS 00) - establishments, 2010 ABS 0 22110628 17 22110628 CENSUS\nSBO001207 Total number of firms, 2007 ABS 0 27092908 27 27092908 CENSUS\nSBO315207 Total Black-owned firms, percent, 2007 PCT 1 7.1 0.2 66.7 CENSUS\nSBO115207 Total American Indian- and Alaska Native-owned firms, percent, 2007 PCT 1 0.9 0.2 71.8 CENSUS\nSBO215207 Total Asian-owned firms, percent, 2007 PCT 1 5.7 0.3 56.6 CENSUS\nSBO515207 Total Native Hawaiian- and Other Pacific Islander-owned firms, percent, 2007 PCT 1 0.1 0.0 10.5 CENSUS\nSBO415207 Total Hispanic-owned firms, percent, 2007 PCT 1 8.3 0.3 78.0 CENSUS\nSBO015207 Total Women-owned firms, percent, 2007 PCT 1 28.8 6.5 56.2 CENSUS\nMAN450207 Manufacturing: total (NAICS 31-33) - value of shipments, 2007 TH$ 0 5338306501 56903 5338306501 CENSUS\nWTN220207 Wholesale trade: merchant wholesalers (NAICS 42) - sales of establishments with payroll, 2007 TH$ 0 4174286516 0 4174286516 CENSUS\nRTN130207 Retail trade: total (NAICS 44-45) - sales of establishments with payroll, 2007 TH$ 0 3917663456 0 3917663456 CENSUS\nRTN131207 Retail trade: total (NAICS 44-45) - sales of establishments with payroll per capita, 2007 DOL 0 12990 0 80800 CENSUS\nAFN120207 Accommodation and Food Services: total (NAICS 72) - sales of establishments with payroll, 2007 TH$ 0 613795732 0 613795732 CENSUS\nBPS030211 New private housing units authorized by building permits - total, 2011 (20,000-place universe) ABS 0 624061 0 624061 CENSUS\nLND110210 Land area in square miles, 2010 SQM 2 3531905.43 2 3531905.43 CENSUS\nPOP060210 Population per square mile, 2010 RTE 1 87.4 0.0 69467.5 CENSUS\n"
import os
import codecs
from itertools import islice, izip
from pandas import DataFrame
datadict_path = os.path.join(os.pardir, "data/census/DataDict.txt")
# f = islice(codecs.open(datadict_path, mode="rU", encoding="iso-8859-1"), None)
cafile = os.path.join(os.pardir, "data/cacert.pem")
datadict_url = "https://raw.github.com/rdhyee/working-open-data/5ef3932b4ff7cadf1f06ca01eb852ad71361894a/data/census/DataDict.txt"
r = requests.get(datadict_url, verify=cafile)
f = StringIO.StringIO(r.content.decode("iso-8859-1"))
header_row = f.next()
header_row.split()
headers = [u'Data_Item',
u'Item_Description',
u'Unit',
u'Decimal',
u'US_Total',
u'Minimum',
u'Maximum',
u'Source']
column_boundaries = [(0, 8),
(10, 111),
(115, 117),
(122, 122),
(129, 137),
(144, 149),
(153, 161),
(164, 169)
]
header_to_columns = dict(izip(headers, column_boundaries))
# skip 2nd row -- exceptional
rows = islice(f, 1, None)
parsed_rows = []
for row in rows:
row_dict = {}
for (header, bound) in header_to_columns.iteritems():
row_dict[header] = row[bound[0]:bound[1]+1]
parsed_rows.append(row_dict)
data_dict_df = DataFrame(parsed_rows)
# TEST
assert set(data_dict_df.columns) == set([u'Decimal',
u'Maximum',
u'Source',
u'Minimum',
u'Unit',
u'US_Total',
u'Data_Item',
u'Item_Description'])
assert set(data_dict_df["Data_Item"]) == set([u'RHI125211',
u'SBO415207', u'VET605211', u'RHI225211', u'PVY020211', u'HSD310211',
u'POP645211', u'EDU635211', u'EDU685211', u'RHI625211', u'SBO215207',
u'PST045212', u'SBO015207', u'POP715211', u'PST120211', u'PST120212',
u'POP010210', u'PST045211', u'SBO315207', u'POP060210', u'RHI425211',
u'POP815211', u'HSD410211', u'HSG495211', u'BZA010210', u'LFE305211',
u'BZA110210', u'AGE775211', u'HSG096211', u'RHI525211', u'LND110210',
u'PST040210', u'RHI825211', u'BZA115210', u'NES010210', u'MAN450207',
u'AGE135211', u'RTN131207', u'RHI725211', u'BPS030211', u'INC110211',
u'AGE295211', u'SBO115207', u'INC910211', u'RHI325211', u'WTN220207',
u'HSG445211', u'SBO515207', u'AFN120207', u'RTN130207', u'HSG010211',
u'SEX255211', u'SBO001207'])
k = np.array(list('hi there! '), np.dtype('U1'))
k.dtype
dtype('<U1')
len(k)
10
k[0]
u'h'
k[1]
u'i'
np.char.isspace(k)
array([False, False, True, False, False, False, False, False, False, True], dtype=bool)
r1 = "12 123 456"
r2 = "23 455 xx"
a1= np.frombuffer(r1, dtype='S1')
a2 = np.frombuffer(r1, dtype='S1')
np.char.isspace(a1) & np.char.isspace(a2)
array([False, False, True, False, False, False, True, False, False, False], dtype=bool)
u"1".encode('utf-8')
'1'
rows = ["12 123 456",
"23 455 xx",
" 4 789 333"]
m = np.char.isspace(np.vstack((np.frombuffer(row, dtype='S1') for row in rows)))
m
array([[False, False, True, False, False, False, True, False, False, False], [False, False, True, False, False, False, True, True, False, False], [ True, False, True, False, False, False, True, False, False, False]], dtype=bool)
m.shape
(3, 10)
import pandas as pd
cols_isspace = pd.Series([np.all(col) for col in m.T])
cols_isspace[cols_isspace].index
Int64Index([2, 6], dtype=int64)
m
array([[False, False, True, False, False, False, True, False, False, False], [False, False, True, False, False, False, True, True, False, False], [ True, False, True, False, False, False, True, False, False, False]], dtype=bool)
np.where(np.all(m, 0))
(array([2, 6]),)
from itertools import islice, izip, groupby
import re
import os
import codecs
import operator
DATA_DIR = os.path.join(os.pardir, "data")
f = codecs.open(os.path.join(DATA_DIR, "census/DataDict.txt"), encoding="iso-8859-1")
f_sliced = islice(f, None)
head_row = f_sliced.next()
headers = head_row.split()
print headers
# Actually, Unit should be broken off from Decimal
# skip the second row also
header2 = f_sliced.next()
# read in all the rows
rows = list([r[:-1] for r in islice(f_sliced,None)])
# What's the max length of rows?
max_len = max([len(row) for row in rows])
print max_len
# loop through all rows, looking for which columns have spaces exclusively
cols_with_space = set(range(max_len))
for row in rows:
cols_with_space_in_row = set([m.start() for m in re.finditer(' ', row)])
cols_with_space.intersection_update(cols_with_space_in_row)
cols_with_data = set(range(max_len)) - cols_with_space
# print sorted(cols_with_data)
# http://code.activestate.com/recipes/496682-make-ranges-of-contiguous-numbers-from-a-list-of-i/#c2
ranges = [map(operator.itemgetter(1), g) for k, g in groupby(enumerate(sorted(cols_with_data)), lambda (i,x):i-x) ]
print [(r[0], r[-1]+1) for r in ranges]
for row in rows:
print [row[r[0]:r[-1]+1].strip() for r in ranges]
[u'Data_Item', u'Item_Description', u'UnitDecimal', u'US_Total', u'Minimum', u'Maximum', u'Source'] 170 [(0, 9), (10, 112), (115, 118), (122, 123), (128, 138), (144, 150), (152, 162), (164, 170)] [u'PST045212', u'Resident total population estimate (July 1) 2012', u'ABS', u'0', u'313914040', u'576412', u'313914040', u'CENSUS'] [u'PST045211', u'Resident total population estimate (July 1) 2011', u'ABS', u'0', u'311587816', u'90', u'311587816', u'CENSUS'] [u'PST040210', u'Resident total population, estimates base (April 1) 2010', u'ABS', u'0', u'308747508', u'82', u'308747508', u'CENSUS'] [u'PST120212', u'Resident total population, percent change - April 1, 2010 to July 1, 2012', u'PCT', u'1', u'1.7', u'-0.2', u'5.1', u'CENSUS'] [u'PST120211', u'Resident total population, percent change - April 1, 2010 to July 1, 2011', u'PCT', u'1', u'0.9', u'-18.1', u'14.6', u'CENSUS'] [u'POP010210', u'Resident population (April 1 - complete count) 2010', u'ABS', u'0', u'308745538', u'82', u'308745538', u'CENSUS'] [u'AGE135211', u'Resident population under 5 years, percent, 2011', u'PCT', u'1', u'6.5', u'0.0', u'13.3', u'CENSUS'] [u'AGE295211', u'Resident population under 18 years, percent, 2011', u'PCT', u'1', u'23.7', u'0.0', u'41.0', u'CENSUS'] [u'AGE775211', u'Resident population 65 years and over, percent, 2011', u'PCT', u'1', u'13.3', u'3.7', u'45.5', u'CENSUS'] [u'SEX255211', u'Resident population: total females, percent, 2011', u'PCT', u'1', u'50.8', u'28.7', u'56.8', u'CENSUS'] [u'RHI125211', u'Resident population: White alone, percent, 2011', u'PCT', u'1', u'78.1', u'3.9', u'99.7', u'CENSUS'] [u'RHI225211', u'Resident population: Black alone, percent, 2011', u'PCT', u'1', u'13.1', u'0.0', u'84.7', u'CENSUS'] [u'RHI325211', u'Resident population: American Indian and Alaska Native alone, percent, 2011', u'PCT', u'1', u'1.2', u'0.0', u'93.3', u'CENSUS'] [u'RHI425211', u'Resident population: Asian alone, percent, 2011', u'PCT', u'1', u'5.0', u'0.0', u'43.6', u'CENSUS'] [u'RHI525211', u'Resident population: Native Hawaiian and Other Pacific Islander alone, percent, 2011', u'PCT', u'1', u'0.2', u'0.0', u'48.9', u'CENSUS'] [u'RHI625211', u'Resident population: Two or more races, percent, 2011', u'PCT', u'1', u'2.3', u'0.0', u'29.2', u'CENSUS'] [u'RHI725211', u'Resident population: Hispanic or Latino Origin, percent, 2011', u'PCT', u'1', u'16.7', u'0.0', u'95.6', u'CENSUS'] [u'RHI825211', u'Resident population: Not Hispanic, White alone, percent, 2011', u'PCT', u'1', u'63.4', u'3.2', u'98.9', u'CENSUS'] [u'POP715211', u'Population 1 year and over by residence - same house, one year ago, percent, 2007-2011', u'PCT', u'1', u'84.6', u'49.0', u'100.0', u'CENSUS'] [u'POP645211', u'Place of birth, foreign born, percent, 2007-2011', u'PCT', u'1', u'12.8', u'0.0', u'63.4', u'CENSUS'] [u'POP815211', u'Population 5 years and over, percent speaking language other than English at home, 2007-2011', u'PCT', u'1', u'20.3', u'0.0', u'95.9', u'CENSUS'] [u'EDU635211', u'Educational attainment - persons 25 years and over - percent high school graduate or higher, 2007-2011', u'PCT', u'1', u'85.4', u'46.3', u'98.6', u'CENSUS'] [u'EDU685211', u"Educational attainment - persons 25 years and over - percent bachelor's degree or higher, 2007-2011", u'PCT', u'1', u'28.2', u'4.2', u'72.0', u'CENSUS'] [u'VET605211', u'Veterans - total, 2007-2011', u'ABS', u'0', u'22215303', u'0', u'22215303', u'CENSUS'] [u'LFE305211', u'Average travel time to work for workers 16 years and over not working at home, 2007-2011', u'MIN', u'1', u'25.4', u'4.3', u'42.5', u'CENSUS'] [u'HSG010211', u'Housing unit estimates, 2011', u'ABS', u'0', u'132312404', u'48', u'132312404', u'CENSUS'] [u'HSG445211', u'Owner-occupied housing units - percent of total occupied housing units, 2007-2011', u'PCT', u'1', u'66.1', u'0.0', u'93.7', u'CENSUS'] [u'HSG096211', u'Housing units by units in structure - multi-dwelling structure, percent, 2007-2011', u'PCT', u'1', u'25.9', u'0.0', u'98.4', u'CENSUS'] [u'HSG495211', u'Median value of specified owner-occupied housing units, 2007-2011', u'DOL', u'0', u'186200', u'0', u'993900', u'CENSUS'] [u'HSD410211', u'Households, 2007-2011', u'ABS', u'0', u'114761359', u'27', u'114761359', u'CENSUS'] [u'HSD310211', u'Average household size, 2007-2011', u'AVG', u'2', u'2.60', u'1.20', u'4.77', u'CENSUS'] [u'INC910211', u'Per capita income in the past 12 months (in 2011 inflation-adjusted dollars), 2007-2011', u'DOL', u'0', u'27915', u'7887', u'61290', u'CENSUS'] [u'INC110211', u'Median household income, 2007-2011', u'DOL', u'0', u'52762', u'19344', u'120332', u'CENSUS'] [u'PVY020211', u'People of all ages in poverty, percent, 2007-2011', u'PCT', u'1', u'14.3', u'0.0', u'53.5', u'CENSUS'] [u'BZA010210', u'Private nonfarm establishments, 2010', u'ABS', u'0', u'7396628', u'0', u'7396628', u'CENSUS'] [u'BZA110210', u'Private nonfarm employment for pay period including March 12, 2010', u'ABS', u'0', u'111970095', u'0', u'111970095', u'CENSUS'] [u'BZA115210', u'Private nonfarm employment for pay period including March 12, 2010, percent change, 2000-2010', u'PCT', u'1', u'-1.8', u'-82.7', u'385.8', u'CENSUS'] [u'NES010210', u'Nonemployer: total (NAICS 00) - establishments, 2010', u'ABS', u'0', u'22110628', u'17', u'22110628', u'CENSUS'] [u'SBO001207', u'Total number of firms, 2007', u'ABS', u'0', u'27092908', u'27', u'27092908', u'CENSUS'] [u'SBO315207', u'Total Black-owned firms, percent, 2007', u'PCT', u'1', u'7.1', u'0.2', u'66.7', u'CENSUS'] [u'SBO115207', u'Total American Indian- and Alaska Native-owned firms, percent, 2007', u'PCT', u'1', u'0.9', u'0.2', u'71.8', u'CENSUS'] [u'SBO215207', u'Total Asian-owned firms, percent, 2007', u'PCT', u'1', u'5.7', u'0.3', u'56.6', u'CENSUS'] [u'SBO515207', u'Total Native Hawaiian- and Other Pacific Islander-owned firms, percent, 2007', u'PCT', u'1', u'0.1', u'0.0', u'10.5', u'CENSUS'] [u'SBO415207', u'Total Hispanic-owned firms, percent, 2007', u'PCT', u'1', u'8.3', u'0.3', u'78.0', u'CENSUS'] [u'SBO015207', u'Total Women-owned firms, percent, 2007', u'PCT', u'1', u'28.8', u'6.5', u'56.2', u'CENSUS'] [u'MAN450207', u'Manufacturing: total (NAICS 31-33) - value of shipments, 2007', u'TH$', u'0', u'5338306501', u'56903', u'5338306501', u'CENSUS'] [u'WTN220207', u'Wholesale trade: merchant wholesalers (NAICS 42) - sales of establishments with payroll, 2007', u'TH$', u'0', u'4174286516', u'0', u'4174286516', u'CENSUS'] [u'RTN130207', u'Retail trade: total (NAICS 44-45) - sales of establishments with payroll, 2007', u'TH$', u'0', u'3917663456', u'0', u'3917663456', u'CENSUS'] [u'RTN131207', u'Retail trade: total (NAICS 44-45) - sales of establishments with payroll per capita, 2007', u'DOL', u'0', u'12990', u'0', u'80800', u'CENSUS'] [u'AFN120207', u'Accommodation and Food Services: total (NAICS 72) - sales of establishments with payroll, 2007', u'TH$', u'0', u'613795732', u'0', u'613795732', u'CENSUS'] [u'BPS030211', u'New private housing units authorized by building permits - total, 2011 (20,000-place universe)', u'ABS', u'0', u'624061', u'0', u'624061', u'CENSUS'] [u'LND110210', u'Land area in square miles, 2010', u'SQM', u'2', u'3531905.43', u'2', u'3531905.43', u'CENSUS'] [u'POP060210', u'Population per square mile, 2010', u'RTE', u'1', u'87.4', u'0.0', u'69467.5', u'CENSUS']
how to cast into standard size string?
s = "hello there folks"
print [i for (i, k) in enumerate(list(s)) if k == ' ']
# http://stackoverflow.com/a/4664889/7782
import re
print [m.start() for m in re.finditer(' ', s)]
[5, 11] [5, 11]
import sets
a = set(range(10))
a.intersection_update([2,3])
a
set([2, 3])
# http://code.activestate.com/recipes/496682-make-ranges-of-contiguous-numbers-from-a-list-of-i/#c2
from itertools import groupby
import operator
data = [ 1, 4,5,6, 10, 15,16,17,18, 22, 25,26,27,28]
for k, g in groupby(enumerate(data), lambda (i,x):i-x):
print map(operator.itemgetter(1), g)
[1] [4, 5, 6] [10] [15, 16, 17, 18] [22] [25, 26, 27, 28]
# masking
# http://docs.scipy.org/doc/numpy/reference/maskedarray.generic.html
import numpy as np
import numpy.ma as ma
x = np.array([1, 2, 3, -1, 5])
mx = ma.masked_array(x, mask=[0, 0, 0, 1, 0])
mx
masked_array(data = [1 2 3 -- 5], mask = [False False False True False], fill_value = 999999)
mx.count()
4
# np.ma.clump_masked
[mx[s].data for s in np.ma.clump_unmasked(mx)]
[array([1, 2, 3]), array([5])]
# http://stackoverflow.com/a/14606271/7782
import numpy as np
nan = np.nan
def using_clump(a):
return [a[s] for s in np.ma.clump_unmasked(np.ma.masked_invalid(a))]
x = [nan,nan, 1 , 2 , 3 , nan, nan, 10, 11 , nan, nan, nan, 23, 1, nan, 7, 8]
using_clump(x)
[[1, 2, 3], [10, 11], [23, 1], [7, 8]]
from itertools import islice, izip, groupby
import re
import os
import codecs
import operator
import pandas as pd
DATA_DIR = os.path.join(os.pardir, "data")
f = codecs.open(os.path.join(DATA_DIR, "census/DataDict.txt"), encoding="iso-8859-1")
f_sliced = islice(f, None)
head_row = f_sliced.next()
headers = head_row.split()
print headers
# Actually, Unit should be broken off from Decimal
# skip the second row also
header2 = f_sliced.next()
# read in all the rows
rows = list([r[:-1] for r in islice(f_sliced,None)])
# What's the max length of rows?
max_len = max([len(row) for row in rows])
print max_len, len(rows)
rows_array = np.vstack((np.array(list(row), dtype='S1') for row in rows))
m = np.char.isspace(rows_array)
print m.shape
mask = np.all(m,0)
#mask = np.where(np.all(m,0))
mask
# mask == np.array([np.all(col) for col in m.T])
df = pd.DataFrame([["".join(list(rows_array_row[s])).strip() for s in np.ma.clump_unmasked(np.ma.array(rows_array_row, mask =mask))] for rows_array_row in rows_array],
columns = [u'Data_Item', u'Item_Description', u'Unit', u'Decimal', u'US_Total', u'Minimum', u'Maximum', u'Source'])
df
[u'Data_Item', u'Item_Description', u'UnitDecimal', u'US_Total', u'Minimum', u'Maximum', u'Source'] 170 53 (53, 170)
Data_Item | Item_Description | Unit | Decimal | US_Total | Minimum | Maximum | Source | |
---|---|---|---|---|---|---|---|---|
0 | PST045212 | Resident total population estimate (July 1) 2012 | ABS | 0 | 313914040 | 576412 | 313914040 | CENSUS |
1 | PST045211 | Resident total population estimate (July 1) 2011 | ABS | 0 | 311587816 | 90 | 311587816 | CENSUS |
2 | PST040210 | Resident total population, estimates base (Apr... | ABS | 0 | 308747508 | 82 | 308747508 | CENSUS |
3 | PST120212 | Resident total population, percent change - Ap... | PCT | 1 | 1.7 | -0.2 | 5.1 | CENSUS |
4 | PST120211 | Resident total population, percent change - Ap... | PCT | 1 | 0.9 | -18.1 | 14.6 | CENSUS |
5 | POP010210 | Resident population (April 1 - complete count)... | ABS | 0 | 308745538 | 82 | 308745538 | CENSUS |
6 | AGE135211 | Resident population under 5 years, percent, 2011 | PCT | 1 | 6.5 | 0.0 | 13.3 | CENSUS |
7 | AGE295211 | Resident population under 18 years, percent, 2011 | PCT | 1 | 23.7 | 0.0 | 41.0 | CENSUS |
8 | AGE775211 | Resident population 65 years and over, percent... | PCT | 1 | 13.3 | 3.7 | 45.5 | CENSUS |
9 | SEX255211 | Resident population: total females, percent, 2011 | PCT | 1 | 50.8 | 28.7 | 56.8 | CENSUS |
10 | RHI125211 | Resident population: White alone, percent, 2011 | PCT | 1 | 78.1 | 3.9 | 99.7 | CENSUS |
11 | RHI225211 | Resident population: Black alone, percent, 2011 | PCT | 1 | 13.1 | 0.0 | 84.7 | CENSUS |
12 | RHI325211 | Resident population: American Indian and Alask... | PCT | 1 | 1.2 | 0.0 | 93.3 | CENSUS |
13 | RHI425211 | Resident population: Asian alone, percent, 2011 | PCT | 1 | 5.0 | 0.0 | 43.6 | CENSUS |
14 | RHI525211 | Resident population: Native Hawaiian and Other... | PCT | 1 | 0.2 | 0.0 | 48.9 | CENSUS |
15 | RHI625211 | Resident population: Two or more races, percen... | PCT | 1 | 2.3 | 0.0 | 29.2 | CENSUS |
16 | RHI725211 | Resident population: Hispanic or Latino Origin... | PCT | 1 | 16.7 | 0.0 | 95.6 | CENSUS |
17 | RHI825211 | Resident population: Not Hispanic, White alone... | PCT | 1 | 63.4 | 3.2 | 98.9 | CENSUS |
18 | POP715211 | Population 1 year and over by residence - same... | PCT | 1 | 84.6 | 49.0 | 100.0 | CENSUS |
19 | POP645211 | Place of birth, foreign born, percent, 2007-2011 | PCT | 1 | 12.8 | 0.0 | 63.4 | CENSUS |
20 | POP815211 | Population 5 years and over, percent speaking ... | PCT | 1 | 20.3 | 0.0 | 95.9 | CENSUS |
21 | EDU635211 | Educational attainment - persons 25 years and ... | PCT | 1 | 85.4 | 46.3 | 98.6 | CENSUS |
22 | EDU685211 | Educational attainment - persons 25 years and ... | PCT | 1 | 28.2 | 4.2 | 72.0 | CENSUS |
23 | VET605211 | Veterans - total, 2007-2011 | ABS | 0 | 22215303 | 0 | 22215303 | CENSUS |
24 | LFE305211 | Average travel time to work for workers 16 yea... | MIN | 1 | 25.4 | 4.3 | 42.5 | CENSUS |
25 | HSG010211 | Housing unit estimates, 2011 | ABS | 0 | 132312404 | 48 | 132312404 | CENSUS |
26 | HSG445211 | Owner-occupied housing units - percent of tota... | PCT | 1 | 66.1 | 0.0 | 93.7 | CENSUS |
27 | HSG096211 | Housing units by units in structure - multi-dw... | PCT | 1 | 25.9 | 0.0 | 98.4 | CENSUS |
28 | HSG495211 | Median value of specified owner-occupied housi... | DOL | 0 | 186200 | 0 | 993900 | CENSUS |
29 | HSD410211 | Households, 2007-2011 | ABS | 0 | 114761359 | 27 | 114761359 | CENSUS |
30 | HSD310211 | Average household size, 2007-2011 | AVG | 2 | 2.60 | 1.20 | 4.77 | CENSUS |
31 | INC910211 | Per capita income in the past 12 months (in 20... | DOL | 0 | 27915 | 7887 | 61290 | CENSUS |
32 | INC110211 | Median household income, 2007-2011 | DOL | 0 | 52762 | 19344 | 120332 | CENSUS |
33 | PVY020211 | People of all ages in poverty, percent, 2007-2011 | PCT | 1 | 14.3 | 0.0 | 53.5 | CENSUS |
34 | BZA010210 | Private nonfarm establishments, 2010 | ABS | 0 | 7396628 | 0 | 7396628 | CENSUS |
35 | BZA110210 | Private nonfarm employment for pay period incl... | ABS | 0 | 111970095 | 0 | 111970095 | CENSUS |
36 | BZA115210 | Private nonfarm employment for pay period incl... | PCT | 1 | -1.8 | -82.7 | 385.8 | CENSUS |
37 | NES010210 | Nonemployer: total (NAICS 00) - establishments... | ABS | 0 | 22110628 | 17 | 22110628 | CENSUS |
38 | SBO001207 | Total number of firms, 2007 | ABS | 0 | 27092908 | 27 | 27092908 | CENSUS |
39 | SBO315207 | Total Black-owned firms, percent, 2007 | PCT | 1 | 7.1 | 0.2 | 66.7 | CENSUS |
40 | SBO115207 | Total American Indian- and Alaska Native-owned... | PCT | 1 | 0.9 | 0.2 | 71.8 | CENSUS |
41 | SBO215207 | Total Asian-owned firms, percent, 2007 | PCT | 1 | 5.7 | 0.3 | 56.6 | CENSUS |
42 | SBO515207 | Total Native Hawaiian- and Other Pacific Islan... | PCT | 1 | 0.1 | 0.0 | 10.5 | CENSUS |
43 | SBO415207 | Total Hispanic-owned firms, percent, 2007 | PCT | 1 | 8.3 | 0.3 | 78.0 | CENSUS |
44 | SBO015207 | Total Women-owned firms, percent, 2007 | PCT | 1 | 28.8 | 6.5 | 56.2 | CENSUS |
45 | MAN450207 | Manufacturing: total (NAICS 31-33) - value of ... | TH$ | 0 | 5338306501 | 56903 | 5338306501 | CENSUS |
46 | WTN220207 | Wholesale trade: merchant wholesalers (NAICS 4... | TH$ | 0 | 4174286516 | 0 | 4174286516 | CENSUS |
47 | RTN130207 | Retail trade: total (NAICS 44-45) - sales of e... | TH$ | 0 | 3917663456 | 0 | 3917663456 | CENSUS |
48 | RTN131207 | Retail trade: total (NAICS 44-45) - sales of e... | DOL | 0 | 12990 | 0 | 80800 | CENSUS |
49 | AFN120207 | Accommodation and Food Services: total (NAICS ... | TH$ | 0 | 613795732 | 0 | 613795732 | CENSUS |
50 | BPS030211 | New private housing units authorized by buildi... | ABS | 0 | 624061 | 0 | 624061 | CENSUS |
51 | LND110210 | Land area in square miles, 2010 | SQM | 2 | 3531905.43 | 2 | 3531905.43 | CENSUS |
52 | POP060210 | Population per square mile, 2010 | RTE | 1 | 87.4 | 0.0 | 69467.5 | CENSUS |
# compare rows / m
from itertools import izip
#for (i, (r0, m0)) in enumerate(izip(rows, m)):
# print r0, m0
# # print i, np.all([c == ' ' for c in r0] == m0)
rnum = 1
all([c == ' ' for c in rows[rnum]] == m[rnum])
[c == ' ' for c in rows[rnum]] == m[rnum]
array([ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], dtype=bool)
rows_array[0]
array(['P', 'S', 'T', '0', '4', '5', '2', '1', '2', ' ', 'R', 'e', 's', 'i', 'd', 'e', 'n', 't', ' ', 't', 'o', 't', 'a', 'l', ' ', 'p', 'o', 'p', 'u', 'l', 'a', 't', 'i', 'o', 'n', ' ', 'e', 's', 't', 'i', 'm', 'a', 't', 'e', ' ', '(', 'J', 'u', 'l', 'y', ' ', '1', ')', ' ', '2', '0', '1', '2', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'A', 'B', 'S', ' ', ' ', ' ', ' ', '0', ' ', ' ', ' ', ' ', ' ', ' ', '3', '1', '3', '9', '1', '4', '0', '4', '0', ' ', ' ', ' ', ' ', ' ', ' ', '5', '7', '6', '4', '1', '2', ' ', ' ', ' ', '3', '1', '3', '9', '1', '4', '0', '4', '0', ' ', ' ', 'C', 'E', 'N', 'S', 'U', 'S'], dtype='|S1')
np.ma.clump_unmasked
<function numpy.ma.extras.clump_unmasked>
["".join(list(rows_array[0][s])).strip() for s in np.ma.clump_unmasked(np.ma.array(rows_array[0], mask =mask))]
['PST045212', 'Resident total population estimate (July 1) 2012', 'ABS', '0', '313914040', '576412', '313914040', 'CENSUS']
0...9 a...i A...I
repeat...
import string
from itertools import islice
upper = string.uppercase + "0"
lower = string.lowercase
def column_marker(start=0, stop=None):
n = start
while stop is None or n < stop:
k = n % 10
if k > 0:
yield unicode(k)
else:
if n % 100 == 0:
yield unicode(upper[(n % 1000) / 100 -1 ])
elif n % 10 == 0 :
yield unicode(lower[(n % 100) / 10 - 1])
n += 1
len(list(column_marker(0,12)))
12
print "".join(column_marker(0,180))
0123456789a123456789b123456789c123456789d123456789e123456789f123456789g123456789h123456789i123456789A123456789a123456789b123456789c123456789d123456789e123456789f123456789g123456789
0123456789a123456789b123456789c123456789d123456789e123456789f123456789g123456789h123456789i123456789A123456789a123456789b123456789c123456789d123456789e123456789f123456789 PST045212 Resident total population estimate (July 1) 2012 ABS 0 313914040 576412 313914040 CENSUS PST045211 Resident total population estimate (July 1) 2011 ABS 0 311587816 90 311587816 CENSUS PST040210 Resident total population, estimates base (April 1) 2010 ABS 0 308747508 82 308747508 CENSUS PST120212 Resident total population, percent change - April 1, 2010 to July 1, 2012 PCT 1 1.7 -0.2 5.1 CENSUS PST120211 Resident total population, percent change - April 1, 2010 to July 1, 2011 PCT 1 0.9 -18.1 14.6 CENSUS POP010210 Resident population (April 1 - complete count) 2010 ABS 0 308745538 82 308745538 CENSUS AGE135211 Resident population under 5 years, percent, 2011 PCT 1 6.5 0.0 13.3 CENSUS A
Reading off the columns
[u'Data_Item', u'Item_Description', u'Unit', u'Decimal', u'US_Total', u'Minimum', u'Maximum', u'Source']