PfDA, Chap 1 Preliminaries, especially the installation instructions for EPD Free for your computer platform. I want you to try installing EPD Free (or EPD Academic) before class on Thursday.
PfDA, Chap 3
PfDA, Appendix: Python Language Essentials -- to help remind yourself of key elements of standard Python
PfDA, Chap 2 Introductory Examples
On Tuesday, I asked you to discuss the population of California. If you do a Google search... you might end up at California QuickFacts from the US Census Bureau. Compare to the QuickFacts about Alameda County.
Today we download the data for the USA, states, and counties:
The entire State and County QuickFacts dataset, with U.S., state, and county data is available for download. Downloadable data files for cities may be issued later. The current downloadable data set may include items not displayed on QuickFacts tables.
Download 3 files (DataSet.txt, DataDict.txt, and FIPS_CountyName.txt) into a directory... perhaps where you launched IPython:
# YOU NEED TO FILL IN data_dir for your own directory path
import os

data_dir = "/Users/raymondyee/D/Document/Working_with_Open_Data/day02/"

# Build the three QuickFacts file paths with os.path.join so they come out
# right whether or not data_dir ends with a path separator.
dataset_fname = os.path.join(data_dir, "DataSet.txt")
datadict_fname = os.path.join(data_dir, "DataDict.txt")
fips_fname = os.path.join(data_dir, "FIPS_CountyName.txt")
# on Mac, Linux system
# The leading "!" is IPython's shell escape, so these lines run Unix commands
# rather than Python; $fips_fname is filled in from the Python variable.
# Show the first few lines of the FIPS file.
!head $fips_fname
# List every line that mentions California, ignoring case (-i).
!grep -i California $fips_fname
# List every line that mentions Alameda (case-sensitive).
!grep Alameda $fips_fname
# You might do something like this....
data_dir = "/Users/raymondyee/D/Document/Working_with_Open_Data/day02/"
fips_fname = data_dir + "FIPS_CountyName.txt"
# PfDA p. 430 for brief explanation of file open
f = open(fips_fname)
for row in f:
print row
data_dir = "/Users/raymondyee/D/Document/Working_with_Open_Data/day02/"
fips_fname = data_dir + "FIPS_CountyName.txt"

from itertools import islice

# PfDA p. 430 for brief explanation of file open
# Read only the first 5 rows: islice stops after 5 items instead of reading
# the whole file, and the with-statement closes the file afterwards.
with open(fips_fname) as f:
    first_rows = list(islice(f, 5))

# Keep the list as the final expression so the notebook displays it.
first_rows
data_dir = "/Users/raymondyee/D/Document/Working_with_Open_Data/day02/"
fips_fname = data_dir + "FIPS_CountyName.txt"
f = open(fips_fname)
for (i, row) in enumerate(f):
try:
a = row.decode('ascii')
except Exception as e:
print i, row, e
http://www.doughellmann.com/PyMOTW/codecs/#working-with-files
encodings: http://docs.python.org/2/library/codecs.html#standard-encodings
'ascii' vs 'utf-8' vs 'iso-8859-1'
import codecs
from itertools import islice

# YOU NEED TO FILL IN data_dir for your own directory path
data_dir = "/Users/raymondyee/D/Document/Working_with_Open_Data/day02/"
fips_fname = data_dir + "FIPS_CountyName.txt"

# Decode the whole file to confirm that every row is readable with the chosen
# encoding.  'iso-8859-1' (Latin-1) matches the other cells that read this
# file; note that 'latin8' is Python's alias for ISO-8859-14, which is a
# different character set.
with codecs.open(fips_fname, encoding='iso-8859-1') as f:
    # islice(f, None) walks every row; put a number in place of None to stop
    # after that many rows.
    for (i, row) in enumerate(islice(f, None)):
        pass
import codecs
from itertools import islice
# YOU NEED TO FILL IN data_dir for your own directory path
data_dir = "/Users/raymondyee/D/Document/Working_with_Open_Data/day02/"
fips_fname = data_dir + "FIPS_CountyName.txt"
f = codecs.open(fips_fname, encoding='iso-8859-1')
fips = dict()
# FILL IN
print fips
# check on CA and Alameda County
print fips['06000'], fips['06001']
# work out hierarchy w/ FIPS
# not CSV -- parse on first space
import codecs
from itertools import islice
import re
import string
# YOU NEED TO FILL IN data_dir for your own directory path
data_dir = ""
fips_fname = data_dir + "FIPS_CountyName.txt"
f = codecs.open(fips_fname, encoding='iso-8859-1')
fips = dict()
# FILL IN
states_fips = []
print states_fips
# to check
# The 51 expected codes are listed in ascending order; the final expression is
# True only when states_fips matches this list exactly.
expected_states_fips = [
    u'01000', u'02000', u'04000', u'05000', u'06000', u'08000',
    u'09000', u'10000', u'11000', u'12000', u'13000', u'15000',
    u'16000', u'17000', u'18000', u'19000', u'20000', u'21000',
    u'22000', u'23000', u'24000', u'25000', u'26000', u'27000',
    u'28000', u'29000', u'30000', u'31000', u'32000', u'33000',
    u'34000', u'35000', u'36000', u'37000', u'38000', u'39000',
    u'40000', u'41000', u'42000', u'44000', u'45000', u'46000',
    u'47000', u'48000', u'49000', u'50000', u'51000', u'53000',
    u'54000', u'55000', u'56000',
]
states_fips == expected_states_fips
# check
# The entry under '06' (CA) should be 58; the final expression is True when
# it is.
ca_county_total = counties_count_by_state['06']
ca_county_total == 58  # CA
# check for CA
print [(k, fips[k]) for k in list(county_fips_for_state('06000')) ]
CA_county_fips = set([u'06099',
u'06057', u'06069', u'06093', u'06095', u'06097', u'06011', u'06013',
u'06015', u'06017', u'06115', u'06019', u'06079', u'06111', u'06047',
u'06113', u'06077', u'06039', u'06073', u'06071', u'06033', u'06031',
u'06037', u'06035', u'06091', u'06051', u'06065', u'06089', u'06087',
u'06085', u'06083', u'06041', u'06081', u'06007', u'06005', u'06075',
u'06003', u'06001', u'06109', u'06107', u'06105', u'06103', u'06009',
u'06101', u'06029', u'06067', u'06061', u'06045', u'06063', u'06021',
u'06059', u'06023', u'06025', u'06027', u'06043', u'06055', u'06053',
u'06049'])
print set(county_fips_for_state('06000')) == CA_county_fips
Suggestion: use csv.DictReader to read the file into the dataset dict
# now load dataset
import codecs
import csv
from itertools import islice
# YOU NEED TO FILL IN data_dir for your own directory path
data_dir = "/Users/raymondyee/D/Document/Working_with_Open_Data/day02/"
dataset_fname = data_dir + "DataSet.txt"
f = codecs.open(dataset_fname, encoding='utf-8')
reader = csv.DictReader(f)
dataset = dict([(row["fips"], row) for row in islice(reader, None)])
# check number of keys and population of US
print len(dataset.keys()) == 3195
print dataset['00000']['POP010210'] == '308745538'
int(dataset['06000']['POP010210']) # total 2010 population of CA