import pandas as pd from pandas import Series, DataFrame from itertools import islice import datetime from itertools import islice import codecs import re import csv import os if os.getcwd() == '/home/picloud/notebook': ON_PICLOUD = True DATA_DIR = '/home/picloud/working-open-data/data' PYDATA_DIR = '/home/picloud/pydata-book/' else: ON_PICLOUD = False DATA_DIR = os.path.join(os.pardir, "data") PYDATA_DIR = os.path.join(os.pardir, "pydata-book") dataset_fname = os.path.join (DATA_DIR, "census/DataSet.txt") datadict_fname = os.path.join (DATA_DIR, "census/DataDict.txt") fips_fname = os.path.join (DATA_DIR, "census/FIPS_CountyName.txt") assert os.path.exists(DATA_DIR) assert os.path.exists(PYDATA_DIR) assert os.path.exists(dataset_fname) assert os.path.exists(datadict_fname) assert os.path.exists(fips_fname) # read in fips code fips_file = codecs.open(fips_fname, encoding='iso-8859-1') fips = dict() for row in islice(fips_file, None): fips[row[:5]] = row[6:-1] # read in data set ds_file = codecs.open(dataset_fname, encoding='iso-8859-1') reader = csv.DictReader(ds_file) dataset = dict([(row["fips"], row) for row in islice(reader, None)]) states_fips = sorted([k for k in fips.keys() if k[-3:] == '000' and k != '00000']) !head $fips_fname dataset["00000"]["POP010210"] for f in states_fips[:5]: print f, fips[f] list(islice(sorted(dataset.keys()),5)) fips['06000'] # number of counties in CA from collections import Counter print Counter([k[0:2] for k in dataset.keys() if k[2:5] != '000'])['06'] import string alphabet = string.lowercase alphabet print "alphabet:", alphabet print "alphabet[0]:", alphabet[0] print "alphabet[-1], alphabet[0:5], alphabet[-2:]:", alphabet[-1], alphabet[0:5], alphabet[-2:] alphabet[5] alphabet[0:3] alphabet[1:4:2] alphabet[-6:] alphabet[-1:-3:-1] a = array([0,1,2,3]) a + 5 sum(2*a) # round off atomic weight elements = DataFrame([{'number': 1, 'name': 'hydrogen', 'weight':1}, {'number': 2, 'name': 'helium', 'weight':4}, {'number': 3, 'name': 'lithium', 'weight':7}, {'number': 4, 'name': 'beryllium', 'weight':9}, {'number': 5, 'name': 'boron', 'weight':11}, {'number': 6, 'name': 'carbon', 'weight':12}, ], index= ['H', 'He', 'Li', 'Be', 'B', 'C']) # add group information elements['group'] = Series([1, 18, 1, 2, 13, 14], index = ['H', 'He', 'Li', 'Be', 'B', 'C']) elements len(elements.index) elements[elements.number > 4]["weight"].sum() set(elements[elements['group'] == 1].name) elements.sort_index(by='weight')['number'][::-1][:2].sum() comments = Series(['first and most common element', 'the C in organic'], index=['H', 'C']) elements['comments'] = comments elements elements.comments.dropna().count() "".join(elements.comments.dropna().apply(lambda x: x[0]).values) "".join(['a','b']) elements.number.apply(lambda x: 2*x) "".join(elements.number.apply(lambda x: str(2*x)).values) import datetime (datetime.datetime.now() + datetime.timedelta(days=20)).month