import numpy as np from pandas import* import json path = 'ch02_pandas/usagov_bitly_data2012-03-16-1331923249.txt' path # To open a file in python console located at path defined by 'path' # Readline is a object to check one line of the loaded file terminated by new line '\n' open(path).readline() # Specializing JSON object decoding: # Each line is loaded as a seperate record which can be decoded as indiviual json object records = [json.loads(line) for line in open(path)] # Sanity check for json decode records[0] # Accessing indiviual values: string level parsing time_zones = [rec['tz'] for rec in records if 'tz' in rec] time_zones[0:10] #sequence = time_zones def get_counts(sequence): counts = {} for x in sequence: if x in counts: counts[x] += 1 else: counts[x] = 1 return counts counts = get_counts(time_zones) counts['America/New_York'] len(time_zones) time_zones.sort() time_zones[-10:] # Using dictionary modules for couting elements from collections import defaultdict def top_counts(count_dict, n=10): value_key_pairs = [(count, tz) for tz, count in count_dict.items()] value_key_pairs.sort() return value_key_pairs[-n:] top_counts(counts) value_key_pairs = [(count, ts) for ts, count in counts.items()] value_key_pairs.sort() value_key_pairs[-10:] # Python data analytics library import pandas as pd from pandas import DataFrame, Series #help(pandas.DataFrame) # Inserting all records stored in form of lists in to 'pandas DataFrame' frame = DataFrame(records) frame # Output shown for the 'frame' is the summary view used for large datasets # Series object is returned by frame['tz'] frame['tz'][:10] # Series object is returned by frame['tz'] has a method "value_counts" that gives us counts for this particular object tz_counts = frame['tz'].value_counts() tz_counts[:10] # Little Data munging to fill in a substitute value for unknown and missing time zone data in the records. # The fillna function can replace missing (NA) values and unknown(empty strings) values can be replaced by boolean array indexing clean_tz = frame['tz'].fillna('Missing') clean_tz[clean_tz == ''] = 'Unknown' tz_counts = clean_tz.value_counts() tz_counts[:50] # PLot a bar graph after data munging, for top 10 values and rotate by 90 deg. #import pylab as pl import matplotlib.pyplot as plt tz_counts[:10].plot(kind='barh', rot=0) plt.show() tz_counts.plot(kind='barh')