import numpy as np
from pandas import*
import json
# Path of the input log file; it is read line by line below.
path = 'ch02_pandas/usagov_bitly_data2012-03-16-1331923249.txt'
path
# Output:
#   'ch02_pandas/usagov_bitly_data2012-03-16-1331923249.txt'

# Peek at the first line of the file located at `path`.
# readline() returns a single line, terminated by the newline '\n'.
# The `with` block closes the file handle once the line has been read.
with open(path) as log_file:
    log_file.readline()
# Output:
#   '{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

# JSON decoding: each line of the file is a separate record that is decoded
# as an individual JSON object.  The file is closed when the block exits.
with open(path) as log_file:
    records = [json.loads(line) for line in log_file]

# Sanity check for the JSON decode: look at the first record.
records[0]
# Output:
#   {u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11', u'al': u'en-US,en;q=0.8', u'c': u'US', u'cy': u'Danvers', u'g': u'A6qOVH', u'gr': u'MA', u'h': u'wfLQtf', u'hc': 1331822918, u'hh': u'1.usa.gov', u'l': u'orofrog', u'll': [42.576698, -70.954903], u'nk': 1, u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf', u't': 1331923247, u'tz': u'America/New_York', u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

# Accessing individual values: collect the 'tz' field of every record that
# has one (records without a 'tz' key are skipped).
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[0:10]
# Output:
#   [u'America/New_York', u'America/Denver', u'America/New_York', u'America/Sao_Paulo', u'America/New_York', u'America/New_York', u'Europe/Warsaw', u'', u'', u'']
#sequence = time_zones
def get_counts(sequence):
    """Count how many times each distinct item occurs in ``sequence``.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences.  An empty iterable yields an empty dict.
    """
    counts = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen.
        counts[item] = counts.get(item, 0) + 1
    return counts
# Tally the collected time zones with the pure-Python helper.
counts = get_counts(time_zones)
counts['America/New_York']
# Output:
#   1251
len(time_zones)
# Output:
#   3440

# list.sort() sorts in place, so `time_zones` itself is reordered from here on.
time_zones.sort()
time_zones[-10:]
# Output:
#   [u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu', u'Pacific/Honolulu']
# Using the dictionary helpers in the collections module for counting elements
from collections import defaultdict
def top_counts(count_dict, n=10):
    """Return the ``n`` largest ``(count, key)`` pairs of ``count_dict``.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of pairs to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent key is last.  Pairs with equal counts are ordered by
        key.  Returns an empty list when ``n`` is zero or negative.
    """
    # Guard: with n == 0 the slice [-0:] would be [0:], i.e. the whole list.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]
# Ten most frequent time zones, least frequent first.
top_counts(counts)
# Output:
#   [(33, u'America/Sao_Paulo'), (35, u'Europe/Madrid'), (36, u'Pacific/Honolulu'), (37, u'Asia/Tokyo'), (74, u'Europe/London'), (191, u'America/Denver'), (382, u'America/Los_Angeles'), (400, u'America/Chicago'), (521, u''), (1251, u'America/New_York')]

# The same ranking written out inline, without the helper function.
value_key_pairs = sorted((count, ts) for ts, count in counts.items())
value_key_pairs[-10:]
# Output:
#   [(33, u'America/Sao_Paulo'), (35, u'Europe/Madrid'), (36, u'Pacific/Honolulu'), (37, u'Asia/Tokyo'), (74, u'Europe/London'), (191, u'America/Denver'), (382, u'America/Los_Angeles'), (400, u'America/Chicago'), (521, u''), (1251, u'America/New_York')]
# Python data analytics library
import pandas as pd
from pandas import DataFrame, Series
#help(pandas.DataFrame)
# Load all records (the list of decoded JSON objects) into a pandas DataFrame
frame = DataFrame(records)
frame
# Output (the summary view that is shown for large datasets):
#   <class 'pandas.core.frame.DataFrame'> Int64Index: 3560 entries, 0 to 3559 Data columns (total 18 columns): _heartbeat_ 120 non-null values a 3440 non-null values al 3094 non-null values c 2919 non-null values cy 2919 non-null values g 3440 non-null values gr 2919 non-null values h 3440 non-null values hc 3440 non-null values hh 3440 non-null values kw 93 non-null values l 3440 non-null values ll 2919 non-null values nk 3440 non-null values r 3440 non-null values t 3440 non-null values tz 3440 non-null values u 3440 non-null values dtypes: float64(4), object(14)

# frame['tz'] returns a Series object; show its first ten entries.
frame['tz'][:10]
# Output:
#   0 America/New_York 1 America/Denver 2 America/New_York 3 America/Sao_Paulo 4 America/New_York 5 America/New_York 6 Europe/Warsaw 7 8 9 Name: tz, dtype: object

# The Series returned by frame['tz'] has a method "value_counts" that gives
# the number of occurrences of each value.
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
# Output:
#   America/New_York 1251 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 America/Sao_Paulo 33 dtype: int64

# A little data munging: fill in substitute values for unknown and missing
# time zone data in the records.  fillna() replaces missing (NA) values;
# unknown values (empty strings) are replaced via boolean array indexing.
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:50]
# Output:
#   America/New_York 1251 Unknown 521 America/Chicago 400 America/Los_Angeles 382 America/Denver 191 Missing 120 Europe/London 74 Asia/Tokyo 37 Pacific/Honolulu 36 Europe/Madrid 35 America/Sao_Paulo 33 Europe/Berlin 28 Europe/Rome 27 America/Rainy_River 25 Europe/Amsterdam 22 America/Indianapolis 20 America/Phoenix 20 Europe/Warsaw 16 America/Mexico_City 15 Europe/Stockholm 14 Europe/Paris 14 America/Vancouver 12 Pacific/Auckland 11 Asia/Hong_Kong 10 Europe/Prague 10 Europe/Helsinki 10 America/Puerto_Rico 10 Europe/Oslo 10 Europe/Moscow 10 Asia/Calcutta 9 Asia/Istanbul 9 America/Montreal 9 Europe/Lisbon 8 Australia/NSW 6 Europe/Athens 6 Asia/Bangkok 6 Europe/Vienna 6 Chile/Continental 6 America/Edmonton 6 Europe/Budapest 5 America/Anchorage 5 Europe/Copenhagen 5 Asia/Seoul 5 Asia/Dubai 4 Europe/Bucharest 4 Asia/Beirut 4 Europe/Zurich 4 America/Winnipeg 4 Europe/Brussels 4 America/Halifax 4 dtype: int64
# Plot a horizontal bar chart of the top 10 values after data munging (rot=0 leaves the tick labels unrotated).
#import pylab as pl
import matplotlib.pyplot as plt
# Horizontal bar chart of the first ten entries of tz_counts.
tz_counts[:10].plot(kind='barh', rot=0)
plt.show()

# Horizontal bar chart of every entry in tz_counts.
# NOTE(review): no plt.show() follows this call, so when run as a script
# (rather than interactively) this figure is presumably never displayed.
tz_counts.plot(kind='barh')
# Output:
#   <matplotlib.axes.AxesSubplot at 0xda480b8>