[Topic]: Introduction to "pandas", the Python data analysis library

[MG]: This is my 3rd IPython notebook for the CS109 course. [Author]: manoj [Location]: IIITD, India

In [1]:

import numpy as np
from pandas import* 
import json

In [2]:

# Relative path to the data file; it is resolved against the notebook's working directory.
path = 'ch02_pandas/usagov_bitly_data2012-03-16-1331923249.txt'

In [3]:

# Echo the configured path as a quick check
path

Out[3]:

'ch02_pandas/usagov_bitly_data2012-03-16-1331923249.txt'

In [4]:

# Peek at the first raw line of the file located at `path`.
# readline() returns the text up to and including the terminating '\n'.
# The `with` block closes the file handle as soon as the line has been read.
with open(path) as f:
    first_line = f.readline()
first_line

Out[4]:

'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

In [5]:

# Decode the file line by line: each line is a separate record that is
# parsed as an individual JSON object, so `records` holds one decoded
# object per line of the file.
# The `with` block closes the file once every line has been parsed.
with open(path) as f:
    records = [json.loads(line) for line in f]

In [6]:

# Sanity check for the JSON decoding: inspect the first decoded record
records[0]

Out[6]:

{u'a': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 u'al': u'en-US,en;q=0.8',
 u'c': u'US',
 u'cy': u'Danvers',
 u'g': u'A6qOVH',
 u'gr': u'MA',
 u'h': u'wfLQtf',
 u'hc': 1331822918,
 u'hh': u'1.usa.gov',
 u'l': u'orofrog',
 u'll': [42.576698, -70.954903],
 u'nk': 1,
 u'r': u'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 u't': 1331923247,
 u'tz': u'America/New_York',
 u'u': u'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

In [7]:

# Access individual values: collect the 'tz' field of every record that has one
# (records without a 'tz' key are skipped), then show the first ten values.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[0:10]

Out[7]:

[u'America/New_York',
 u'America/Denver',
 u'America/New_York',
 u'America/Sao_Paulo',
 u'America/New_York',
 u'America/New_York',
 u'Europe/Warsaw',
 u'',
 u'',
 u'']

In [8]:

#sequence = time_zones
def get_counts(sequence):
    """Count how many times each item occurs in ``sequence``.

    Returns a plain dict that maps every distinct item to its number of
    occurrences. Items are used as dict keys, so they must be hashable.
    """
    tally = {}
    for item in sequence:
        # Unseen items start from 0, then every occurrence adds one.
        tally[item] = tally.get(item, 0) + 1
    return tally

In [9]:

# Tally the time zones, then look up the count for one of them
counts = get_counts(time_zones)
counts['America/New_York']

Out[9]:

In [10]:

# Number of records that carried a 'tz' field
len(time_zones)

Out[10]:

In [11]:

# Sorts the list in place; list.sort() returns None, so this cell shows no output
time_zones.sort()

In [12]:

# Last ten entries of the (now sorted) list
time_zones[-10:]

Out[12]:

[u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu',
 u'Pacific/Honolulu']

In [13]:

# Using the standard library's collections module for counting elements
from collections import defaultdict

In [14]:

def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of ``count_dict`` as (count, key) pairs.

    Parameters:
        count_dict: mapping of key -> count.
        n: maximum number of pairs to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        largest count comes last. Ties on count are ordered by key. An empty
        list is returned when ``n`` is zero or negative.
    """
    if n <= 0:
        # Without this guard, n == 0 would slice [-0:], which is the whole
        # list, and a negative n would drop entries from the small end.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

In [15]:

# Ten most frequent time zones as (count, tz) pairs, largest count last
top_counts(counts)

Out[15]:

[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1251, u'America/New_York')]

In [16]:

# Build (count, tz) pairs from the counts dict so that sorting orders them by count first
value_key_pairs = [(count, ts) for ts, count in counts.items()]

In [17]:

# Sort the pairs ascending in place, then show the ten largest
value_key_pairs.sort()
value_key_pairs[-10:]

Out[17]:

[(33, u'America/Sao_Paulo'),
 (35, u'Europe/Madrid'),
 (36, u'Pacific/Honolulu'),
 (37, u'Asia/Tokyo'),
 (74, u'Europe/London'),
 (191, u'America/Denver'),
 (382, u'America/Los_Angeles'),
 (400, u'America/Chicago'),
 (521, u''),
 (1251, u'America/New_York')]

In [18]:

# Python data analytics library
import pandas as pd
from pandas import DataFrame, Series

In [19]:

#help(pandas.DataFrame)

In [20]:

# Build a pandas DataFrame from `records`, the list of decoded JSON objects (dicts)
frame = DataFrame(records)

In [21]:

# Display the frame
frame

Out[21]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3560 entries, 0 to 3559
Data columns (total 18 columns):
_heartbeat_    120  non-null values
a              3440  non-null values
al             3094  non-null values
c              2919  non-null values
cy             2919  non-null values
g              3440  non-null values
gr             2919  non-null values
h              3440  non-null values
hc             3440  non-null values
hh             3440  non-null values
kw             93  non-null values
l              3440  non-null values
ll             2919  non-null values
nk             3440  non-null values
r              3440  non-null values
t              3440  non-null values
tz             3440  non-null values
u              3440  non-null values
dtypes: float64(4), object(14)

In [22]:

# The output shown above for `frame` is the summary view used for large datasets.
# frame['tz'] returns the 'tz' column as a Series; show its first ten values.
frame['tz'][:10]

Out[22]:

0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7                     
8                     
9                     
Name: tz, dtype: object

In [23]:

# The Series returned by frame['tz'] has a value_counts() method that
# gives the number of occurrences of each value; show the first ten entries.
tz_counts = frame['tz'].value_counts()
tz_counts[:10]

Out[23]:

America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
dtype: int64

In [24]:

# A little data munging: substitute explicit labels for missing and unknown
# time zones. fillna replaces missing (NA) values with 'Missing', and replace
# then maps the empty string to 'Unknown'.
clean_tz = frame['tz'].fillna('Missing').replace('', 'Unknown')

In [25]:

# Recount using the cleaned labels (this rebinds tz_counts) and show the first fifty entries
tz_counts = clean_tz.value_counts()
tz_counts[:50]

Out[25]:

America/New_York        1251
Unknown                  521
America/Chicago          400
America/Los_Angeles      382
America/Denver           191
Missing                  120
Europe/London             74
Asia/Tokyo                37
Pacific/Honolulu          36
Europe/Madrid             35
America/Sao_Paulo         33
Europe/Berlin             28
Europe/Rome               27
America/Rainy_River       25
Europe/Amsterdam          22
America/Indianapolis      20
America/Phoenix           20
Europe/Warsaw             16
America/Mexico_City       15
Europe/Stockholm          14
Europe/Paris              14
America/Vancouver         12
Pacific/Auckland          11
Asia/Hong_Kong            10
Europe/Prague             10
Europe/Helsinki           10
America/Puerto_Rico       10
Europe/Oslo               10
Europe/Moscow             10
Asia/Calcutta              9
Asia/Istanbul              9
America/Montreal           9
Europe/Lisbon              8
Australia/NSW              6
Europe/Athens              6
Asia/Bangkok               6
Europe/Vienna              6
Chile/Continental          6
America/Edmonton           6
Europe/Budapest            5
America/Anchorage          5
Europe/Copenhagen          5
Asia/Seoul                 5
Asia/Dubai                 4
Europe/Bucharest           4
Asia/Beirut                4
Europe/Zurich              4
America/Winnipeg           4
Europe/Brussels            4
America/Halifax            4
dtype: int64

In [27]:

# Plot a horizontal bar chart of the first ten tz_counts values after data munging.
# rot=0 requests no rotation of the tick labels.
import matplotlib.pyplot as plt 
tz_counts[:10].plot(kind='barh', rot=0)
plt.show()

In [75]:

# Horizontal bar chart of every entry in tz_counts (no top-10 slice here)
tz_counts.plot(kind='barh')

Out[75]:

<matplotlib.axes.AxesSubplot at 0xda480b8>

In [ ]: