In [102]:

# CSV file for demonstration
# Backup your own NikePlus data to CSV by using the following:
# API wrapper: https://github.com/durden/nikeplus
# Video explaining concept behind API wrapper: http://www.youtube.com/watch?v=jA0dwPtiu7c

In [103]:

cat nikeplus.csv | head

device,miles,steps,pace,fuel,duration,kilometers,calories,start_time,distance

In [104]:

# Let's use the pandas library to explore this data in memory
# in a data structure called a DataFrame, which you can
# think of as similar to an Excel spreadsheet.
import pandas as pd

In [105]:

# Read the entire CSV into a DataFrame using read_csv's default options.
nike = pd.read_csv("nikeplus.csv")
# A bare name on the last line makes the notebook display the object.
nike

Out[105]:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 528 entries, 0 to 527
Data columns (total 10 columns):
device        528  non-null values
miles         528  non-null values
steps         528  non-null values
pace          528  non-null values
fuel          528  non-null values
duration      528  non-null values
kilometers    528  non-null values
calories      528  non-null values
start_time    528  non-null values
distance      528  non-null values
dtypes: float64(3), int64(3), object(4)

In [106]:

# Use the date column as the index instead of a normal column so we can plot
# against it and anchor all data to a date, since our data only has 1 entry
# per day. The column is referenced by name rather than by position (8) so
# the call keeps working even if the CSV's column order changes.
nike = pd.read_csv('nikeplus.csv', index_col='start_time')
nike

Out[106]:

<class 'pandas.core.frame.DataFrame'>
Index: 528 entries, 2013-08-15T05:00:00Z to 2012-02-28T06:00:00Z
Data columns (total 9 columns):
device        528  non-null values
miles         528  non-null values
steps         528  non-null values
pace          528  non-null values
fuel          528  non-null values
duration      528  non-null values
kilometers    528  non-null values
calories      528  non-null values
distance      528  non-null values
dtypes: float64(3), int64(3), object(3)

In [107]:

# pandas DataFrames support easy indexing by column name, and the selected
# column supports traditional Python list-style slicing. We'll use this trick
# a lot in this demonstration just to avoid showing lots of data at once.
nike['miles'][:5]

Out[107]:

start_time
2013-08-15T05:00:00Z    2.831880
2013-08-14T05:00:00Z    3.888698
2013-08-13T05:00:00Z    2.640087
2013-08-12T05:00:00Z    3.943007
2013-08-11T05:00:00Z    3.181217
Name: miles, dtype: float64

In [108]:

# pandas can render a DataFrame as an HTML string, but the notebook shows
# that string as raw markup by default, which is not as nice to look at.
nike.head(5).to_html()

Out[108]:

u'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>device</th>\n      <th>miles</th>\n      <th>steps</th>\n      <th>pace</th>\n      <th>fuel</th>\n      <th>duration</th>\n      <th>kilometers</th>\n      <th>calories</th>\n      <th>distance</th>\n    </tr>\n    <tr>\n      <th>start_time</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2013-08-15T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 2.831880</td>\n      <td> 5788</td>\n      <td> (23\'47/mi)</td>\n      <td> 3032</td>\n      <td> 12:27:00</td>\n      <td> 4.557471</td>\n      <td>  938</td>\n      <td> 4.557471</td>\n    </tr>\n    <tr>\n      <th>2013-08-14T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 3.888698</td>\n      <td> 7948</td>\n      <td>  (7\'43/mi)</td>\n      <td> 3469</td>\n      <td> 12:10:00</td>\n      <td> 6.258255</td>\n      <td> 1074</td>\n      <td> 6.258255</td>\n    </tr>\n    <tr>\n      <th>2013-08-13T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 2.640087</td>\n      <td> 5396</td>\n      <td> (51\'39/mi)</td>\n      <td> 2797</td>\n      <td> 12:50:00</td>\n      <td> 4.248810</td>\n      <td>  865</td>\n      <td> 4.248810</td>\n    </tr>\n    <tr>\n      <th>2013-08-12T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 3.943007</td>\n      <td> 8059</td>\n      <td>  (2\'21/mi)</td>\n      <td> 3097</td>\n      <td> 11:59:00</td>\n      <td> 6.345656</td>\n      <td>  960</td>\n      <td> 6.345656</td>\n    </tr>\n    <tr>\n      <th>2013-08-11T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 3.181217</td>\n      <td> 6502</td>\n      <td> (52\'35/mi)</td>\n      <td> 2935</td>\n      <td>  9:09:00</td>\n      <td> 5.119675</td>\n      <td>  908</td>\n      <td> 5.119675</td>\n    </tr>\n  </tbody>\n</table>'

In [109]:

# IPython is several things all rolled into one including, but not limited to:
#  1. A Python library that can do all sorts of things, including displaying HTML
#  2. The notebook interface used here to execute code and display results
# display_html is imported from IPython.display, the public display API,
# rather than from the internal IPython.core.display module.
from IPython.display import display_html
# raw=True tells display_html the argument is already an HTML string.
display_html(nike[:5].to_html(), raw=True)

	device	miles	steps	pace	fuel	duration	kilometers	calories	distance
start_time
2013-08-15T05:00:00Z	FUELBAND	2.831880	5788	(23'47/mi)	3032	12:27:00	4.557471	938	4.557471
2013-08-14T05:00:00Z	FUELBAND	3.888698	7948	(7'43/mi)	3469	12:10:00	6.258255	1074	6.258255
2013-08-13T05:00:00Z	FUELBAND	2.640087	5396	(51'39/mi)	2797	12:50:00	4.248810	865	4.248810
2013-08-12T05:00:00Z	FUELBAND	3.943007	8059	(2'21/mi)	3097	11:59:00	6.345656	960	6.345656
2013-08-11T05:00:00Z	FUELBAND	3.181217	6502	(52'35/mi)	2935	9:09:00	5.119675	908	5.119675

In [110]:

# Typically you could just plot a DataFrame like this, which by default
# plots each column as a separate line on our graph. However, our
# data contains some non-numeric columns like 'device', 'pace' and 'duration'.

# Uncomment this line and run this cell with shift-enter to see
# the associated error.
#nike.plot()

In [111]:

# However, we can index by column name and plot only a single column
# (a pandas Series) to focus on one column of data. This works because
# we know the miles column is all numerical data. The result still
# could be prettier -- maybe there's a problem with how we imported
# the data?
nike['miles'].plot()

Out[111]:

<matplotlib.axes.AxesSubplot at 0x72c7150>

In [112]:

# pandas has A LOT of arguments to tweak how CSV files are imported,
# including the ability to parse dates into Python datetime objects
# instead of leaving them as strings. The date column is referenced by
# name rather than by position (8), both as the index and as the column
# to parse, so the call does not depend on the CSV's column order.
nike = pd.read_csv('nikeplus.csv', index_col='start_time', parse_dates=['start_time'])
nike

Out[112]:

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 528 entries, 2013-08-15 05:00:00 to 2012-02-28 06:00:00
Data columns (total 9 columns):
device        528  non-null values
miles         528  non-null values
steps         528  non-null values
pace          528  non-null values
fuel          528  non-null values
duration      528  non-null values
kilometers    528  non-null values
calories      528  non-null values
distance      528  non-null values
dtypes: float64(3), int64(3), object(3)

In [113]:

# IPython code cells share one global state, so display_html does not
# need to be imported again as long as the earlier cell containing the
# import has already been run.
display_html(nike.head(5).to_html(), raw=True)

	device	miles	steps	pace	fuel	duration	kilometers	calories	distance
start_time
2013-08-15 05:00:00	FUELBAND	2.831880	5788	(23'47/mi)	3032	12:27:00	4.557471	938	4.557471
2013-08-14 05:00:00	FUELBAND	3.888698	7948	(7'43/mi)	3469	12:10:00	6.258255	1074	6.258255
2013-08-13 05:00:00	FUELBAND	2.640087	5396	(51'39/mi)	2797	12:50:00	4.248810	865	4.248810
2013-08-12 05:00:00	FUELBAND	3.943007	8059	(2'21/mi)	3097	11:59:00	6.345656	960	6.345656
2013-08-11 05:00:00	FUELBAND	3.181217	6502	(52'35/mi)	2935	9:09:00	5.119675	908	5.119675

In [114]:

# Now our plot will look much better because pandas and matplotlib
# know our index column holds datetime objects, not strings.

# Notice this is just too much data to show in the default
# width, but IPython provides the ability to drag the bottom
# right corner to increase the size of a plot.
nike['miles'].plot()

Out[114]:

<matplotlib.axes.AxesSubplot at 0x73e8f90>

In [115]:

# Plotting only the first 20 rows looks even better in this window.
nike['miles'].head(20).plot()

Out[115]:

<matplotlib.axes.AxesSubplot at 0x74dad50>

In [116]:

# As before, the plot function passes information directly to matplotlib,
# so there are plenty of arguments for tweaking the display -- adding a
# title, for example.
nike['miles'].head(30).plot(title='Miles')

Out[116]:

<matplotlib.axes.AxesSubplot at 0x72a73d0>

In [117]:

# Maybe we don't want to deal with all those columns and are only
# interested in a DataFrame with a few of them. Selecting with a list of
# column names raises a KeyError if a name is misspelled, whereas
# reindex(columns=...) would silently add a column full of NaN instead.
nike2 = nike[['calories', 'fuel']]
nike2

Out[117]:

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 528 entries, 2013-08-15 05:00:00 to 2012-02-28 06:00:00
Data columns (total 2 columns):
calories    528  non-null values
fuel        528  non-null values
dtypes: int64(2)

In [118]:

# Our new DataFrame, nike2, has only numerical columns now, so the default
# pandas plotting works and automatically draws a separate line for each
# of our columns.
nike2.plot()

Out[118]:

<matplotlib.axes.AxesSubplot at 0x7447b50>

In [119]:

# The x and y keywords give further control over which data goes on each
# axis. Plotted this way, calories and Nike's proprietary fuel measurement
# effectively show a linear relationship.
nike2.head(30).plot(x='calories', y='fuel')

Out[119]:

<matplotlib.axes.AxesSubplot at 0x7cca9d0>

In [120]:

# Line plots are not the only option; matplotlib provides
# all sorts of other plot kinds, such as a bar chart.
nike2.head(30).plot(kind='bar')

Out[120]:

<matplotlib.axes.AxesSubplot at 0x7d5cb50>

In [121]:

# Finally, we could skip the separate column-selection step above and save
# memory and time up front if we already know, when reading the CSV, that
# we're only interested in a few columns. Remember, pandas read_csv has
# A LOT of arguments...

# Note that with usecols an integer index_col is RELATIVE to the columns kept
# by usecols (start_time would be position 4 here, not 8), so we name the
# index column instead to avoid that trap.
nike = pd.read_csv('nikeplus.csv', usecols=['miles', 'steps', 'fuel', 'calories', 'start_time'], index_col='start_time')
nike

Out[121]:

<class 'pandas.core.frame.DataFrame'>
Index: 528 entries, 2013-08-15T05:00:00Z to 2012-02-28T06:00:00Z
Data columns (total 4 columns):
miles       528  non-null values
steps       528  non-null values
fuel        528  non-null values
calories    528  non-null values
dtypes: float64(1), int64(3)