In [102]:
# CSV file for demonstration
# Back up your own NikePlus data to CSV by using the following:
# API wrapper: https://github.com/durden/nikeplus
# Video explaining concept behind API wrapper: http://www.youtube.com/watch?v=jA0dwPtiu7c
In [103]:
# Peek at the first ten lines of the raw CSV file. The explicit `!` shell
# escape does not depend on IPython's `cat` alias / automagic handling and
# avoids piping the file through an extra process.
!head nikeplus.csv
device,miles,steps,pace,fuel,duration,kilometers,calories,start_time,distance










In [104]:
# Let's use the pandas library to explore this data in memory
# in a data structure called a DataFrame, which you can
# think of as similar to an Excel spreadsheet.
import pandas as pd
In [105]:
# Load the entire CSV into a DataFrame with the default integer row index.
# Ending the cell with the bare name displays the frame.
nike = pd.read_csv('nikeplus.csv')
nike
Out[105]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 528 entries, 0 to 527
Data columns (total 10 columns):
device        528  non-null values
miles         528  non-null values
steps         528  non-null values
pace          528  non-null values
fuel          528  non-null values
duration      528  non-null values
kilometers    528  non-null values
calories      528  non-null values
start_time    528  non-null values
distance      528  non-null values
dtypes: float64(3), int64(3), object(4)
In [106]:
# Use the date column as the index instead of a normal column so we can plot
# with it and anchor all data on a date, since our data has only one entry
# per day. Naming the column (rather than passing its position, 8) keeps
# this working even if the column order of the CSV changes.
nike = pd.read_csv('nikeplus.csv', index_col='start_time')
nike
Out[106]:
<class 'pandas.core.frame.DataFrame'>
Index: 528 entries, 2013-08-15T05:00:00Z to 2012-02-28T06:00:00Z
Data columns (total 9 columns):
device        528  non-null values
miles         528  non-null values
steps         528  non-null values
pace          528  non-null values
fuel          528  non-null values
duration      528  non-null values
kilometers    528  non-null values
calories      528  non-null values
distance      528  non-null values
dtypes: float64(3), int64(3), object(3)
In [107]:
# Pandas DataFrames support easy indexing by column name and traditional
# Python list-style slicing. We'll use this trick a lot in this demonstration
# just to prevent showing lots of data at once.
nike['miles'][:5]
Out[107]:
start_time
2013-08-15T05:00:00Z    2.831880
2013-08-14T05:00:00Z    3.888698
2013-08-13T05:00:00Z    2.640087
2013-08-12T05:00:00Z    3.943007
2013-08-11T05:00:00Z    3.181217
Name: miles, dtype: float64
In [108]:
# Pandas can turn our DataFrame into HTML, but by default the notebook just
# shows the markup as one long string rather than rendering it.
nike[:5].to_html()
Out[108]:
u'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>device</th>\n      <th>miles</th>\n      <th>steps</th>\n      <th>pace</th>\n      <th>fuel</th>\n      <th>duration</th>\n      <th>kilometers</th>\n      <th>calories</th>\n      <th>distance</th>\n    </tr>\n    <tr>\n      <th>start_time</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2013-08-15T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 2.831880</td>\n      <td> 5788</td>\n      <td> (23\'47/mi)</td>\n      <td> 3032</td>\n      <td> 12:27:00</td>\n      <td> 4.557471</td>\n      <td>  938</td>\n      <td> 4.557471</td>\n    </tr>\n    <tr>\n      <th>2013-08-14T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 3.888698</td>\n      <td> 7948</td>\n      <td>  (7\'43/mi)</td>\n      <td> 3469</td>\n      <td> 12:10:00</td>\n      <td> 6.258255</td>\n      <td> 1074</td>\n      <td> 6.258255</td>\n    </tr>\n    <tr>\n      <th>2013-08-13T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 2.640087</td>\n      <td> 5396</td>\n      <td> (51\'39/mi)</td>\n      <td> 2797</td>\n      <td> 12:50:00</td>\n      <td> 4.248810</td>\n      <td>  865</td>\n      <td> 4.248810</td>\n    </tr>\n    <tr>\n      <th>2013-08-12T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 3.943007</td>\n      <td> 8059</td>\n      <td>  (2\'21/mi)</td>\n      <td> 3097</td>\n      <td> 11:59:00</td>\n      <td> 6.345656</td>\n      <td>  960</td>\n      <td> 6.345656</td>\n    </tr>\n    <tr>\n      <th>2013-08-11T05:00:00Z</th>\n      <td> FUELBAND</td>\n      <td> 3.181217</td>\n      <td> 6502</td>\n      <td> (52\'35/mi)</td>\n      <td> 2935</td>\n      <td>  9:09:00</td>\n      <td> 5.119675</td>\n      <td>  908</td>\n      <td> 5.119675</td>\n    </tr>\n  </tbody>\n</table>'
In [109]:
# IPython is several things all rolled into one including, but not limited to:
#  1. A Python library that can do all sorts of things, including displaying HTML
#  2. The provider of this notebook interface to execute code and display results
# Import from the public IPython.display module; IPython.core.display is an
# internal location, and importing the display helpers from it is deprecated.
from IPython.display import display_html
display_html(nike[:5].to_html(), raw=True)
device miles steps pace fuel duration kilometers calories distance
start_time
2013-08-15T05:00:00Z FUELBAND 2.831880 5788 (23'47/mi) 3032 12:27:00 4.557471 938 4.557471
2013-08-14T05:00:00Z FUELBAND 3.888698 7948 (7'43/mi) 3469 12:10:00 6.258255 1074 6.258255
2013-08-13T05:00:00Z FUELBAND 2.640087 5396 (51'39/mi) 2797 12:50:00 4.248810 865 4.248810
2013-08-12T05:00:00Z FUELBAND 3.943007 8059 (2'21/mi) 3097 11:59:00 6.345656 960 6.345656
2013-08-11T05:00:00Z FUELBAND 3.181217 6502 (52'35/mi) 2935 9:09:00 5.119675 908 5.119675
In [110]:
# Typically you could just plot a DataFrame like this, which by default
# plots each column as a separate line on our graph. However, our
# data contains some non-numeric columns like 'device', 'pace' and 'duration'.

# Uncomment this line and run this cell with shift-enter to see
# the associated error.
#nike.plot()
In [111]:
# However, we can index by column name and plot only a single column,
# a pandas Series data structure, to focus on one column of data. This
# will work because we know the miles column is all numerical data.
# Again, it still could be prettier; maybe there's a problem with our
# importing?
nike['miles'].plot()
Out[111]:
<matplotlib.axes.AxesSubplot at 0x72c7150>
In [112]:
# Pandas has A LOT of arguments to tweak how CSV files are imported,
# including the ability to parse dates into Python datetime objects
# instead of strings. Columns are referenced by name rather than by
# position (8) so the call does not depend on the CSV's column order.
nike = pd.read_csv('nikeplus.csv', index_col='start_time', parse_dates=['start_time'])
nike
Out[112]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 528 entries, 2013-08-15 05:00:00 to 2012-02-28 06:00:00
Data columns (total 9 columns):
device        528  non-null values
miles         528  non-null values
steps         528  non-null values
pace          528  non-null values
fuel          528  non-null values
duration      528  non-null values
kilometers    528  non-null values
calories      528  non-null values
distance      528  non-null values
dtypes: float64(3), int64(3), object(3)
In [113]:
# Note that IPython code cells share one global state, so we don't
# need to import display_html again as long as this cell is run after
# the earlier cell that imported it.
display_html(nike[:5].to_html(), raw=True)
device miles steps pace fuel duration kilometers calories distance
start_time
2013-08-15 05:00:00 FUELBAND 2.831880 5788 (23'47/mi) 3032 12:27:00 4.557471 938 4.557471
2013-08-14 05:00:00 FUELBAND 3.888698 7948 (7'43/mi) 3469 12:10:00 6.258255 1074 6.258255
2013-08-13 05:00:00 FUELBAND 2.640087 5396 (51'39/mi) 2797 12:50:00 4.248810 865 4.248810
2013-08-12 05:00:00 FUELBAND 3.943007 8059 (2'21/mi) 3097 11:59:00 6.345656 960 6.345656
2013-08-11 05:00:00 FUELBAND 3.181217 6502 (52'35/mi) 2935 9:09:00 5.119675 908 5.119675
In [114]:
# Now our plot will look much better because Pandas and matplotlib
# know our index is made of datetime objects, not strings.

# Notice this is just too much data to show in the default
# width, but IPython provides the ability to drag the bottom
# right corner to increase the size of a plot.
nike['miles'].plot()
Out[114]:
<matplotlib.axes.AxesSubplot at 0x73e8f90>
In [115]:
# However, showing less data (only the first 20 rows) looks even better
# in this window.
nike['miles'][:20].plot()
Out[115]:
<matplotlib.axes.AxesSubplot at 0x74dad50>
In [116]:
# Again, the plot function passes information directly to matplotlib
# so there are lots of arguments to tweak the display. For example,
# we can add a title (this time plotting the first 30 rows).
nike['miles'][:30].plot(title='Miles')
Out[116]:
<matplotlib.axes.AxesSubplot at 0x72a73d0>
In [117]:
# Maybe we don't want to deal with all those columns and are only
# interested in a DataFrame with a few of them. Selecting with a list of
# labels raises a KeyError for a misspelled name, whereas
# reindex(columns=...) would silently add an all-NaN column instead.
nike2 = nike[['calories', 'fuel']]
nike2
Out[117]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 528 entries, 2013-08-15 05:00:00 to 2012-02-28 06:00:00
Data columns (total 2 columns):
calories    528  non-null values
fuel        528  non-null values
dtypes: int64(2)
In [118]:
# Our new DataFrame, nike2, only has numerical columns now. So, the default
# Pandas plotting will work and can automatically draw a separate line for
# each of our columns.
nike2.plot()
Out[118]:
<matplotlib.axes.AxesSubplot at 0x7447b50>
In [119]:
# We can further control what data is actually shown on the x/y axes.
# So, this plot effectively shows that calories and Nike's proprietary
# fuel measurement have a linear relationship.
# NOTE(review): the default line plot joins the points in row order, not
# sorted by x; kind='scatter' might show the relationship more cleanly — confirm.
nike2[:30].plot(x='calories', y='fuel')
Out[119]:
<matplotlib.axes.AxesSubplot at 0x7cca9d0>
In [120]:
# Of course, we aren't limited to just line plots; there are
# all sorts provided by matplotlib, such as this bar chart.
nike2[:30].plot(kind='bar')
Out[120]:
<matplotlib.axes.AxesSubplot at 0x7d5cb50>
In [121]:
# Finally, we could remove the above column-selection step and save memory
# and time up front if we know we're only interested in a few columns at the
# time of reading the data from the CSV. Remember, Pandas read_csv has
# A LOT of arguments...

# Note that with usecols a positional index_col is RELATIVE to the columns
# kept by usecols, so name the index column explicitly to avoid miscounting.
nike = pd.read_csv('nikeplus.csv', usecols=['miles', 'steps', 'fuel', 'calories', 'start_time'], index_col='start_time')
nike
Out[121]:
<class 'pandas.core.frame.DataFrame'>
Index: 528 entries, 2013-08-15T05:00:00Z to 2012-02-28T06:00:00Z
Data columns (total 4 columns):
miles       528  non-null values
steps       528  non-null values
fuel        528  non-null values
calories    528  non-null values
dtypes: float64(1), int64(3)