# CSV file for demonstration
# Backup your own NikePlus data to CSV by using the following:
# API wrapper: https://github.com/durden/nikeplus
# Video explaining concept behind API wrapper: http://www.youtube.com/watch?v=jA0dwPtiu7c
cat nikeplus.csv | head
device,miles,steps,pace,fuel,duration,kilometers,calories,start_time,distance
# Let's use the pandas library to explore this data in memory
# in a data structure called a DataFrame, which you can
# think of as similar to an Excel spreadsheet.
import pandas as pd
nike = pd.read_csv('nikeplus.csv')
nike
<class 'pandas.core.frame.DataFrame'> Int64Index: 528 entries, 0 to 527 Data columns (total 10 columns): device 528 non-null values miles 528 non-null values steps 528 non-null values pace 528 non-null values fuel 528 non-null values duration 528 non-null values kilometers 528 non-null values calories 528 non-null values start_time 528 non-null values distance 528 non-null values dtypes: float64(3), int64(3), object(4)
# Use the date column as the index instead of a normal column so we can plot
# with it and anchor all data based on a date, since our data only has 1 entry
# per day.
# The column is selected by name rather than by its position (8) so the cell
# keeps working if columns are ever added to or reordered in the CSV.
nike = pd.read_csv('nikeplus.csv', index_col='start_time')
nike
<class 'pandas.core.frame.DataFrame'> Index: 528 entries, 2013-08-15T05:00:00Z to 2012-02-28T06:00:00Z Data columns (total 9 columns): device 528 non-null values miles 528 non-null values steps 528 non-null values pace 528 non-null values fuel 528 non-null values duration 528 non-null values kilometers 528 non-null values calories 528 non-null values distance 528 non-null values dtypes: float64(3), int64(3), object(3)
# Pandas DataFrames support easy indexing by column names and traditional Python
# list style slicing. We'll use this trick a lot in this demonstration just
# to prevent showing lots of data at once.
nike['miles'][:5]
start_time 2013-08-15T05:00:00Z 2.831880 2013-08-14T05:00:00Z 3.888698 2013-08-13T05:00:00Z 2.640087 2013-08-12T05:00:00Z 3.943007 2013-08-11T05:00:00Z 3.181217 Name: miles, dtype: float64
# Pandas can turn our DataFrame into html, but displaying is not as nice in
# this notebook interface by default.
nike[:5].to_html()
u'<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: right;">\n <th></th>\n <th>device</th>\n <th>miles</th>\n <th>steps</th>\n <th>pace</th>\n <th>fuel</th>\n <th>duration</th>\n <th>kilometers</th>\n <th>calories</th>\n <th>distance</th>\n </tr>\n <tr>\n <th>start_time</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2013-08-15T05:00:00Z</th>\n <td> FUELBAND</td>\n <td> 2.831880</td>\n <td> 5788</td>\n <td> (23\'47/mi)</td>\n <td> 3032</td>\n <td> 12:27:00</td>\n <td> 4.557471</td>\n <td> 938</td>\n <td> 4.557471</td>\n </tr>\n <tr>\n <th>2013-08-14T05:00:00Z</th>\n <td> FUELBAND</td>\n <td> 3.888698</td>\n <td> 7948</td>\n <td> (7\'43/mi)</td>\n <td> 3469</td>\n <td> 12:10:00</td>\n <td> 6.258255</td>\n <td> 1074</td>\n <td> 6.258255</td>\n </tr>\n <tr>\n <th>2013-08-13T05:00:00Z</th>\n <td> FUELBAND</td>\n <td> 2.640087</td>\n <td> 5396</td>\n <td> (51\'39/mi)</td>\n <td> 2797</td>\n <td> 12:50:00</td>\n <td> 4.248810</td>\n <td> 865</td>\n <td> 4.248810</td>\n </tr>\n <tr>\n <th>2013-08-12T05:00:00Z</th>\n <td> FUELBAND</td>\n <td> 3.943007</td>\n <td> 8059</td>\n <td> (2\'21/mi)</td>\n <td> 3097</td>\n <td> 11:59:00</td>\n <td> 6.345656</td>\n <td> 960</td>\n <td> 6.345656</td>\n </tr>\n <tr>\n <th>2013-08-11T05:00:00Z</th>\n <td> FUELBAND</td>\n <td> 3.181217</td>\n <td> 6502</td>\n <td> (52\'35/mi)</td>\n <td> 2935</td>\n <td> 9:09:00</td>\n <td> 5.119675</td>\n <td> 908</td>\n <td> 5.119675</td>\n </tr>\n </tbody>\n</table>'
# IPython is several things all rolled into one including, but not limited to:
# 1. Python library that can do all sorts of things including displaying html
# 2. Provides this notebook interface to execute code and display results
# The display helpers are imported from IPython.display, the public module,
# rather than from the internal IPython.core.display.
from IPython.display import display_html
# raw=True tells IPython the argument is already an HTML string to render,
# not an object it should try to format itself.
display_html(nike[:5].to_html(), raw=True)
device | miles | steps | pace | fuel | duration | kilometers | calories | distance | |
---|---|---|---|---|---|---|---|---|---|
start_time | |||||||||
2013-08-15T05:00:00Z | FUELBAND | 2.831880 | 5788 | (23'47/mi) | 3032 | 12:27:00 | 4.557471 | 938 | 4.557471 |
2013-08-14T05:00:00Z | FUELBAND | 3.888698 | 7948 | (7'43/mi) | 3469 | 12:10:00 | 6.258255 | 1074 | 6.258255 |
2013-08-13T05:00:00Z | FUELBAND | 2.640087 | 5396 | (51'39/mi) | 2797 | 12:50:00 | 4.248810 | 865 | 4.248810 |
2013-08-12T05:00:00Z | FUELBAND | 3.943007 | 8059 | (2'21/mi) | 3097 | 11:59:00 | 6.345656 | 960 | 6.345656 |
2013-08-11T05:00:00Z | FUELBAND | 3.181217 | 6502 | (52'35/mi) | 2935 | 9:09:00 | 5.119675 | 908 | 5.119675 |
# Typically you could just plot a DataFrame like this, which by default
# plots all columns as a separate line on our graph. However, our
# data contains some non-numeric columns like 'device' and 'pace'.
# Uncomment this line and run this cell with shift-enter to see
# the associated error.
#nike.plot()
# However, we can index by column name and plot only a single column,
# a pandas Series data structure, to focus on a single column
# of data. This will work because we know the miles column is all numerical
# data. Again, still could be prettier, maybe there's a problem with our
# importing?
nike['miles'].plot()
<matplotlib.axes.AxesSubplot at 0x72c7150>
# Pandas has A LOT of arguments to tweak how CSV files are imported
# including the ability to parse dates into Python datetime objects
# instead of strings.
# Both the index and the date parsing refer to the column by name instead of
# by position (8), so they cannot drift apart or point at the wrong column
# if the CSV layout changes.
nike = pd.read_csv(
    'nikeplus.csv',
    index_col='start_time',
    parse_dates=['start_time'],
)
nike
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 528 entries, 2013-08-15 05:00:00 to 2012-02-28 06:00:00 Data columns (total 9 columns): device 528 non-null values miles 528 non-null values steps 528 non-null values pace 528 non-null values fuel 528 non-null values duration 528 non-null values kilometers 528 non-null values calories 528 non-null values distance 528 non-null values dtypes: float64(3), int64(3), object(3)
# Note that IPython code cells keep a global state, so we don't
# need to import this function again if we run this cell after
# the cell above with the import in it.
display_html(nike[:5].to_html(), raw=True)
device | miles | steps | pace | fuel | duration | kilometers | calories | distance | |
---|---|---|---|---|---|---|---|---|---|
start_time | |||||||||
2013-08-15 05:00:00 | FUELBAND | 2.831880 | 5788 | (23'47/mi) | 3032 | 12:27:00 | 4.557471 | 938 | 4.557471 |
2013-08-14 05:00:00 | FUELBAND | 3.888698 | 7948 | (7'43/mi) | 3469 | 12:10:00 | 6.258255 | 1074 | 6.258255 |
2013-08-13 05:00:00 | FUELBAND | 2.640087 | 5396 | (51'39/mi) | 2797 | 12:50:00 | 4.248810 | 865 | 4.248810 |
2013-08-12 05:00:00 | FUELBAND | 3.943007 | 8059 | (2'21/mi) | 3097 | 11:59:00 | 6.345656 | 960 | 6.345656 |
2013-08-11 05:00:00 | FUELBAND | 3.181217 | 6502 | (52'35/mi) | 2935 | 9:09:00 | 5.119675 | 908 | 5.119675 |
# Now our plot will look much better because Pandas and matplotlib
# know our index column is actually a datetime object, not a string.
# Notice this is just too much data to show in the default
# width, but IPython provides the ability to drag the bottom
# right corner to increase the size of a plot.
nike['miles'].plot()
<matplotlib.axes.AxesSubplot at 0x73e8f90>
# However, showing less data looks even better in this window.
nike['miles'][:20].plot()
<matplotlib.axes.AxesSubplot at 0x74dad50>
# Again, the plot function passes information directly to matplotlib
# so there are lots of arguments to tweak the display. For example,
# we can add a title.
nike['miles'][:30].plot(title='Miles')
<matplotlib.axes.AxesSubplot at 0x72a73d0>
# Maybe we don't want to deal with all those columns and are only
# interested in a DataFrame with a few columns.
# Selecting with a list of column names returns a new DataFrame holding just
# those columns. Unlike reindex(columns=...), a misspelled name raises a
# KeyError instead of silently producing a column full of NaN.
nike2 = nike[['calories', 'fuel']]
nike2
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 528 entries, 2013-08-15 05:00:00 to 2012-02-28 06:00:00 Data columns (total 2 columns): calories 528 non-null values fuel 528 non-null values dtypes: int64(2)
# Our new DataFrame, nike2, only has numerical columns now. So, the default
# Pandas plotting will work and can automatically make a new line for
# each of our columns.
nike2.plot()
<matplotlib.axes.AxesSubplot at 0x7447b50>
# We can further control what data is actually shown on the x/y axes.
# So, this plot effectively shows that calories and Nike's proprietary
# fuel measurement have a linear relationship.
nike2[:30].plot(x='calories', y='fuel')
<matplotlib.axes.AxesSubplot at 0x7cca9d0>
# Of course, we aren't limited to just line plots, there are
# all sorts provided by matplotlib.
nike2[:30].plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x7d5cb50>
# Finally, we could remove the above reindexing step and save memory and
# time up front if we know we're only interested in a few columns at the
# time of reading the data from the CSV. Remember, Pandas read_csv has
# A LOT of arguments...
# Note that with usecols a positional index_col is RELATIVE to the columns in
# the usecols argument, not to the columns in the file. Naming the index
# column avoids having to work out that position at all.
# (Add parse_dates=['start_time'] as well to get a DatetimeIndex here.)
nike = pd.read_csv(
    'nikeplus.csv',
    usecols=['miles', 'steps', 'fuel', 'calories', 'start_time'],
    index_col='start_time',
)
nike
<class 'pandas.core.frame.DataFrame'> Index: 528 entries, 2013-08-15T05:00:00Z to 2012-02-28T06:00:00Z Data columns (total 4 columns): miles 528 non-null values steps 528 non-null values fuel 528 non-null values calories 528 non-null values dtypes: float64(1), int64(3)