"""Walkthrough: exploring NikePlus run data with pandas in an IPython notebook.

The script is written as a sequence of notebook cells.  Bare expressions such
as ``nike`` display their value when run as a notebook cell; they have no
visible effect when the file is run as a plain script.

It expects a file named ``nikeplus.csv`` in the current working directory.
"""

# CSV file for demonstration

# Backup your own NikePlus data to CSV by using the following:
# API wrapper: https://github.com/durden/nikeplus
# Video explaining concept behind API wrapper: http://www.youtube.com/watch?v=jA0dwPtiu7c

# Preview the first lines of the file.  This is a shell command, not Python,
# so it is kept as a comment here:
#
#     cat nikeplus.csv | head

# Let's use the pandas library to explore this data in memory
# in a data structure called a DataFrame, which you can
# think of as similar to an Excel spreadsheet.

import pandas as pd
nike = pd.read_csv('nikeplus.csv')
nike

# Use date column as index instead of a normal column so we can plot with it and
# anchor all data based on a date since our data only has 1 entry per day.
nike = pd.read_csv('nikeplus.csv', index_col=8)
nike

# Pandas DataFrames support easy indexing by column names and traditional Python
# list style slicing.  We'll use this trick a lot in this demonstration just
# to prevent showing lots of data at once.
nike['miles'][:5]

# Pandas can turn our DataFrame into html, but displaying is not as nice in
# this notebook interface by default.
nike[:5].to_html()

# IPython is several things all rolled into one including, but not limited to:
# 1. Python library that can do all sorts of things including displaying html
# 2. Provides this notebook interface to execute code and display results
from IPython.core.display import display_html
display_html(nike[:5].to_html(), raw=True)

# Typically you could just plot a DataFrame like this, which by default
# plots all columns as a separate line on our graph.  However, our
# data contains some non-numeric columns like 'device' and 'start_time'.

# Uncomment this line and run this cell with shift-enter to see
# the associated error.
# nike.plot()

# However, we can index by column names and plot only a single column,
# a pandas.Series data structure, to focus on a single column
# of data.  This will work because we know the miles column is all numerical
# data.  Again, still could be prettier, maybe there's a problem with our
# importing?
nike['miles'].plot()

# Pandas has A LOT of arguments to tweak how CSV files are imported
# including the ability to parse dates into Python datetime objects
# instead of strings.
nike = pd.read_csv('nikeplus.csv', index_col=8, parse_dates=[8])
nike

# Note that IPython code cells keep a global state, so we don't
# need to import this function again if we run this cell after
# the cell above with the import in it.
display_html(nike[:5].to_html(), raw=True)

# Now our plot will look much better because Pandas and matplotlib
# know our index column is actually a datetime object, not a string.

# Notice this is just too much data to show in the default
# width, but IPython provides the ability to drag the bottom
# right corner to increase the size of a plot.
nike['miles'].plot()

# However, showing less data looks even better in this window.
nike['miles'][:20].plot()

# Again, the plot function passes information directly to matplotlib
# so there are lots of arguments to tweak the display.  For example,
# we can add a title.
nike['miles'][:30].plot(title='Miles')

# Maybe we don't want to deal with all those columns and are only
# interested in a DataFrame with a few columns.
nike2 = nike.reindex(columns=['calories', 'fuel'])
nike2

# Our new DataFrame, nike2, only has numerical columns now.  So, the default
# Pandas plotting will work and can automatically make a new line for
# each of our columns.
nike2.plot()

# We can further control what data is actually shown on the x/y axes.
# So, this plot effectively shows that calories and Nike's proprietary
# fuel measurement have a linear relationship.
nike2[:30].plot(x='calories', y='fuel')

# Of course, we aren't limited to just line plots, there are
# all sorts provided by matplotlib.
nike2[:30].plot(kind='bar')

# Finally, we could remove the above reindexing step and save memory and
# time up front if we know we're only interested in a few columns at the
# time of reading the data from the CSV.  Remember, Pandas read_csv has
# A LOT of arguments...

# Note that with usecols the index_col is RELATIVE to the columns in usecols argument!
# NOTE(review): read_csv ignores the order of the names in usecols, so
# index_col=4 presumably selects the fifth kept column in file order --
# confirm that this is 'start_time' in nikeplus.csv.
nike = pd.read_csv('nikeplus.csv', usecols=['miles', 'steps', 'fuel', 'calories', 'start_time'], index_col=4)
nike