Looking at data from one run is pretty cool, but most of the information we have seen so far is the same as you would get by looking at your basic stats on Runkeeper or Garmin. We can get some better insights by looking at a large amount of run data. So let's load in as much data as possible to see what we can find out.
import pandas as pd
import matplotlib.pyplot as plt
import glob
import gpxpy
import gpxpy.gpx
import os
import seaborn as sb
%matplotlib inline
def load_run_data(gpx_path, filter="", remove_invalid=False):
    """Flatten every track point of the GPX files in a directory into rows.

    Parameters
    ----------
    gpx_path : str
        Directory searched (non-recursively) for GPX files.
    filter : str, optional
        File-name prefix; only files matching ``filter + "*.gpx"`` are read.
        The name shadows the builtin but is kept so existing keyword callers
        continue to work.
    remove_invalid : bool, optional
        When true, a file that cannot be parsed is deleted from disk.
        Defaults to false so that loading data never destroys the input
        files; unparseable files are simply reported and skipped.

    Returns
    -------
    list of list
        One row per track point, with the fields in this order:
        file index, file base name, track index, track name, track start
        time, track 3D length, track duration, track maximum moving speed,
        segment index, segment 3D length, point time, point latitude,
        point longitude, point elevation, point speed.
        An empty list is returned when no file matches.
    """
    # glob.glob returns matches in arbitrary order; sorting keeps file_idx
    # reproducible between runs and platforms.
    gpx_files = sorted(glob.glob(os.path.join(gpx_path, filter + "*.gpx")))
    run_data = []

    for file_idx, gpx_file in enumerate(gpx_files):
        # Parse the GPX file, making sure the file handle is always closed.
        try:
            with open(gpx_file, 'r') as gpx_handle:
                gpx = gpxpy.parse(gpx_handle)
        except Exception as err:
            # Runkeeper often leaves off some XML tags, causing an invalid
            # file. This can be fixed, but for the purpose of this talk we
            # just leave the file out of the data set. Exception (rather
            # than a bare except) lets KeyboardInterrupt/SystemExit through.
            print("Skipping unreadable GPX file {0}: {1}".format(gpx_file, err))
            if remove_invalid:
                os.remove(gpx_file)
            continue

        file_name = os.path.basename(gpx_file)

        for track_idx, track in enumerate(gpx.tracks):
            # Track-level values are computed once and repeated on every
            # point row of the track.
            track_name = track.name
            track_time = track.get_time_bounds().start_time
            track_length = track.length_3d()
            track_duration = track.get_duration()
            track_speed = track.get_moving_data().max_speed

            for seg_idx, segment in enumerate(track.segments):
                segment_length = segment.length_3d()

                for point_idx, point in enumerate(segment.points):
                    run_data.append([
                        file_idx, file_name, track_idx, track_name,
                        track_time, track_length, track_duration, track_speed,
                        seg_idx, segment_length, point.time, point.latitude,
                        point.longitude, point.elevation,
                        segment.get_speed(point_idx),
                    ])

    return run_data
# Load one row per track point from every GPX file in the "rundata" directory
run_data = load_run_data("rundata")
In this case, we just looped through all of the run data and built up a giant two-dimensional list of data. We can now load that data into a Pandas DataFrame so we can easily sort, filter, group, etc.
# Column labels, listed in the same order as the fields of each row
# appended by load_run_data
run_data_columns = [
    'File_Index', 'File_Name', 'Track_Index', 'Track_Name',
    'Track_Time', 'Track_Length', 'Track_Duration', 'Track_Max_Speed',
    'Segment_Index', 'Segment_Length', 'Point_Time', 'Point_Latitude',
    'Point_Longitude', 'Point_Elevation', 'Point_Speed',
]
run_data_df = pd.DataFrame(data=run_data, columns=run_data_columns)
# Show the first rows as a quick sanity check
run_data_df.head()
File_Index | File_Name | Track_Index | Track_Name | Track_Time | Track_Length | Track_Duration | Track_Max_Speed | Segment_Index | Segment_Length | Point_Time | Point_Latitude | Point_Longitude | Point_Elevation | Point_Speed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2011-06-21-0549.gpx | 0 | Running 6/21/11 5:49 am | 2011-06-21 05:49:43 | 4680.687646 | 1715 | 3.600856 | 0 | 4680.687646 | 2011-06-21 05:49:43 | 34.789057 | -86.791616 | 216.7 | NaN |
1 | 0 | 2011-06-21-0549.gpx | 0 | Running 6/21/11 5:49 am | 2011-06-21 05:49:43 | 4680.687646 | 1715 | 3.600856 | 0 | 4680.687646 | 2011-06-21 05:49:45 | 34.789059 | -86.791518 | 216.9 | 3.410757 |
2 | 0 | 2011-06-21-0549.gpx | 0 | Running 6/21/11 5:49 am | 2011-06-21 05:49:43 | 4680.687646 | 1715 | 3.600856 | 0 | 4680.687646 | 2011-06-21 05:49:50 | 34.789063 | -86.791391 | 217.0 | 2.635188 |
3 | 0 | 2011-06-21-0549.gpx | 0 | Running 6/21/11 5:49 am | 2011-06-21 05:49:43 | 4680.687646 | 1715 | 3.600856 | 0 | 4680.687646 | 2011-06-21 05:49:54 | 34.789069 | -86.791262 | 217.1 | 3.281448 |
4 | 0 | 2011-06-21-0549.gpx | 0 | Running 6/21/11 5:49 am | 2011-06-21 05:49:43 | 4680.687646 | 1715 | 3.600856 | 0 | 4680.687646 | 2011-06-21 05:49:57 | 34.789082 | -86.791144 | 217.2 | 3.006575 |
5 rows × 15 columns
Pandas does a great job of detecting the data type of each column automatically. You can see the data type of each column below.
# Display the dtype pandas assigned to each column of the point-level frame
run_data_df.dtypes
File_Index int64 File_Name object Track_Index int64 Track_Name object Track_Time datetime64[ns] Track_Length float64 Track_Duration int64 Track_Max_Speed float64 Segment_Index int64 Segment_Length float64 Point_Time datetime64[ns] Point_Latitude float64 Point_Longitude float64 Point_Elevation float64 Point_Speed float64 dtype: object
Let's look at some summary statistics about our run data month to month.
# Filter Data Frame to just track specific information
run_data_df_tracks = run_data_df[['File_Index', 'File_Name', 'Track_Index', 'Track_Name', 'Track_Time', 'Track_Length', 'Track_Duration', 'Track_Max_Speed']].copy()
# Drop duplicates to get rid of repeated track information that is
# lingering since we chopped off the segments and points
run_data_df_tracks.drop_duplicates(inplace=True)
run_data_df_tracks.head()
File_Index | File_Name | Track_Index | Track_Name | Track_Time | Track_Length | Track_Duration | Track_Max_Speed | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 2011-06-21-0549.gpx | 0 | Running 6/21/11 5:49 am | 2011-06-21 05:49:43 | 4680.687646 | 1715 | 3.600856 |
438 | 1 | 2011-06-23-0545.gpx | 0 | Running 6/23/11 5:45 am | 2011-06-23 05:45:42 | 8330.047449 | 2940 | 3.689208 |
1144 | 2 | 2011-06-25-0641.gpx | 0 | Running 6/25/11 6:41 am | 2011-06-25 06:41:55 | 5370.031909 | 2017 | 3.727692 |
1646 | 3 | 2011-07-31-2029.gpx | 0 | Running 7/31/11 8:29 pm | 2011-07-31 20:29:49 | 5640.499900 | 2099 | 2.890534 |
1690 | 4 | 2011-08-02-0529.gpx | 0 | Running 8/2/11 5:29 am | 2011-08-02 05:29:45 | 9713.389441 | 3320 | 4.087020 |
5 rows × 8 columns
We can look at summary stats for all of our data by grouping the data by year and month
# Add Track Year and Month columns based on track time
run_data_df_tracks['Track_Year'] = run_data_df_tracks['Track_Time'].apply(lambda x: x.year)
run_data_df_tracks['Track_Month'] = run_data_df_tracks['Track_Time'].apply(lambda x: x.month)
tracks_grouped = run_data_df_tracks.groupby(['Track_Year','Track_Month'])
tracks_grouped.describe()
File_Index | Track_Index | Track_Length | Track_Duration | Track_Max_Speed | Track_Year | Track_Month | |||
---|---|---|---|---|---|---|---|---|---|
Track_Year | Track_Month | ||||||||
2011 | 6 | count | 3.000000 | 3 | 3.000000 | 3.000000 | 3.000000 | 3 | 3 |
mean | 1.000000 | 0 | 6126.922335 | 2224.000000 | 3.672585 | 2011 | 6 | ||
std | 1.000000 | 0 | 1938.844780 | 638.195111 | 0.065031 | 0 | 0 | ||
min | 0.000000 | 0 | 4680.687646 | 1715.000000 | 3.600856 | 2011 | 6 | ||
25% | 0.500000 | 0 | 5025.359778 | 1866.000000 | 3.645032 | 2011 | 6 | ||
50% | 1.000000 | 0 | 5370.031909 | 2017.000000 | 3.689208 | 2011 | 6 | ||
75% | 1.500000 | 0 | 6850.039679 | 2478.500000 | 3.708450 | 2011 | 6 | ||
max | 2.000000 | 0 | 8330.047449 | 2940.000000 | 3.727692 | 2011 | 6 | ||
7 | count | 1.000000 | 1 | 1.000000 | 1.000000 | 1.000000 | 1 | 1 | |
mean | 3.000000 | 0 | 5640.499900 | 2099.000000 | 2.890534 | 2011 | 7 | ||
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
min | 3.000000 | 0 | 5640.499900 | 2099.000000 | 2.890534 | 2011 | 7 | ||
25% | 3.000000 | 0 | 5640.499900 | 2099.000000 | 2.890534 | 2011 | 7 | ||
50% | 3.000000 | 0 | 5640.499900 | 2099.000000 | 2.890534 | 2011 | 7 | ||
75% | 3.000000 | 0 | 5640.499900 | 2099.000000 | 2.890534 | 2011 | 7 | ||
max | 3.000000 | 0 | 5640.499900 | 2099.000000 | 2.890534 | 2011 | 7 | ||
8 | count | 5.000000 | 5 | 5.000000 | 5.000000 | 5.000000 | 5 | 5 | |
mean | 6.000000 | 0 | 5987.192455 | 2021.000000 | 4.248593 | 2011 | 8 | ||
std | 1.581139 | 0 | 3475.187474 | 1162.767604 | 0.510012 | 0 | 0 | ||
min | 4.000000 | 0 | 1089.973361 | 355.000000 | 3.785933 | 2011 | 8 | ||
25% | 5.000000 | 0 | 3922.508666 | 1398.000000 | 3.790957 | 2011 | 8 | ||
50% | 6.000000 | 0 | 6918.533350 | 2333.000000 | 4.087020 | 2011 | 8 | ||
75% | 7.000000 | 0 | 8291.557459 | 2699.000000 | 4.736441 | 2011 | 8 | ||
max | 8.000000 | 0 | 9713.389441 | 3320.000000 | 4.842616 | 2011 | 8 | ||
9 | count | 8.000000 | 8 | 8.000000 | 8.000000 | 8.000000 | 8 | 8 | |
mean | 12.500000 | 0 | 8700.187289 | 3310.375000 | 4.019886 | 2011 | 9 | ||
std | 2.449490 | 0 | 7248.425263 | 2160.913427 | 0.414384 | 0 | 0 | ||
min | 9.000000 | 0 | 860.741325 | 829.000000 | 3.570209 | 2011 | 9 | ||
25% | 10.750000 | 0 | 2500.700468 | 2124.500000 | 3.810723 | 2011 | 9 | ||
50% | 12.500000 | 0 | 8385.600070 | 3065.000000 | 3.931056 | 2011 | 9 | ||
75% | 14.250000 | 0 | 11052.604188 | 3916.750000 | 4.096799 | 2011 | 9 | ||
max | 16.000000 | 0 | 22404.907212 | 7233.000000 | 4.927764 | 2011 | 9 | ||
10 | count | 15.000000 | 15 | 15.000000 | 15.000000 | 15.000000 | 15 | 15 | |
mean | 24.000000 | 0 | 10553.908112 | 3574.800000 | 4.845673 | 2011 | 10 | ||
std | 4.472136 | 0 | 6635.553532 | 2117.364372 | 2.119345 | 0 | 0 | ||
min | 17.000000 | 0 | 3667.566086 | 1562.000000 | 3.374733 | 2011 | 10 | ||
25% | 20.500000 | 0 | 5371.677595 | 1968.500000 | 3.702662 | 2011 | 10 | ||
50% | 24.000000 | 0 | 8051.276227 | 2944.000000 | 4.166693 | 2011 | 10 | ||
75% | 27.500000 | 0 | 14635.401354 | 5017.500000 | 4.519542 | 2011 | 10 | ||
max | 31.000000 | 0 | 25231.587060 | 8400.000000 | 9.953947 | 2011 | 10 | ||
11 | count | 4.000000 | 4 | 4.000000 | 4.000000 | 4.000000 | 4 | 4 | |
mean | 33.500000 | 0 | 13661.726287 | 4726.500000 | 4.035178 | 2011 | 11 | ||
std | 1.290994 | 0 | 13068.323855 | 4669.397142 | 0.474684 | 0 | 0 | ||
min | 32.000000 | 0 | 5354.780084 | 1791.000000 | 3.533277 | 2011 | 11 | ||
25% | 32.750000 | 0 | 7297.025557 | 2424.750000 | 3.838214 | 2011 | 11 | ||
50% | 33.500000 | 0 | 8061.031559 | 2707.500000 | 3.964833 | 2011 | 11 | ||
75% | 34.250000 | 0 | 14425.732289 | 5009.250000 | 4.161797 | 2011 | 11 | ||
max | 35.000000 | 0 | 33170.061946 | 11700.000000 | 4.677771 | 2011 | 11 | ||
12 | count | 2.000000 | 2 | 2.000000 | 2.000000 | 2.000000 | 2 | 2 | |
mean | 36.500000 | 0 | 22140.064184 | 8399.000000 | 3.717502 | 2011 | 12 | ||
std | 0.707107 | 0 | 23735.576227 | 9289.968891 | 0.140706 | 0 | 0 | ||
min | 36.000000 | 0 | 5356.477278 | 1830.000000 | 3.618008 | 2011 | 12 | ||
25% | 36.250000 | 0 | 13748.270731 | 5114.500000 | 3.667755 | 2011 | 12 | ||
50% | 36.500000 | 0 | 22140.064184 | 8399.000000 | 3.717502 | 2011 | 12 | ||
75% | 36.750000 | 0 | 30531.857636 | 11683.500000 | 3.767250 | 2011 | 12 | ||
max | 37.000000 | 0 | 38923.651089 | 14968.000000 | 3.816997 | 2011 | 12 | ||
2012 | 1 | count | 4.000000 | 4 | 4.000000 | 4.000000 | 4.000000 | 4 | 4 |
mean | 39.500000 | 0 | 6831.430265 | 2013.000000 | 7.598426 | 2012 | 1 | ||
std | 1.290994 | 0 | 1188.357027 | 887.751091 | 7.406879 | 0 | 0 | ||
min | 38.000000 | 0 | 5315.116285 | 814.000000 | 3.445419 | 2012 | 1 | ||
... | ... | ... | ... | ... | ... | ... |
264 rows × 7 columns
Let's use some of pandas' built-in plot capabilities to view our total distance from month to month.
# Total track length per (year, month) group, drawn as one bar per month
tracks_grouped = run_data_df_tracks.groupby(['Track_Year', 'Track_Month'])
monthly_distance = tracks_grouped['Track_Length'].sum()
monthly_distance.plot(kind='bar', figsize=(15, 5))
# Tilt the (year, month) tick labels
plt.xticks(rotation=70)
plt.ylabel('Distance (meters)')
<matplotlib.text.Text at 0x10fa8c610>
With IPython 2.0, you can easily interact with the plot (assuming you set up an appropriate function). Let's set up a function to plot the total distance run on each day of a single month.
def plot_run_lenth_per_month(year, month):
    """Bar-plot the summed track length for each day of one month.

    Looks up the ``(year, month)`` group in the module-level
    ``tracks_grouped`` object and plots the sum of ``Track_Length`` per day
    of the month.

    Parameters
    ----------
    year : int
        Year component of the group key.
    month : int
        Month component of the group key.

    Returns
    -------
    None
        The plot is drawn as a side effect. ``"No data"`` is printed instead
        when ``tracks_grouped`` has no group for ``(year, month)``.
    """
    # Only the group lookup is guarded: a missing key is the expected
    # "nothing recorded that month" case, while any other failure is a real
    # error that should not be hidden behind "No data".
    try:
        focus_group = tracks_grouped.get_group((year, month))
    except KeyError:
        print("No data")
        return

    # Index by start time so the rows can be grouped by day of the month
    focus_group = focus_group.set_index('Track_Time')
    daily_length = focus_group.groupby(lambda x: x.day)['Track_Length'].sum()
    # Reindex over days 1-31 so every month is drawn on the same x axis,
    # with days that have no track left empty
    daily_length.reindex(range(1, 32)).plot(kind='bar', figsize=(15, 5))
# NOTE(review): IPython.html.widgets is where IPython 2.x/3.x kept the
# widgets; later releases appear to ship them as the separate ipywidgets
# package — confirm against the IPython version this notebook targets.
from IPython.html.widgets.interaction import interactive
# Wrap the plotting function in interactive controls; year and month are
# each given as a (min, max) range
interactive(plot_run_lenth_per_month, year=(2010,2014), month=(1,12))
What if we wanted to see the distribution of daily run lengths over all months... that's easy!
# Use Pandas built in box plot functionality
run_data_df_tracks.boxplot(column='Track_Length', by=['Track_Year','Track_Month'], figsize=(15,5))
ax = plt.gca()
plt.xticks(rotation=70)
plt.ylabel('Distance (meters)')
<matplotlib.text.Text at 0x10f3b6c10>
Seaborn has some very nice statistical plotting capabilities built on top of matplotlib. Let's look at a box plot of each month's runs regardless of year to see what trends we find.
# Track lengths boxed by calendar month, pooling all years together
fig = plt.figure(figsize=(15, 6))
track_lengths = run_data_df_tracks['Track_Length']
track_months = run_data_df_tracks['Track_Month']
# Arguments stay positional, exactly as in the original call, because their
# meaning depends on the installed seaborn version
sb.boxplot(track_lengths, track_months)
<matplotlib.axes.AxesSubplot at 0x1102d7710>
Seaborn comes with some cool options for viewing distributions. Let's see a violin plot.
# Same month-by-month view as a violin plot, with inner="stick"
fig = plt.figure(figsize=(15, 6))
track_lengths = run_data_df_tracks['Track_Length']
track_months = run_data_df_tracks['Track_Month']
# Arguments stay positional, exactly as in the original call, because their
# meaning depends on the installed seaborn version
sb.violinplot(track_lengths, track_months, inner="stick")
<matplotlib.axes.AxesSubplot at 0x1190aec50>