import pandas as pd
import numpy as np
## Earthquake data
fileUrl = 'http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt'
eData = pd.read_csv(fileUrl)
dateDownloaded = !date
dateDownloaded
['Mon Mar 18 21:45:48 CET 2013']
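# the !date shell magic above only works inside IPython; a portable
# sketch is to record the timestamp with the standard library instead
import datetime
dateDownloaded = datetime.datetime.utcnow()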
# looking at data
# for large data, only a summary is shown
eData
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1126 entries, 0 to 1125
Data columns:
Src          1126  non-null values
Eqid         1126  non-null values
Version      1126  non-null values
Datetime     1126  non-null values
Lat          1126  non-null values
Lon          1126  non-null values
Magnitude    1126  non-null values
Depth        1126  non-null values
NST          1126  non-null values
Region       1126  non-null values
dtypes: float64(4), int64(1), object(5)
eData.head()
  | Src | Eqid | Version | Datetime | Lat | Lon | Magnitude | Depth | NST | Region
---|---|---|---|---|---|---|---|---|---|---
0 | ci | 15309081 | 0 | Monday, March 18, 2013 20:39:21 UTC | 34.2713 | -117.2475 | 1.8 | 30.1 | 14 | Southern California |
1 | ci | 15309073 | 0 | Monday, March 18, 2013 20:38:22 UTC | 33.5075 | -116.4293 | 1.0 | 6.6 | 32 | Southern California |
2 | ci | 15309057 | 0 | Monday, March 18, 2013 20:28:36 UTC | 33.5195 | -116.4300 | 1.0 | 11.3 | 40 | Southern California |
3 | nn | 00406332 | 1 | Monday, March 18, 2013 20:22:00 UTC | 38.2462 | -118.6417 | 1.0 | 7.0 | 10 | Nevada |
4 | ci | 15309049 | 0 | Monday, March 18, 2013 20:21:41 UTC | 34.4252 | -117.0105 | 2.1 | 1.1 | 11 | Southern California |
# equivalent to R's dim()
eData.shape
(1126, 10)
# equivalent to R's names()
eData.columns
Index([Src, Eqid, Version, Datetime, Lat, Lon, Magnitude, Depth, NST, Region], dtype=object)
# computing quantiles, equivalent to R's quantile()
p = [0, 0.25, 0.5, 0.75, 1]
[eData['Lat'].quantile(q=i) for i in p]
[-55.428199999999997, 33.512300000000003, 38.027299999999997, 53.807400000000001, 68.162499999999994]
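# note: newer pandas versions accept a list of probabilities directly,
# which is an even closer match to R's quantile()
eData['Lat'].quantile(q=p)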
# equivalent to (but not as detailed as) R summary()
eData.describe()
  | Lat | Lon | Magnitude | Depth | NST
---|---|---|---|---|---
count | 1126.000000 | 1126.000000 | 1126.000000 | 1126.000000 | 1126.000000 |
mean | 40.467150 | -115.068258 | 1.841741 | 23.956217 | 34.748668 |
std | 16.661746 | 58.041865 | 0.964461 | 48.891918 | 29.083693 |
min | -55.428200 | -179.967400 | 1.000000 | 0.000000 | 0.000000 |
25% | 33.512300 | -146.874675 | 1.200000 | 4.700000 | 15.000000 |
50% | 38.027300 | -119.670800 | 1.500000 | 10.400000 | 26.000000 |
75% | 53.807400 | -116.459800 | 2.100000 | 17.675000 | 44.000000 |
max | 68.162500 | 167.418600 | 5.600000 | 607.300000 | 254.000000 |
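# a sketch for newer pandas versions: include='all' also summarizes the
# non-numeric columns, coming closer to R's summary()
eData.describe(include='all')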
# equivalent to R's class()
type(eData)
pandas.core.frame.DataFrame
# get the data types of all columns
# similarly to the method used in the video,
# we just apply the function type() to the values in the first row;
# zip() is used just for nice printing
zip(eData.columns, [type(x) for x in eData.ix[0, :]])
[('Src', str), ('Eqid', str), ('Version', str), ('Datetime', str), ('Lat', numpy.float64), ('Lon', numpy.float64), ('Magnitude', numpy.float64), ('Depth', numpy.float64), ('NST', numpy.int64), ('Region', str)]
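# a more direct route, without inspecting a row: the dtypes attribute
# lists each column's type, much like R's sapply(df, class)
eData.dtypes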
# equivalent to R's unique() command
eData['Src'].unique()
array(['ci', 'nn', 'nc', 'us', 'ak', 'hv', 'uw', 'uu', 'mb', 'pr', 'ld', 'nm'], dtype=object)
# equivalent to R's length() command
len(eData['Src'].unique())
12
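# pandas also offers a shortcut for counting distinct values
eData['Src'].nunique()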
# for this particular case, value_counts() is equivalent to R's table()
# (an alternative would be eData.groupby('Src').size())
eData['Src'].value_counts()
ci    375
ak    306
nc    212
us     88
nn     67
uw     22
hv     18
uu     16
pr     14
ld      4
nm      3
mb      1
# equivalent to R's table() to compute frequency table
pd.crosstab(eData['Src'], eData['Version'])
Src \ Version | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | A | B | D | N
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
ak | 0 | 77 | 214 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ci | 170 | 60 | 49 | 72 | 21 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
hv | 0 | 10 | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ld | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
mb | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
nc | 91 | 55 | 28 | 25 | 6 | 5 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
nm | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 |
nn | 0 | 67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
pr | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
us | 0 | 0 | 0 | 2 | 10 | 18 | 26 | 16 | 10 | 2 | 1 | 1 | 1 | 1 |
uu | 0 | 0 | 5 | 2 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
uw | 0 | 17 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
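# a sketch going one step further: margins=True appends row and column
# totals, similar to R's addmargins(table(...))
pd.crosstab(eData['Src'], eData['Version'], margins=True)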
eData.ix[0:9,'Lat']
0    34.2713
1    33.5075
2    33.5195
3    38.2462
4    34.4252
5    33.8267
6    32.6598
7    33.8420
8    33.9153
9    38.0265
Name: Lat
eData.ix[0:9,'Lat'] > 40
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: Lat
# equivalent to R's any()
(eData.ix[0:9,'Lat'] > 40).any()
False
# equivalent to R's all()
(eData.ix[0:9,'Lat'] > 40).all()
False
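# equivalent to R's sum() on a logical vector: count the True values
(eData['Lat'][:10] > 40).sum()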
# looking at subsets; very similar to R's & operator
eData[(eData['Lat'] > 0) & (eData['Lon'] > 0)][['Lat', 'Lon']][:10]
  | Lat | Lon
---|---|---
85 | 36.2654 | 69.7945 |
86 | 36.5114 | 70.0785 |
102 | 36.4774 | 140.4687 |
132 | 2.4794 | 128.6353 |
133 | 34.0983 | 135.4945 |
183 | 44.3796 | 148.9563 |
221 | 36.5178 | 70.8805 |
246 | 9.6966 | 125.3708 |
253 | 36.1766 | 141.4287 |
295 | 35.7555 | 5.6368 |
# looking at subsets; very similar to R's | operator
eData[(eData['Lat'] > 0) | (eData['Lon'] > 0)][['Lat', 'Lon']][-10:]
  | Lat | Lon
---|---|---
1116 | 33.5147 | -116.4460 |
1117 | 63.2469 | -150.4575 |
1118 | 33.5015 | -116.4617 |
1119 | 63.4993 | -146.4718 |
1120 | 49.3783 | -120.4828 |
1121 | 33.5075 | -116.4638 |
1122 | 33.5103 | -116.4387 |
1123 | 38.7893 | -122.7495 |
1124 | 33.5040 | -116.4415 |
1125 | 33.5047 | -116.4570 |
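# a sketch of an alternative available in newer pandas versions: query()
# reads much like R's subset()
eData.query('Lat > 0 and Lon > 0')[['Lat', 'Lon']].head(10)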
## Peer review experiment data
fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'
fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'
reviews = pd.read_csv(fileUrl1)
solutions = pd.read_csv(fileUrl2)
reviews.head(2)
  | id | solution_id | reviewer_id | start | stop | time_left | accept
---|---|---|---|---|---|---|---
0 | 1 | 3 | 27 | 1304095698 | 1304095758 | 1754 | 1 |
1 | 2 | 4 | 22 | 1304095188 | 1304095206 | 2306 | 1 |
solutions.head(2)
  | id | problem_id | subject_id | start | stop | time_left | answer
---|---|---|---|---|---|---|---
0 | 1 | 156 | 29 | 1304095119 | 1304095169 | 2343 | B |
1 | 2 | 269 | 25 | 1304095119 | 1304095183 | 2329 | C |
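# the two tables can be joined much like with R's merge(); a sketch,
# assuming each reviews.solution_id refers to a solutions.id
pd.merge(reviews, solutions, left_on='solution_id', right_on='id',
         suffixes=('_rev', '_sol')).head(2)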
# find if there are missing values; equivalent to R's is.na()
reviews.ix[0:9,'time_left'].isnull()
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8    False
9    False
Name: time_left
reviews['time_left'].isnull().sum()
84
reviews['time_left'].isnull().value_counts()
False    115
True      84
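# counting the missing values in every column at once,
# equivalent to R's colSums(is.na(reviews))
reviews.isnull().sum()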
# equivalent to R's colSums()
# notice we need to specifically include missing values with the option skipna=False;
# this is in contrast to R, where the default is to *include* NAs
reviews.sum(skipna=False)
id             19900
solution_id    19929
reviewer_id     5064
start            NaN
stop             NaN
time_left        NaN
accept           NaN
# equivalent to R's colMeans()
# same remark as above: missing values are excluded by default
reviews.mean()
id             1.000000e+02
solution_id    1.001457e+02
reviewer_id    2.544724e+01
start          1.304096e+09
stop           1.304096e+09
time_left      1.114287e+03
accept         6.434783e-01
# equivalent to R's rowMeans()
reviews.mean(axis=1)
0     3.725990e+08
1     3.725990e+08
2     3.725990e+08
3     3.725990e+08
4     3.725990e+08
5     3.725990e+08
6     3.725990e+08
7     1.300000e+01
8     3.725990e+08
9     3.725990e+08
10    3.725990e+08
11    3.725990e+08
12    3.725990e+08
13    3.725990e+08
14    3.725990e+08
...
184    1.326667e+02
185    1.333333e+02
186    1.343333e+02
187    1.340000e+02
188    3.725993e+08
189    1.356667e+02
190    1.370000e+02
191    3.725993e+08
192    1.390000e+02
193    1.383333e+02
194    1.366667e+02
195    1.410000e+02
196    1.396667e+02
197    1.420000e+02
198    1.393333e+02
Length: 199
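# to mimic R's rowMeans() default, where NAs propagate, pass skipna=False
reviews.mean(axis=1, skipna=False)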