import pandas as pd
import numpy as np
## Earthquake data
fileUrl = 'http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt'
eData = pd.read_csv(fileUrl)
dateDownloaded = !date
dateDownloaded
['Mon Mar 18 21:45:48 CET 2013']
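# the !date shell magic above only works inside IPython; a portable
# sketch is to record the timestamp with the standard library instead
import datetime
dateDownloaded = datetime.datetime.utcnow()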
# looking at data
# for large data, only a summary is shown
eData
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1126 entries, 0 to 1125
Data columns:
Src          1126  non-null values
Eqid         1126  non-null values
Version      1126  non-null values
Datetime     1126  non-null values
Lat          1126  non-null values
Lon          1126  non-null values
Magnitude    1126  non-null values
Depth        1126  non-null values
NST          1126  non-null values
Region       1126  non-null values
dtypes: float64(4), int64(1), object(5)
eData.head()
  | Src | Eqid | Version | Datetime | Lat | Lon | Magnitude | Depth | NST | Region
---|---|---|---|---|---|---|---|---|---|---
0 | ci | 15309081 | 0 | Monday, March 18, 2013 20:39:21 UTC | 34.2713 | -117.2475 | 1.8 | 30.1 | 14 | Southern California |
1 | ci | 15309073 | 0 | Monday, March 18, 2013 20:38:22 UTC | 33.5075 | -116.4293 | 1.0 | 6.6 | 32 | Southern California |
2 | ci | 15309057 | 0 | Monday, March 18, 2013 20:28:36 UTC | 33.5195 | -116.4300 | 1.0 | 11.3 | 40 | Southern California |
3 | nn | 00406332 | 1 | Monday, March 18, 2013 20:22:00 UTC | 38.2462 | -118.6417 | 1.0 | 7.0 | 10 | Nevada |
4 | ci | 15309049 | 0 | Monday, March 18, 2013 20:21:41 UTC | 34.4252 | -117.0105 | 2.1 | 1.1 | 11 | Southern California |
# equivalent to R's dim()
eData.shape
(1126, 10)
# equivalent to R's names()
eData.columns
Index([Src, Eqid, Version, Datetime, Lat, Lon, Magnitude, Depth, NST, Region], dtype=object)
# computing quantiles, equivalent to R's quantile()
p = [0, 0.25, 0.5, 0.75, 1]
[eData['Lat'].quantile(q=i) for i in p]
[-55.428199999999997, 33.512300000000003, 38.027299999999997, 53.807400000000001, 68.162499999999994]
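# note: newer pandas versions accept a list of probabilities directly,
# which is an even closer match to R's quantile()
eData['Lat'].quantile(q=p)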
# equivalent to (but not as detailed as) R summary()
eData.describe()
  | Lat | Lon | Magnitude | Depth | NST
---|---|---|---|---|---
count | 1126.000000 | 1126.000000 | 1126.000000 | 1126.000000 | 1126.000000 |
mean | 40.467150 | -115.068258 | 1.841741 | 23.956217 | 34.748668 |
std | 16.661746 | 58.041865 | 0.964461 | 48.891918 | 29.083693 |
min | -55.428200 | -179.967400 | 1.000000 | 0.000000 | 0.000000 |
25% | 33.512300 | -146.874675 | 1.200000 | 4.700000 | 15.000000 |
50% | 38.027300 | -119.670800 | 1.500000 | 10.400000 | 26.000000 |
75% | 53.807400 | -116.459800 | 2.100000 | 17.675000 | 44.000000 |
max | 68.162500 | 167.418600 | 5.600000 | 607.300000 | 254.000000 |
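# a sketch for newer pandas versions: include='all' also summarizes the
# non-numeric columns, coming closer to R's summary()
eData.describe(include='all')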
# equivalent to R's class()
type(eData)
pandas.core.frame.DataFrame
# get the data types of all columns
# similarly to the method used in the video,
# we just apply the function type() to the values in the first row;
# zip() is used just for nice printing
zip(eData.columns, [type(x) for x in eData.ix[0, :]])
[('Src', str), ('Eqid', str), ('Version', str), ('Datetime', str), ('Lat', numpy.float64), ('Lon', numpy.float64), ('Magnitude', numpy.float64), ('Depth', numpy.float64), ('NST', numpy.int64), ('Region', str)]
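# a more direct route, without inspecting a row: the dtypes attribute
# lists each column's type, much like R's sapply(df, class)
eData.dtypes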
# equivalent to R's unique() command
eData['Src'].unique()
array(['ci', 'nn', 'nc', 'us', 'ak', 'hv', 'uw', 'uu', 'mb', 'pr', 'ld', 'nm'], dtype=object)
# equivalent to R's length() command
len(eData['Src'].unique())
12
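# pandas also offers a shortcut for counting distinct values
eData['Src'].nunique()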
# for this particular case, value_counts() is equivalent to R's table()
# (an alternative would be eData.groupby('Src').size())
eData['Src'].value_counts()
ci    375
ak    306
nc    212
us     88
nn     67
uw     22
hv     18
uu     16
pr     14
ld      4
nm      3
mb      1
# equivalent to R's table() to compute frequency table
pd.crosstab(eData['Src'], eData['Version'])
Src \ Version | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | A | B | D | N
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
ak | 0 | 77 | 214 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ci | 170 | 60 | 49 | 72 | 21 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
hv | 0 | 10 | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
ld | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
mb | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
nc | 91 | 55 | 28 | 25 | 6 | 5 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
nm | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 |
nn | 0 | 67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
pr | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
us | 0 | 0 | 0 | 2 | 10 | 18 | 26 | 16 | 10 | 2 | 1 | 1 | 1 | 1 |
uu | 0 | 0 | 5 | 2 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
uw | 0 | 17 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
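# a sketch going one step further: margins=True appends row and column
# totals, similar to R's addmargins(table(...))
pd.crosstab(eData['Src'], eData['Version'], margins=True)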
eData.ix[0:9,'Lat']
0    34.2713
1    33.5075
2    33.5195
3    38.2462
4    34.4252
5    33.8267
6    32.6598
7    33.8420
8    33.9153
9    38.0265
Name: Lat
eData.ix[0:9,'Lat'] > 40
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: Lat
# equivalent to R's any()
(eData.ix[0:9,'Lat'] > 40).any()
False
# equivalent to R's all()
(eData.ix[0:9,'Lat'] > 40).all()
False
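# equivalent to R's sum() on a logical vector: count the True values
(eData['Lat'][:10] > 40).sum()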
# looking at subsets; very similar to R's & operator
eData[(eData['Lat'] > 0) & (eData['Lon'] > 0)][['Lat', 'Lon']][:10]
  | Lat | Lon
---|---|---
85 | 36.2654 | 69.7945 |
86 | 36.5114 | 70.0785 |
102 | 36.4774 | 140.4687 |
132 | 2.4794 | 128.6353 |
133 | 34.0983 | 135.4945 |
183 | 44.3796 | 148.9563 |
221 | 36.5178 | 70.8805 |
246 | 9.6966 | 125.3708 |
253 | 36.1766 | 141.4287 |
295 | 35.7555 | 5.6368 |
# looking at subsets; very similar to R's | operator
eData[(eData['Lat'] > 0) | (eData['Lon'] > 0)][['Lat', 'Lon']][-10:]
  | Lat | Lon
---|---|---
1116 | 33.5147 | -116.4460 |
1117 | 63.2469 | -150.4575 |
1118 | 33.5015 | -116.4617 |
1119 | 63.4993 | -146.4718 |
1120 | 49.3783 | -120.4828 |
1121 | 33.5075 | -116.4638 |
1122 | 33.5103 | -116.4387 |
1123 | 38.7893 | -122.7495 |
1124 | 33.5040 | -116.4415 |
1125 | 33.5047 | -116.4570 |
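# a sketch of an alternative available in newer pandas versions: query()
# reads much like R's subset()
eData.query('Lat > 0 and Lon > 0')[['Lat', 'Lon']].head(10)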
## Peer review experiment data
fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'
fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'
reviews = pd.read_csv(fileUrl1)
solutions = pd.read_csv(fileUrl2)
reviews.head(2)
  | id | solution_id | reviewer_id | start | stop | time_left | accept
---|---|---|---|---|---|---|---
0 | 1 | 3 | 27 | 1304095698 | 1304095758 | 1754 | 1 |
1 | 2 | 4 | 22 | 1304095188 | 1304095206 | 2306 | 1 |
solutions.head(2)
  | id | problem_id | subject_id | start | stop | time_left | answer
---|---|---|---|---|---|---|---
0 | 1 | 156 | 29 | 1304095119 | 1304095169 | 2343 | B |
1 | 2 | 269 | 25 | 1304095119 | 1304095183 | 2329 | C |
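# the two tables can be joined much like with R's merge(); a sketch,
# assuming each reviews.solution_id refers to a solutions.id
pd.merge(reviews, solutions, left_on='solution_id', right_on='id',
         suffixes=('_rev', '_sol')).head(2)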
# find if there are missing values; equivalent to R's is.na()
reviews.ix[0:9,'time_left'].isnull()
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8    False
9    False
Name: time_left
reviews['time_left'].isnull().sum()
84
reviews['time_left'].isnull().value_counts()
False    115
True      84
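# counting the missing values in every column at once,
# equivalent to R's colSums(is.na(reviews))
reviews.isnull().sum()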
# equivalent to R's colSums()
# notice we need to specifically include missing values with the option skipna=False;
# this is in contrast to R, where the default is to *include* NAs
reviews.sum(skipna=False)
id             19900
solution_id    19929
reviewer_id     5064
start            NaN
stop             NaN
time_left        NaN
accept           NaN
# equivalent to R's colMeans()
# same remark as above: missing values are excluded by default
reviews.mean()
id             1.000000e+02
solution_id    1.001457e+02
reviewer_id    2.544724e+01
start          1.304096e+09
stop           1.304096e+09
time_left      1.114287e+03
accept         6.434783e-01
# equivalent to R's rowMeans()
reviews.mean(axis=1)
0     3.725990e+08
1     3.725990e+08
2     3.725990e+08
3     3.725990e+08
4     3.725990e+08
5     3.725990e+08
6     3.725990e+08
7     1.300000e+01
8     3.725990e+08
9     3.725990e+08
10    3.725990e+08
11    3.725990e+08
12    3.725990e+08
13    3.725990e+08
14    3.725990e+08
...
184    1.326667e+02
185    1.333333e+02
186    1.343333e+02
187    1.340000e+02
188    3.725993e+08
189    1.356667e+02
190    1.370000e+02
191    3.725993e+08
192    1.390000e+02
193    1.383333e+02
194    1.366667e+02
195    1.410000e+02
196    1.396667e+02
197    1.420000e+02
198    1.393333e+02
Length: 199
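# to mimic R's rowMeans() default, where NAs propagate, pass skipna=False
reviews.mean(axis=1, skipna=False)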