import pandas as pd
import numpy as np
Camera data
cameraData = pd.read_csv('../data/cameras.csv')
cameraData.columns
# somehow the column is named 'Location 1' instead of 'Location.1'
Index([address, direction, street, crossStreet, intersection, Location 1], dtype=object)
# transform column names to lowercase
# equivalent to R's tolower()
cameraData.columns = cameraData.columns.map(lambda x: x.lower())
cameraData.columns
Index([address, direction, street, crossstreet, intersection, location 1], dtype=object)
# string split; split a string into list of strings
# equivalent to R's strsplit()
splitNames = cameraData.columns.map(lambda x: x.split(' '))
splitNames[4]
['intersection']
splitNames[5]
['location', '1']
splitNames[5][0]
'location'
# apply function to names
cameraData.columns.map(lambda x: x.split(' ')[0])
array(['address', 'direction', 'street', 'crossstreet', 'intersection', 'location'], dtype=object)
Peer review experiment data
fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'
fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'
reviews = pd.read_csv(fileUrl1)
solutions = pd.read_csv(fileUrl2)
reviews.head(2)
id | solution_id | reviewer_id | start | stop | time_left | accept | |
---|---|---|---|---|---|---|---|
0 | 1 | 3 | 27 | 1304095698 | 1304095758 | 1754 | 1 |
1 | 2 | 4 | 22 | 1304095188 | 1304095206 | 2306 | 1 |
solutions.head(2)
id | problem_id | subject_id | start | stop | time_left | answer | |
---|---|---|---|---|---|---|---|
0 | 1 | 156 | 29 | 1304095119 | 1304095169 | 2343 | B |
1 | 2 | 269 | 25 | 1304095119 | 1304095183 | 2329 | C |
reviews.columns
Index([id, solution_id, reviewer_id, start, stop, time_left, accept], dtype=object)
# remove underscores
# equivalent to R's sub()
reviews.columns = reviews.columns.map(lambda x: x.replace('_', ''))
solutions.columns = solutions.columns.map(lambda x: x.replace('_', ''))
# test; every occurence is replaced
# in contrast to R's sub(), where only one occurence is replaced
testName = 'this_is_a_test'
testName.replace('_', '')
'thisisatest'
reviews['timeleft'][:10]
0 1754 1 2306 2 2192 3 2089 4 2043 5 1999 6 2130 7 NaN 8 1899 9 2024 Name: timeleft
# cut into ranges
# this is equivalent to R's cut()
timeRanges = pd.cut(reviews['timeleft'], range(0, 4000, 600)) # note it needs to go up to 4000 in order to include 3600
timeRanges[:10]
Categorical: array(['(1200, 1800]', '(1800, 2400]', '(1800, 2400]', '(1800, 2400]', '(1800, 2400]', '(1800, 2400]', '(1800, 2400]', nan, '(1800, 2400]', '(1800, 2400]'], dtype=object) Levels (6): Index(['(0, 600]', '(600, 1200]', '(1200, 1800]', '(1800, 2400]', '(2400, 3000]', '(3000, 3600]'], dtype=object)
type(timeRanges)
pandas.core.categorical.Categorical
pd.value_counts(timeRanges)
(600, 1200] 32 (0, 600] 30 (1800, 2400] 28 (1200, 1800] 25
# equivalent to R's cut2()
timeRanges = pd.cut(reviews['timeleft'], 6)
pd.value_counts(timeRanges) # note that NaN values are excluded; the resulting ranges are thus different from video
(19.716, 402.667] 22 (1164, 1544.667] 20 (783.333, 1164] 19 (1544.667, 1925.333] 19 (402.667, 783.333] 19 (1925.333, 2306] 16
# adding an extra variable
# similarly to R, simply assign to a new column
reviews['timeRanges'] = timeRanges
reviews.head(2)
id | solutionid | reviewerid | start | stop | timeleft | accept | timeRanges | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 3 | 27 | 1304095698 | 1304095758 | 1754 | 1 | (1544.667, 1925.333] |
1 | 2 | 4 | 22 | 1304095188 | 1304095206 | 2306 | 1 | (1925.333, 2306] |
Merging data
First let's have a look at the columns:
print reviews.columns
print solutions.columns
Index([id, solutionid, reviewerid, start, stop, timeleft, accept, timeRanges], dtype=object) Index([id, problemid, subjectid, start, stop, timeleft, answer], dtype=object)
Then merge; this is equivalent to R's merge()
mergedData2 = pd.merge(reviews, solutions, left_on='solutionid', right_on='id', sort=True)
mergedData2.ix[:,0:6].head(3)
id_x | solutionid | reviewerid | start_x | stop_x | timeleft_x | |
---|---|---|---|---|---|---|
0 | 4 | 1 | 26 | 1304095267 | 1304095423 | 2089 |
1 | 6 | 2 | 29 | 1304095471 | 1304095513 | 1999 |
2 | 1 | 3 | 27 | 1304095698 | 1304095758 | 1754 |
reviews.ix[0,0:6]
id 1 solutionid 3 reviewerid 27 start 1.304096e+09 stop 1.304096e+09 timeleft 1754 Name: 0
Sorting values
First let's have a look at the unsorted values:
mergedData2['reviewerid'][:10]
0 26 1 29 2 27 3 22 4 28 5 22 6 29 7 23 8 25 9 29 Name: reviewerid
Then sort; this is equivalent to R's sort()
mergedData2['reviewerid'].order()[:10]
3 22 5 22 13 22 21 22 22 22 23 22 26 22 31 22 36 22 38 22 Name: reviewerid
R's order
can be implemented by getting the index of the sorted Series:
mergedData2['reviewerid'][mergedData2['reviewerid'].order().index][:10]
3 22 5 22 13 22 21 22 22 22 23 22 26 22 31 22 36 22 38 22 Name: reviewerid
Reordering a DataFrame
The original DataFrame ordering:
mergedData2.ix[:,0:6].head(3)
id_x | solutionid | reviewerid | start_x | stop_x | timeleft_x | |
---|---|---|---|---|---|---|
0 | 4 | 1 | 26 | 1304095267 | 1304095423 | 2089 |
1 | 6 | 2 | 29 | 1304095471 | 1304095513 | 1999 |
2 | 1 | 3 | 27 | 1304095698 | 1304095758 | 1754 |
Order by reviewerid
:
sortedData = mergedData2.sort(['reviewerid'])
sortedData.ix[:,0:6].head(3)
id_x | solutionid | reviewerid | start_x | stop_x | timeleft_x | |
---|---|---|---|---|---|---|
3 | 2 | 4 | 22 | 1304095188 | 1304095206 | 2306 |
5 | 16 | 6 | 22 | 1304095303 | 1304095471 | 2041 |
13 | 12 | 14 | 22 | 1304095280 | 1304095301 | 2211 |
Reordering by multiple columns:
sortedData = mergedData2.sort(['reviewerid', 'id_x'])
sortedData.ix[:,0:6].head(3)
id_x | solutionid | reviewerid | start_x | stop_x | timeleft_x | |
---|---|---|---|---|---|---|
3 | 2 | 4 | 22 | 1304095188 | 1304095206 | 2306 |
13 | 12 | 14 | 22 | 1304095280 | 1304095301 | 2211 |
5 | 16 | 6 | 22 | 1304095303 | 1304095471 | 2041 |
Reshaping data - example
misShaped = pd.DataFrame({'treatmentA' : [NaN, 1, 2], 'treatmentB' : [5, 4, 3]})
misShaped['people'] = ['John', 'Jane', 'Mary']
misShaped
treatmentA | treatmentB | people | |
---|---|---|---|
0 | NaN | 5 | John |
1 | 1 | 4 | Jane |
2 | 2 | 3 | Mary |
# equivalent to R's melt()
pd.melt(misShaped, id_vars='people')
people | variable | value | |
---|---|---|---|
0 | John | treatmentA | NaN |
1 | Jane | treatmentA | 1 |
2 | Mary | treatmentA | 2 |
3 | John | treatmentB | 5 |
4 | Jane | treatmentB | 4 |
5 | Mary | treatmentB | 3 |