# Tutorial script: cleaning column names, binning a numeric column and
# merging two data frames with pandas (with notes on the R equivalents).
#
# NOTE: bare expressions such as ``cameraData.columns`` only display a value
# in an interactive session (REPL / notebook); run as a plain script they are
# evaluated and discarded. They are kept so behaviour is unchanged.
import pandas as pd
import numpy as np

cameraData = pd.read_csv('../data/cameras.csv')
cameraData.columns
# somehow the column is named 'Location 1' instead of 'Location.1'

# transform column names to lowercase
# equivalent to R's tolower()
cameraData.columns = cameraData.columns.map(lambda x: x.lower())
cameraData.columns

# string split; split a string into list of strings
# equivalent to R's strsplit()
splitNames = cameraData.columns.map(lambda x: x.split(' '))
splitNames[4]
splitNames[5]
splitNames[5][0]

# apply function to names
cameraData.columns.map(lambda x: x.split(' ')[0])

# NOTE(review): these are legacy Dropbox links -- confirm they still resolve.
fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'
fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'
reviews = pd.read_csv(fileUrl1)
solutions = pd.read_csv(fileUrl2)
reviews.head(2)
solutions.head(2)
reviews.columns

# remove underscores
# equivalent to R's sub()
reviews.columns = reviews.columns.map(lambda x: x.replace('_', ''))
solutions.columns = solutions.columns.map(lambda x: x.replace('_', ''))

# test; every occurrence is replaced
# in contrast to R's sub(), where only one occurrence is replaced
testName = 'this_is_a_test'
testName.replace('_', '')

reviews['timeleft'][:10]

# cut into ranges
# this is equivalent to R's cut()
# list(...) keeps the bin edges an explicit sequence on both Python 2 and 3
timeRanges = pd.cut(reviews['timeleft'], list(range(0, 4000, 600)))
# note it needs to go up to 4000 in order to include 3600
timeRanges[:10]
type(timeRanges)
# the top-level pd.value_counts() is deprecated; use the method instead
timeRanges.value_counts()

# equivalent to R's cut2()
timeRanges = pd.cut(reviews['timeleft'], 6)
timeRanges.value_counts()
# note that NaN values are excluded; the resulting ranges are thus different
# from video

# adding an extra variable
# similarly to R, simply assign to a new column
reviews['timeRanges'] = timeRanges
reviews.head(2)

# print(...) with a single argument behaves the same on Python 2 and 3
print(reviews.columns)
print(solutions.columns)

mergedData2 = pd.merge(reviews, solutions,
                       left_on='solutionid', right_on='id', sort=True)
# .ix has been removed from pandas; .iloc is the positional equivalent here
# (first six columns, and first row of the default integer index)
mergedData2.iloc[:, 0:6].head(3)
reviews.iloc[0, 0:6]

mergedData2['reviewerid'][:10]
# Sorting the merged frame and reshaping a small example frame.
# Relies on `pd`, `np` and `mergedData2` defined earlier in this script.

# Series.order() has been removed from pandas; sort_values() replaces it
mergedData2['reviewerid'].sort_values()[:10]
# same result, obtained by re-indexing the column with the sorted index
mergedData2['reviewerid'][mergedData2['reviewerid'].sort_values().index][:10]

# .ix has been removed from pandas; .iloc selects the first six columns
mergedData2.iloc[:, 0:6].head(3)
# DataFrame.sort() has been removed from pandas; sort_values() replaces it
sortedData = mergedData2.sort_values(['reviewerid'])
sortedData.iloc[:, 0:6].head(3)

# sort by reviewer first, then by 'id_x' within each reviewer
sortedData = mergedData2.sort_values(['reviewerid', 'id_x'])
sortedData.iloc[:, 0:6].head(3)

# bare `NaN` is only defined inside a pylab-style namespace; use np.nan
misShaped = pd.DataFrame({'treatmentA': [np.nan, 1, 2],
                          'treatmentB': [5, 4, 3]})
misShaped['people'] = ['John', 'Jane', 'Mary']
misShaped

# equivalent to R's melt()
pd.melt(misShaped, id_vars='people')