# Walk-through of pandas counterparts to common R data-inspection functions,
# applied to a USGS earthquake feed and to two review/solution CSV files.
#
# NOTE(review): this looks like an exported notebook -- the bare expressions
# below only display a value in an interactive session; when run as a plain
# script they are evaluated and their result is discarded.
import time

import numpy as np
import pandas as pd

fileUrl = 'http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt'
eData = pd.read_csv(fileUrl)

# Record when the data was fetched. The IPython-only `!date` shell capture is
# not valid Python, so the timestamp is built with the standard library and
# kept as a one-item list to mirror the list-of-lines shape of a shell capture.
dateDownloaded = [time.strftime('%a %b %d %H:%M:%S %Z %Y')]
dateDownloaded

# looking at data
# for large data, only a summary is shown
eData
eData.head()

# equivalent to R's dim()
eData.shape

# equivalent to R's names()
eData.columns

# computing quantiles, equivalent to R's quantile()
p = [0, 0.25, 0.5, 0.75, 1]
[eData['Lat'].quantile(q=prob) for prob in p]

# equivalent to (but not as detailed as) R's summary()
eData.describe()

# R's class() is equivalent to type()
type(eData)

# get the data types of all columns:
# similarly to the method used in the video, we just apply type() to every
# value of the first row. zip() only pairs each column name with its type for
# nicer printing; list() materialises it because zip() is lazy on Python 3.
# `.loc` replaces the removed `.ix` indexer (label-based on an integer index).
list(zip(eData.columns, [type(x) for x in eData.loc[0, :]]))

# equivalent to R's unique() command
eData['Src'].unique()

# equivalent to R's length() command
len(eData['Src'].unique())

# for this particular case, value_counts() is equivalent to R's table()
eData['Src'].value_counts()

# or alternatively:
# NOTE(review): crosstab with an empty column list -- confirm this still
# yields a one-way table on the pandas version in use.
pd.crosstab(eData['Src'], [])

# equivalent to R's table() to compute a two-way frequency table
pd.crosstab(eData['Src'], eData['Version'])

# first ten rows of 'Lat'; a label slice with `.loc` includes both end points
eData.loc[0:9, 'Lat']
eData.loc[0:9, 'Lat'] > 40

# equivalent to R's any()
(eData.loc[0:9, 'Lat'] > 40).any()

# equivalent to R's all()
(eData.loc[0:9, 'Lat'] > 40).all()

# looking at subsets; very similar to R's & operator
eData[(eData['Lat'] > 0) & (eData['Lon'] > 0)][['Lat', 'Lon']][:10]

# looking at subsets; very similar to R's | operator
eData[(eData['Lat'] > 0) | (eData['Lon'] > 0)][['Lat', 'Lon']][-10:]

fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'
fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'
reviews = pd.read_csv(fileUrl1)
solutions = pd.read_csv(fileUrl2)
reviews.head(2)
solutions.head(2)

# find if there are missing values; equivalent to R's is.na()
reviews.loc[0:9, 'time_left'].isnull()

# number of missing 'time_left' values (True counts as 1 when summed)
reviews['time_left'].isnull().sum()
# tally missing (True) versus present (False) entries of 'time_left'
pd.isnull(reviews['time_left']).value_counts()

# equivalent to R's colSums()
# pandas skips missing values unless told otherwise, so skipna=False is
# passed explicitly; this is contrary to R, where the default is to
# *include* NA
reviews.sum(axis=0, skipna=False)

# equivalent to R's colMeans()
# same remark as above: missing values are excluded by default here
reviews.mean(axis=0)

# equivalent to R's rowMeans()
reviews.mean(axis='columns')