import pandas as pd
import numpy as np

# Location of the USGS earthquake catalog that is analysed below.
fileUrl = 'http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt'

# Read the comma-separated catalog straight from the URL into a DataFrame.
eData = pd.read_csv(fileUrl)

# Record when the data was downloaded.
# The IPython-only shell escape `!date` is a SyntaxError under plain Python,
# so the timestamp is taken from pandas instead (no shell, works everywhere).
dateDownloaded = pd.Timestamp.now(tz='UTC')
dateDownloaded

# First look at the data.
# Large frames are displayed in abbreviated form only.
eData

# the first five rows
eData.head(n=5)

# counterpart of R's dim(): (number of rows, number of columns)
eData.shape

# counterpart of R's names(): the column labels
eData.columns

# quantiles of the latitude column, counterpart of R's quantile()
p = [0, 0.25, 0.5, 0.75, 1]
lat = eData['Lat']
[lat.quantile(q=prob) for prob in p]

# summary statistics; comparable to (but less detailed than) R's summary()
eData.describe()

# type() plays the role of R's class()
type(eData)

# Get the data types of all columns.
# Similarly to the method used in the video, type() is applied to each
# value of the first row.
# .ix was removed in pandas 1.0, so the row is selected with .iloc instead.
# zip() pairs every column name with its type for nice printing; it is lazy
# on Python 3, hence the list() around it so the pairs are actually shown.
list(zip(eData.columns, [type(x) for x in eData.iloc[0]]))

# distinct values of a column, counterpart of R's unique()
src = eData['Src']
src.unique()

# number of distinct values, counterpart of R's length()
len(src.unique())

# one-way frequency table: in this particular case value_counts()
# gives the same information as R's table()
src.value_counts()

# (or alternatively: pd.crosstab(eData['Src'], []))

# two-way frequency table, counterpart of R's table() with two arguments
pd.crosstab(src, eData['Version'])

# Latitude of the first ten rows.
# .ix was removed in pandas 1.0; .loc slices by label and, just like .ix did
# on the default integer index produced by read_csv, includes the end label 9.
eData.loc[0:9, 'Lat']

# element-wise comparison yields a boolean Series
eData.loc[0:9, 'Lat'] > 40

# equivalent to R's any()
(eData.loc[0:9, 'Lat'] > 40).any()

# equivalent to R's all()
(eData.loc[0:9, 'Lat'] > 40).all()

# Subsetting with a boolean mask.
# Rows where both conditions hold; & works much like R's & operator.
# Show the first ten matches.
bothPositive = (eData['Lat'] > 0) & (eData['Lon'] > 0)
eData.loc[bothPositive, ['Lat', 'Lon']].head(10)

# Rows where at least one condition holds; | works much like R's | operator.
# Show the last ten matches.
eitherPositive = (eData['Lat'] > 0) | (eData['Lon'] > 0)
eData.loc[eitherPositive, ['Lat', 'Lon']].tail(10)

# Peer-review data: one CSV with the reviews ...
fileUrl1 = 'https://dl.dropbox.com/u/7710864/data/reviews-apr29.csv'
reviews = pd.read_csv(fileUrl1)

# ... and one CSV with the solutions.
fileUrl2 = 'https://dl.dropbox.com/u/7710864/data/solutions-apr29.csv'
solutions = pd.read_csv(fileUrl2)

# peek at the first two rows of each table
reviews.iloc[:2]

solutions.iloc[:2]

# find if there are missing values; equivalent to R's is.na()
# .ix was removed in pandas 1.0; .loc[0:9, ...] selects the same rows
# (labels 0 through 9 of the default integer index, end label included)
reviews.loc[0:9, 'time_left'].isnull()

# number of missing values in the column (True counts as 1)
reviews['time_left'].isnull().sum()

# how many entries are missing (True) and how many are present (False)
reviews['time_left'].isnull().value_counts()

# equivalent to R's colSums()
# notice we need to specifically include missing values with the option skipna=False
# this is in contrast to R, where the default is to *include* NA
reviews.sum(skipna=False)

# equivalent to R's colMeans()
# same remark as above: here missing values are by default excluded
reviews.mean()

# equivalent to R's rowMeans()
reviews.mean(axis=1)