# Introductory tour of NumPy, pandas and matplotlib with side-by-side R
# snippets, exported from an IPython notebook.
#
# The notebook's IPython magics (``%load_ext``, ``%%R``, ``%Rpush``,
# ``%pylab``) are spelled as ``get_ipython()`` calls so that this file is
# valid Python.  It must therefore be run by IPython (``ipython <file>`` or
# ``%run <file>``), not by the plain interpreter.
#
# Bare expressions such as ``x[0]`` are kept from the notebook, where their
# value is shown as the cell output; run as a script they are evaluated and
# discarded.

from __future__ import division  # always use floating point division
from __future__ import print_function  # same ``print(...)`` on Python 2 and 3

import numpy as np  # convention, use alias ``np``

# ---------------------------------------------------------------------------
# NumPy basics
# ---------------------------------------------------------------------------

# a one dimensional array
x = np.array([2, 7, 5])
print('x:', x)  # print x

# a sequence starting from 4 up to (excluding) 12 with a step size of 3
y = np.arange(4, 12, 3)
print('y:', y)

# element-wise operations on arrays
print('x + y:', x + y)
print('x / y:', x / y)
print('x ^ y:', x ** y)  # python uses ** for exponentiation

print(x[1])  # second element of x
print(x[1:3])  # slice of x that includes second and third elements
print()
print(x[-2])  # indexing using negative indices - starts from -1
print(x[-np.array([1, 2])])  # fancy indexing using index array
print()
print(x[np.array([False, True, True])])  # indexing using boolean mask

# reshape sequence to 2d array (=matrix) where rows hold contiguous sequences
# then transpose so that columns hold contiguous sub sequences
z_temp = np.arange(1, 13).reshape((3, 4))
print("z_temp")
print(z_temp)
print()

# transpose
z = z_temp.T
print("z = z_temp.T (transpose of z_temp)")
print(z)
print()

# slicing along two dimensions
a = z[2:4, 1:3]
print("a = z[2:4, 1:3]")
print(a)
print()

# slicing along 2nd dimension
b = z[:, 1:3]
print("b = z[:, 1:3]")
print(b)
print()

# first column, returns 1d array
c = z[:, 0]
print("c = z[:, 0]")
print(c)  # one dimensional
print()

# first column but return 2d array (remember: exclusive semantics)
cc = z[:, 0:1]
print("cc = z[:, 0:1]")
print(cc)  # two dimensional; column vector

print(z.shape)  # number of elements along each axis (=dimension)
print(z.ndim)  # number of dimensions
print(z[:, 0].ndim)  # return first column as 1d array

# ---------------------------------------------------------------------------
# R / Python indexing comparison
# ---------------------------------------------------------------------------

# ``get_ipython`` is injected into the builtins by IPython.
_IPYTHON = get_ipython()  # noqa: F821

# allows execution of R code in IPython
# The extension was originally shipped with IPython as ``rmagic``; rpy2
# provides it as ``rpy2.ipython``.  Try the original name first so existing
# setups behave as before, then fall back to the rpy2 name.
_R_AVAILABLE = False
for _extension in ('rmagic', 'rpy2.ipython'):
    try:
        _IPYTHON.run_line_magic('load_ext', _extension)
    except ImportError:
        continue
    _R_AVAILABLE = True
    break
if not _R_AVAILABLE:
    print("Please install rpy2 to run the R/Python comparision code examples")


def _run_r(code):
    """Run ``code`` as the body of an ``%%R`` cell.

    Does nothing when the R extension could not be loaded, so that the
    Python part of the tour still runs without rpy2.
    """
    if _R_AVAILABLE:
        _IPYTHON.run_cell_magic('R', '', code)


x = np.arange(5)  # arange has exclusive end semantics
x[0]

# ``%%R`` tells IPython that the following lines will be R code
_run_r(
    "x <- seq(0, 4)  # seq has incl semantics\n"
    "print(x[1])\n"
)

x[0:2]  # doesn't include index 2

_run_r(
    "x <- seq(0, 4)  # seq has incl semantics\n"
    "print(x[1:2])  # includes index 2\n"
)

x[-2]  # second element from the end

_run_r(
    "x <- seq(0, 4)  # seq has incl semantics\n"
    "print(x[-2])  # drop 2nd position, ie 1\n"
)

X = np.arange(4).reshape((2, 2)).T  # 2d array
X[0:1, :]  # still 2d array - the slice selects a single row

_run_r(
    "X = matrix(seq(0, 3), 2, 2)\n"
    "print(X[1, , drop=FALSE])  # use drop=FALSE\n"
)

# ---------------------------------------------------------------------------
# pandas
# ---------------------------------------------------------------------------

import pandas as pd  # convention, alias ``pd``

# Load car dataset
# NOTE(review): this needs network access - confirm the URL is still served.
auto = pd.read_csv("http://www-bcf.usc.edu/~gareth/ISL/Auto.csv")
auto.head()  # the first lines

auto.describe()

mpg = auto.mpg  # get mpg column
weight = auto['weight']  # get weight column
auto['mpg_per_weight'] = mpg / weight
print(auto[['mpg', 'weight', 'mpg_per_weight']].head())

print(auto.columns)
print(auto.index[:10])

# Select the rows labelled 0 through 5 and the two columns weight and mpg.
# ``.loc`` replaces the removed ``.ix`` indexer; like ``.ix`` on an integer
# index it is label based, so the end point 5 is included.
auto.loc[0:5, ['weight', 'mpg']]

auto.head(2)

# this command pushes the pandas.DataFrame auto to R-land
# (called at module level on purpose: the magic looks ``auto`` up in the
# caller's namespace)
if _R_AVAILABLE:
    _IPYTHON.run_line_magic('Rpush', 'auto')

_run_r(
    "auto = data.frame(auto)\n"
    "print(head(auto, 2))\n"
)

# ---------------------------------------------------------------------------
# Plotting
# ---------------------------------------------------------------------------

_IPYTHON.run_line_magic('pylab', 'inline')
import pandas as pd
import matplotlib.pyplot as plt

data = np.random.randn(500)  # array of 500 random numbers
plt.hist(data)
plt.ylabel("Counts")
plt.title("The Gaussian Distribution")

x = np.random.randn(50)
y = np.random.randn(50)
plt.plot(x, y, 'bo')  # b for blue, o for circles
plt.xlabel("x")
plt.ylabel("y")
plt.title("A scatterplot")

s = np.arange(11)
plt.plot(s, s ** 2, 'r--')

plt.scatter(x, y)

plt.boxplot([x, y])  # Pass a list of two arrays to plot them side-by-side
plt.title("Two box plots, side-by-side")

# create a scatterplot of weight vs "miles per gallon"
auto.plot(x='weight', y='mpg', style='bo')
plt.title("Scatterplot of weight and mpg")

# create a histogram of "miles per gallon"
plt.figure()
auto.hist('mpg')
plt.title("Histogram of mpg (miles per galone)")

# ``scatter_matrix`` moved from ``pandas.tools.plotting`` to
# ``pandas.plotting``; prefer the new location and fall back for old pandas.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
_ = scatter_matrix(auto[['mpg', 'cylinders', 'displacement']], figsize=(14, 10))