from __future__ import division  # always use floating point division
import numpy as np  # convention, use alias ``np``

# Basic 1-d NumPy arrays: construction, element-wise arithmetic and indexing.
# Every ``print`` below is written as a call with a single argument, which
# produces the same output under the Python 2 print statement and the
# Python 3 print function.

# a one dimensional array
x = np.array([2, 7, 5])
print('x: %s' % (x,))  # print x

# a sequence starting at 4 and stopping before 12, with a step size of 3
y = np.arange(4, 12, 3)
print('y: %s' % (y,))

# element-wise operations on arrays
print('x + y: %s' % (x + y,))
print('x / y: %s' % (x / y,))
print('x ^ y: %s' % (x ** y,))  # python uses ** for exponentiation

print(x[1])  # second element of x
print(x[1:3])  # slice of x that includes second and third elements
print('')  # blank separator line
print(x[-2])  # indexing using negative indices - counting starts from -1
print(x[-np.array([1, 2])])   # fancy indexing using index array
print('')  # blank separator line

print(x[np.array([False, True, True])])  # indexing using boolean mask

# 2-d arrays: reshaping, transposing and slicing along one or two axes.
# Every ``print`` below is a single-argument call so the output is the same
# under Python 2 and Python 3.

# reshape sequence to 2d array (=matrix) where rows hold contiguous sequences
# then transpose so that columns hold contiguous sub sequences
z_temp = np.arange(1, 13).reshape((3, 4))
print("z_temp")
print(z_temp)
print('')

# transpose
z = z_temp.T
print("z = z_temp.T (transpose of z_temp)")
print(z)
print('')

# slicing along two dimensions
a = z[2:4, 1:3]
print("a = z[2:4, 1:3]")
print(a)
print('')

# slicing along 2nd dimension
b = z[:, 1:3]
print("b = z[:, 1:3]")
print(b)
print('')

# first column, returns 1d array
c = z[:, 0]
print("c = z[:, 0]")
print(c)  # one dimensional
print('')

# first column but return 2d array (remember: the slice end is exclusive)
cc = z[:, 0:1]
print("cc = z[:, 0:1]")
print(cc)  # two dimensional; column vector

print(z.shape)  # number of elements along each axis (=dimension)
print(z.ndim)  # number of dimensions

print(z[:, 0].ndim)  # an integer index drops the axis: first column as 1d array

# allows execution of R code in IPython
try:
    %load_ext rmagic
except ImportError:
    print "Please install rpy2 to run the R/Python comparision code examples"

# Side-by-side comparison of indexing in NumPy and in R: each Python snippet
# is followed by an R cell (``%%R``) doing the analogous operation.
x = np.arange(5)  # arange excludes the stop value: yields 0, 1, 2, 3, 4
x[0]

%%R  # tells IPython that the following lines will be R code
x <- seq(0, 4)  # seq has incl semantics
print(x[1])

x[0:2]  # doesn't include index 2

%%R
x <- seq(0, 4)  # seq has incl semantics
print(x[1:2])  # includes index 2

x[-2]  # second element from the end

%%R
x <- seq(0, 4)  # seq has incl semantics
print(x[-2])  # drop 2nd position, ie 1

X = np.arange(4).reshape((2, 2)).T  # 2d array
X[0:1, :]  # still a 2d array - the slice 0:1 keeps the row axis (a single row)

%%R
X = matrix(seq(0, 3), 2, 2)
print(X[1, , drop=FALSE])  # use drop=FALSE

import pandas as pd  # convention, alias ``pd``

# Load car dataset
# NOTE: read_csv fetches the file over HTTP, so this step needs network access.
auto = pd.read_csv("http://www-bcf.usc.edu/~gareth/ISL/Auto.csv")
auto.head()  # show the first lines

auto.describe()  # summary statistics of the columns

mpg = auto.mpg  # get mpg column (attribute access)
weight = auto['weight']  # get weight column (key access)
auto['mpg_per_weight'] = mpg / weight  # element-wise ratio stored as a new column

# single-argument print call: same output on Python 2 and Python 3
print(auto[['mpg', 'weight', 'mpg_per_weight']].head())

print(auto.columns)
print(auto.index[:10])

# Select the rows labelled 0 through 5 and the two columns weight and mpg.
# Label-based slices include their end point, so this returns six rows.
# ``.loc`` replaces ``.ix``, which pandas deprecated and later removed; on the
# default integer index produced by read_csv above, ``.ix`` was label-based
# too, so the selection is unchanged.
auto.loc[0:5, ['weight', 'mpg']]

auto.head(2)

# Push the pandas.DataFrame ``auto`` from Python into the R session ("R-land")
# so that the R cell below can work with it.
%Rpush auto

%%R

auto = data.frame(auto)
print(head(auto, 2))

%pylab inline

import pandas as pd
import matplotlib.pyplot as plt

# Basic matplotlib plots on random data.
# Each plot starts with its own ``plt.figure()``: when this file is run as one
# script (rather than cell by cell), all pyplot calls would otherwise draw onto
# the same axes, overlaying the plots and overwriting each other's titles.

data = np.random.randn(500)  # array of 500 random numbers

# histogram
plt.figure()
plt.hist(data)
plt.ylabel("Counts")
plt.title("The Gaussian Distribution")

x = np.random.randn(50)
y = np.random.randn(50)

# scatterplot drawn with plot() and a marker-only format string
plt.figure()
plt.plot(x, y, 'bo')  # b for blue, o for circles
plt.xlabel("x")
plt.ylabel("y")
plt.title("A scatterplot")

# line plot of the squares of 0..10
s = np.arange(11)
plt.figure()
plt.plot(s, s ** 2, 'r--')  # r for red, -- for a dashed line

# the same x/y data drawn with scatter()
plt.figure()
plt.scatter(x, y)

plt.figure()
plt.boxplot([x, y])  # Pass a list of two arrays to plot them side-by-side
plt.title("Two box plots, side-by-side")

# create a scatterplot of weight vs "miles per gallon"
auto.plot(x='weight', y='mpg', style='bo')
plt.title("Scatterplot of weight and mpg")

# create a histogram of "miles per gallon"
plt.figure()
auto.hist('mpg')
plt.title("Histogram of mpg (miles per gallon)")

# pairwise scatterplots of three columns.
# scatter_matrix lives in ``pandas.plotting`` in current pandas releases;
# fall back to the old ``pandas.tools.plotting`` location for older ones.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
_ = scatter_matrix(auto[['mpg', 'cylinders', 'displacement']], figsize=(14, 10))