# Introductory tour of NumPy, SciPy and pandas, flattened from a notebook.
#
# Bare expressions such as `x` or `mtcars.head()` display their value when
# run as notebook cells; in a plain script they are evaluated and discarded.

# Special IPython command to prepare the notebook for matplotlib. It is an
# IPython magic, not Python syntax, so it stays commented out here;
# uncomment it when running inside IPython/Jupyter.
# %matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from numpy import random
from pandas.plotting import scatter_matrix
from scipy import stats
from scipy.stats import norm

# `pd.options.display.mpl_style` was removed from pandas; a matplotlib
# style sheet is the replacement.
plt.style.use('ggplot')

# ---------------------------------------------------------------------------
# NumPy: creating arrays
# ---------------------------------------------------------------------------
x = np.array([1, 2, 3, 4])
y = np.array([[1, 2], [3, 4]])
x
y
type(x)
x.shape
y.shape

np.arange(0, 21, 2)
# Try it: Create a numpy array from 0 to 20 in steps of size 2
# Try it: Create a numpy array from -10 to 10 in steps of 0.5
#         (INCLUDING the number 10)
# Try it: Create a numpy array from 100 to 1000 of length 10

np.random.randint(1, 100, 50)
# Try it: Create a numpy array filled with random samples
#         from a normal distribution of size 4 x 4

# ---------------------------------------------------------------------------
# NumPy: reshaping
# ---------------------------------------------------------------------------
z = np.random.rand(4, 4)
z
z.shape
z.reshape((8, 2))  # result is 8 x 2; z itself is left unchanged
z.flatten()

# ---------------------------------------------------------------------------
# NumPy: assignment and slicing
# ---------------------------------------------------------------------------
y = np.array([[1, 2], [3, 4]])
y.shape
y[0, 0] = 10
y

# random samples from a uniform distribution between 0 and 1
dat = np.random.rand(4, 4)
dat
dat[0, :]      # row 1
dat[:, 0]      # column 1
dat[0:3:2, 0]  # first and third elements in column 1
np.diag(dat)   # diagonal

# Bind the reshaped array to `x` so that the row lookup below really
# operates on the 8 x 4 array (the result used to be discarded).
x = np.arange(32).reshape((8, 4))  # returns an 8 x 4 array
x[0]  # returns the first row


# ---------------------------------------------------------------------------
# NumPy: vectorizing a scalar function
# ---------------------------------------------------------------------------
def f(x):
    """Return True when the scalar `x` is non-negative, False otherwise."""
    if x >= 0:
        return True
    else:
        return False


print(f(3))

# np.vectorize wraps the scalar function so it can be applied element-wise.
f_vec = np.vectorize(f)
z = np.arange(-5, 6)
z
f_vec(z)


def f(x):
    """Return `x >= 0`; the comparison itself already yields the result."""
    return (x >= 0)


print(f(3))

# ---------------------------------------------------------------------------
# SciPy: random variates
# ---------------------------------------------------------------------------
x = norm.rvs(loc=0, scale=1, size=1000)
plt.figure()  # fresh figure so successive plots do not share axes
plt.hist(x)
plt.title('Histogram of 1000 normal random variables')

# ---------------------------------------------------------------------------
# pandas: the mtcars data set
# ---------------------------------------------------------------------------
url = 'https://raw.githubusercontent.com/cs109/2014_data/master/mtcars.csv'
mtcars = pd.read_csv(url, sep=',', index_col=0)
mtcars.head()

# DataFrame with 32 observations on 11 variables
mtcars.shape

# return the column names
mtcars.columns

# return the actual data inside the pandas data frame
mtcars.values

mtcars[25:]  # rows 25 to end of data frame

# return index
mtcars.index

# `.ix` was removed from pandas; `.loc` is the label-based accessor.
mtcars.loc['Maserati Bora']  # access a row by an index

# What other methods are available when working with pandas DataFrames?
# type 'mtcars.' and then press <TAB>
# mtcars.

# try it here

mtcars.describe()

(mtcars.mpg >= 20).any()
(mtcars > 0).all()

plt.figure()
mtcars['mpg'].hist()
plt.title('Distribution of MPG')
plt.xlabel('Miles Per Gallon')

# Relationship between cyl and mpg
plt.figure()
plt.plot(mtcars.cyl, mtcars.mpg, 'o')
plt.xlim(3, 9)
plt.xlabel('Cylinders')
plt.ylabel('MPG')
plt.title('Relationship between cylinders and MPG')

# Relationship between horsepower and mpg
plt.figure()
plt.plot(mtcars.hp, mtcars.mpg, 'o')
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Relationship between horsepower and MPG')

# scatter_matrix creates its own figure.
scatter_matrix(mtcars[['mpg', 'hp', 'cyl']],
               figsize=(10, 6), alpha=1, diagonal='kde')

# Needed when run as a plain script; harmless with the inline backend.
plt.show()