```python
from IPython.display import YouTubeVideo
YouTubeVideo("RrPZza_vZ3w")
```

```python
YouTubeVideo("p8hle-ni-DM")
```

```python
from IPython.core.display import Image
Image(filename='stack.png')
```

The basic data structure in NumPy is the `ndarray`, an N-dimensional array.

```python
import numpy as np

x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
I = np.eye(3)
z = np.zeros((3, 3))
print "x = \n", x, '\n\nI = \n', I, '\n\nz = \n', z
```

In many ways an ndarray can be treated like a list, but there are some interesting differences. You can make an ndarray from a list, but an ndarray can hold only one type of data.

```python
integer_list = [1, 2, 3]
integer_array = np.array(integer_list)
print integer_array
```

```python
my_list = [1, 1.0, None]
print my_list
my_list[-1] = "one"   # a list happily holds mixed types
print my_list
```

```python
my_array = np.array([1, 2, 3])
print my_array
my_array[-1] = "three"   # raises ValueError: every element of an ndarray shares one dtype
```

The other interesting difference is that the ndarray supports "fancy indexing": you can index with boolean masks as well as with slices along each axis.

```python
print "x = \n", x
y = x[x > 3]   # boolean-mask indexing
print "y = \n", y
```

```python
x[1, :]   # slice out the second row
```

Many of the functions that operate on ndarrays are written in C or Fortran, which makes them very fast. One common type of function is the ufunc (universal function), which operates elementwise on the array.

```python
np.add(x, x)    # elementwise addition
```

```python
np.add(x, 1)    # the scalar is broadcast across the array
```

```python
np.log(x)
```

```python
np.greater(x, 3)
```

Note the difference between `np.matrix` and `np.array`: with a matrix, `*` is matrix multiplication; with an array, it is elementwise.

```python
x = np.matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
x
```

```python
x * x   # matrix product
```

```python
y = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
y * y   # elementwise product
```

```python
import numpy as np
from scipy import linalg as LA

# Solve Ax = b
A = np.random.rand(5, 5)
b = np.random.rand(5)
x = LA.solve(A, b)
print A
print x
print b
```

```python
# Find the eigenvalues (and right eigenvectors) of A
e = LA.eig(A)
print e
```

```python
# QR decomposition
Q, R = LA.qr(A)
print Q
print R
```

```python
# LU decomposition
p, l, u = LA.lu(A)
print p   # permutation matrix
print l   # lower triangular matrix
print u   # upper triangular matrix
```

```python
# Cholesky factorization requires a symmetric positive-definite matrix;
# A A^T is an easy way to build one from A
B = np.dot(A, A.T)
L = LA.cholesky(B)
print B
print L
```

```python
U, s, Vh = LA.svd(A)
print U
print s
print Vh
```

```python
# The ratio of the smallest to the largest singular value
# (the reciprocal of the usual 2-norm condition number)
min(LA.svd(A, compute_uv=0)) * min(LA.svd(LA.pinv(A), compute_uv=0))
```

```python
# Or an easier way: ord=-2 gives the same smallest-to-largest ratio.
# The plain 2-norm condition number is np.linalg.cond(A).
np.linalg.cond(A, -2)
```

```python
from scipy import stats
import matplotlib.pyplot as plt

numargs = stats.lognorm.numargs
[s] = [0.9] * numargs
rv = stats.lognorm(s)   # a "frozen" distribution with a fixed shape parameter

# Display the frozen pdf
x = np.linspace(0, np.minimum(rv.dist.b, 3))
h = plt.plot(x, rv.pdf(x))
```

```python
stats.describe?
```

```python
stats.describe(A)
```

```python
# z-score of each value in the sample, relative to the sample mean
# and standard deviation
z = stats.zscore(A)
print z
```

```python
x = np.random.randn(10)
y = np.random.randn(10)
coef, p = stats.pearsonr(x, y)   # Pearson correlation coefficient and two-tailed p-value
print coef
print p
```

```python
from IPython.display import Image
Image(url="http://scikit-learn.org/stable/_static/ml_map.png")
```

```python
# The famous Iris dataset
from sklearn import neighbors, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target
print X[:10], "..."
print y
```

```python
# Visualize the data
import numpy as np
import matplotlib.pyplot as plt

x_index = 2
y_index = 3

# this formatter will label the colorbar with the correct target names
formatter = plt.FuncFormatter(lambda i, *args: iris.target_names[int(i)])

plt.scatter(iris.data[:, x_index], iris.data[:, y_index], c=iris.target)
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index]);
```

```python
# Make a prediction

# create the model
knn = neighbors.KNeighborsClassifier(n_neighbors=3)

# fit the model
knn.fit(X, y)

# What kind of iris has 3cm x 5cm sepal and 4cm x 2cm petal?
# call the "predict" method:
result = knn.predict([[3, 5, 4, 2],])
print iris.target_names[result]
```

```python
# Exercise: try this with an SVC classifier
from sklearn.svm import SVC
```
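One minimal sketch of how the exercise might go, assuming `X`, `y`, and `iris` from the cells above are still in scope; the linear kernel here is an illustrative choice, not a tuned one:

```python
# A sketch of the SVC exercise (assumes X, y, iris from above;
# the kernel choice is illustrative, not tuned)
svc = SVC(kernel='linear')
svc.fit(X, y)
result = svc.predict([[3, 5, 4, 2],])
print iris.target_names[result]
```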
# call the "predict" method: result = knn.predict([[3, 5, 4, 2],]) print iris.target_names[result] #Exercise: try this with a SVC classifier from sklearn.svm import SVC # Dimensionality Reduction - Principal Component Analysis X, y = iris.data, iris.target from sklearn.decomposition import PCA pca = PCA(n_components=2) pca.fit(X) X_reduced = pca.transform(X) print "Reduced dataset shape:", X_reduced.shape import pylab as pl pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y) print "Meaning of the 2 components:" for component in pca.components_: print " + ".join("%.3f x %s" % (value, name) for value, name in zip(component, iris.feature_names)) # Clustering from sklearn.cluster import KMeans k_means = KMeans(n_clusters=3, random_state=0) k_means.fit(X) y_pred = k_means.predict(X) pl.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_pred); # Validation # Generate an un-balanced 2D dataset np.random.seed(0) X = np.vstack([np.random.normal(0, 1, (950, 2)), np.random.normal(-1.8, 0.8, (50, 2))]) y = np.hstack([np.zeros(950), np.ones(50)]) plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='none', cmap=plt.cm.Accent) from sklearn import metrics from sklearn.svm import SVC from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = SVC().fit(X_train, y_train) y_pred = clf.predict(X_test) print "accuracy:", metrics.accuracy_score(y_test, y_pred) print "precision:", metrics.precision_score(y_test, y_pred) print "recall:", metrics.recall_score(y_test, y_pred) print "f1 score:", metrics.f1_score(y_test, y_pred) X1, X2, y1, y2 = train_test_split(X, y, test_size=0.5) print X1.shape print X2.shape y2_pred = SVC().fit(X1, y1).predict(X2) y1_pred = SVC().fit(X2, y2).predict(X1) print np.mean([metrics.precision_score(y1, y1_pred), metrics.precision_score(y2, y2_pred)]) from sklearn.cross_validation import cross_val_score # Let's do a 2-fold cross-validation of the SVC estimator print cross_val_score(SVC(), X, y, cv=2, scoring='precision') Image("https://raw.github.com/olgabot/prettyplotlib/master/examples/boxplot_matplotlib_default.png") Image("https://raw.github.com/olgabot/prettyplotlib/master/examples/boxplot_prettyplotlib_default.png") from IPython.display import VimeoVideo VimeoVideo("79562736") !wget http://files.grouplens.org/datasets/movielens/ml-100k.zip !unzip ml-100k.zip !cd ml-100k txt = open('ml-100k/README').read() print txt import pandas as pd user_columns = ['user_id', 'age', 'sex', 'occupation', 'zip_code'] users = pd.read_csv('ml-100k/u.user', names=user_columns, sep='|') rating_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp'] ratings = pd.read_csv('ml-100k/u.data', names=rating_columns, delim_whitespace=True) movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] movies = pd.read_csv('ml-100k/u.item', names=movie_columns, sep='|', usecols=range(5)) users ratings movies movie_ratings = pd.merge(movies, ratings) movie_ratings data = pd.merge(movie_ratings, users) data data.head() # How are ages distributed? data.age.hist(bins=30) plt.title("Distribution of users' ages") plt.ylabel('count of users') plt.xlabel('age'); # How are occupations distributed? # Which movie is the highest rated? most_rated = data.title.value_counts()[0:10] print most_rated # What movie is the most popular # How many movies have less than 100 ratings # Remove movies with less than 100 ratings from the dataset # Extra Credit: How would we go about making a recommender system based on this data set? 
```python
# Hint:
VimeoVideo('64445499')
```

```python
VimeoVideo("79535180")
```

Some material taken from:

- https://github.com/jakevdp/2013_fall_ASTR599/
- http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/