# import all shogun classes
from modshogun import *
# numpy/matplotlib names used throughout (random, sqrt, rcParams, subplots, ...);
# the notebook style assumes a pylab-like environment, made explicit here:
from numpy import *
from matplotlib import pyplot
from matplotlib.pyplot import *
from matplotlib.patches import Rectangle

#number of data points.
n=100

#generate a random 2d line (y1 = m*x1 + c)
m = random.randint(1,10)
c = random.randint(1,10)
x1 = random.random_integers(-20,20,n)
y1 = m*x1 + c

#generate the noise.
noise = random.random_sample([n]) * random.random_integers(-35,35,n)

#make the noise orthogonal to the line y=mx+c and add it.
x = x1 + noise*m/sqrt(1+square(m))
y = y1 + noise/sqrt(1+square(m))

twoD_obsmatrix = array([x,y])

#to visualise the data we must plot it.
rcParams['figure.figsize'] = 7, 7
figure, axis = subplots(1,1)
xlim(-50,50)
ylim(-50,50)
axis.plot(twoD_obsmatrix[0,:], twoD_obsmatrix[1,:], 'o', color='green', markersize=6)

#the line from which we generated the data is plotted in red
axis.plot(x1[:], y1[:], linewidth=0.3, color='red')
title('One-dimensional subspace with noise')
xlabel("x axis")
_=ylabel("y axis")

#convert the observation matrix into a dense feature matrix.
train_features = RealFeatures(twoD_obsmatrix)

#PCA(EVD) is chosen since N=100 and D=2 (N>D).
#However, we can also use PCA(AUTO), which will automagically choose the appropriate method.
preprocessor = PCA(EVD)

#since we are projecting down the 2d data, the target dimension is 1. Here, however,
#the exhaustive method is detailed by setting the target dimension to 2 so that both
#eigenvectors can be visualized. In later examples we will get rid of this step and project directly.
preprocessor.set_target_dim(2)

#centralize the data by subtracting its mean from it.
preprocessor.init(train_features)

#get the mean for the respective dimensions.
mean_datapoints = preprocessor.get_mean()
mean_x = mean_datapoints[0]
mean_y = mean_datapoints[1]

#get the eigenvectors (we get two of these since we set the target dimension to 2).
E = preprocessor.get_transformation_matrix()

#get all the eigenvalues returned by PCA.
eig_value = preprocessor.get_eigenvalues()

e1 = E[:,0]
e2 = E[:,1]
eig_value1 = eig_value[0]
eig_value2 = eig_value[1]

#find the M eigenvectors corresponding to the top M eigenvalues and store them in E. Here M=1.
#slopes of e1 & e2
m1 = e1[1]/e1[0]
m2 = e2[1]/e2[0]

#generate the two lines
x1 = range(-50,50)
x2 = x1
y1 = multiply(m1,x1)
y2 = multiply(m2,x2)

#plot the data along with the two eigenvectors
figure, axis = subplots(1,1)
xlim(-50, 50)
ylim(-50, 50)
axis.plot(x[:], y[:], 'o', color='green', markersize=5, label="green")
axis.plot(x1[:], y1[:], linewidth=0.7, color='black')
axis.plot(x2[:], y2[:], linewidth=0.7, color='blue')
p1 = Rectangle((0, 0), 1, 1, fc="black")
p2 = Rectangle((0, 0), 1, 1, fc="blue")
legend([p1,p2], ["1st eigenvector","2nd eigenvector"], loc='center left', bbox_to_anchor=(1, 0.5))
title('Eigenvector selection')
xlabel("x axis")
_=ylabel("y axis")

#the eigenvector corresponding to the larger eigenvalue (eig_value2) is chosen, i.e. e2.
#E is the feature vector.
E = e2

#transform all 2-dimensional feature matrices into their target-dimensional approximations.
yn = preprocessor.apply_to_feature_matrix(train_features)
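# --- Aside (not part of the original walkthrough): a minimal plain-numpy sketch to
# cross-check Shogun's eigendecomposition, assuming only the arrays x, y and n from
# above. numpy.linalg.eigh returns eigenvalues in ascending order, so the last
# column of V is the leading eigenvector and should match e2 up to sign.
from numpy import linalg
data = array([x, y])
centered = data - data.mean(axis=1).reshape(2, 1)
covariance = dot(centered, centered.T) / (n - 1)
w, V = linalg.eigh(covariance)
print(V[:, -1], e2)   #equal up to a possible sign flip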
#since here we are manually picking the eigenvector corresponding to the top eigenvalue,
#the 2nd row of yn is chosen, as it corresponds to the required eigenvector e2.
yn1 = yn[1,:]
x_new = (yn1 * E[0]) + tile(mean_x,[n,1]).T[0]
y_new = (yn1 * E[1]) + tile(mean_y,[n,1]).T[0]

figure, axis = subplots(1,1)
xlim(-50, 50)
ylim(-50, 50)
axis.plot(x[:], y[:], 'o', color='green', markersize=5, label="green")
axis.plot(x_new, y_new, 'o', color='blue', markersize=5, label="red")
title('PCA Projection of 2D data into 1D subspace')
xlabel("x axis")
ylabel("y axis")

#add a legend for information
p1 = Rectangle((0, 0), 1, 1, fc="r")
p2 = Rectangle((0, 0), 1, 1, fc="g")
p3 = Rectangle((0, 0), 1, 1, fc="b")
legend([p1,p2,p3], ["normal projection","2d data","1d projection"], loc='center left', bbox_to_anchor=(1, 0.5))

#plot the projections in red:
for i in range(n):
    axis.plot([x[i],x_new[i]], [y[i],y_new[i]], color='red')

rcParams['figure.figsize'] = 8, 8

#number of points
n=100

#generate the data
a = random.randint(1,20)
b = random.randint(1,20)
c = random.randint(1,20)
d = random.randint(1,20)
x1 = random.random_integers(-20,20,n)
y1 = random.random_integers(-20,20,n)
z1 = -(a*x1 + b*y1 + d)/c

#generate the noise
noise = random.random_sample([n]) * random.random_integers(-30,30,n)

#the normal unit vector is [a,b,c]/magnitude
magnitude = sqrt(square(a) + square(b) + square(c))
normal_vec = array([a,b,c]/magnitude)

#add the noise orthogonally
x = x1 + noise*normal_vec[0]
y = y1 + noise*normal_vec[1]
z = z1 + noise*normal_vec[2]
threeD_obsmatrix = array([x,y,z])

#to visualize the data, we must plot it.
from mpl_toolkits.mplot3d import Axes3D
fig = pyplot.figure()
ax = fig.add_subplot(111, projection='3d')

#plot the noisy data generated by distorting a plane
ax.scatter(x, y, z, marker='o', color='g')
ax.set_xlabel('x label')
ax.set_ylabel('y label')
ax.set_zlabel('z label')
legend([p2], ["3d data"], loc='center left', bbox_to_anchor=(1, 0.5))
title('Two-dimensional subspace with noise')
xx, yy = meshgrid(range(-30,30), range(-30,30))
zz = -(a*xx + b*yy + d)/c

#convert the observation matrix into a dense feature matrix.
train_features = RealFeatures(threeD_obsmatrix)

#PCA(EVD) is chosen since N=100 and D=3 (N>D).
#However, we can also use PCA(AUTO), which will automagically choose the appropriate method.
preprocessor = PCA(EVD)

#if we set the target dimension to 2, Shogun automagically preserves the required
#2 eigenvectors (out of 3) according to their eigenvalues.
preprocessor.set_target_dim(2)
preprocessor.init(train_features)

#get the mean for the respective dimensions.
mean_datapoints = preprocessor.get_mean()
mean_x = mean_datapoints[0]
mean_y = mean_datapoints[1]
mean_z = mean_datapoints[2]

#get the required eigenvectors corresponding to the top 2 eigenvalues.
E = preprocessor.get_transformation_matrix()

#the projection can be performed by Shogun's PCA preprocessor as follows:
yn = preprocessor.apply_to_feature_matrix(train_features)
new_data = dot(E, yn)
x_new = new_data[0,:] + tile(mean_x,[n,1]).T[0]
y_new = new_data[1,:] + tile(mean_y,[n,1]).T[0]
z_new = new_data[2,:] + tile(mean_z,[n,1]).T[0]
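# --- Aside (not part of the original walkthrough): a minimal sanity check that the
# reconstructed points really lie in a 2-dimensional subspace, assuming x_new, y_new,
# z_new from above. After centering, the point cloud should have only two
# non-negligible singular values.
from numpy import linalg
proj = array([x_new, y_new, z_new])
proj_centered = proj - proj.mean(axis=1).reshape(3, 1)
print(linalg.svd(proj_centered, compute_uv=False))   #third value should be ~0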
#all the above points lie on the same plane. To make this clearer, we also plot the projections.
fig = pyplot.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, marker='o', color='g')
ax.set_xlabel('x label')
ax.set_ylabel('y label')
ax.set_zlabel('z label')
legend([p1,p2,p3], ["normal projection","3d data","2d projection"], loc='center left', bbox_to_anchor=(1, 0.5))
title('PCA Projection of 3D data into 2D subspace')

for i in range(100):
    ax.scatter(x_new[i], y_new[i], z_new[i], marker='o', color='b')
    ax.plot([x[i],x_new[i]], [y[i],y_new[i]], [z[i],z_new[i]], color='r')

rcParams['figure.figsize'] = 10, 10

import os

def get_imlist(path):
    """Returns a list of filenames for all .pgm images in a directory."""
    return [os.path.join(path,f) for f in os.listdir(path) if f.endswith('.pgm')]

#set the path of the training images
path_train='../../../data/att_dataset/training/'

#number of rows to which each image will be resized.
k1=100
#number of columns to which each image will be resized.
k2=100

filenames = get_imlist(path_train)
filenames = array(filenames)

#n is the total number of images to be analysed.
n = len(filenames)

#we will use this often to visualize the images.
def showfig(image):
    imgplot = imshow(image, cmap='gray')
    imgplot.axes.get_xaxis().set_visible(False)
    imgplot.axes.get_yaxis().set_visible(False)

from PIL import Image
from scipy import misc

#to get a feel for the data, let's look at part of the dataset.
fig = pyplot.figure()
title('The Training Dataset')
for i in range(49):
    fig.add_subplot(7, 7, i+1)   #subplot indices start at 1
    train_img = array(Image.open(filenames[i]).convert('L'))
    train_img = misc.imresize(train_img, [k1,k2])
    showfig(train_img)

#to form the observation matrix obs_matrix:
#read the 1st image.
train_img = array(Image.open(filenames[0]).convert('L'))

#resize it to k1 rows and k2 columns.
train_img = misc.imresize(train_img, [k1,k2])

#since RealFeatures accepts only float64 data, we do a type conversion.
train_img = array(train_img, dtype='double')

#flatten it to make it a row vector.
train_img = train_img.flatten()

#repeat the above for all images and stack the vectors together in a matrix.
for i in range(1,n):
    temp = array(Image.open(filenames[i]).convert('L'))
    temp = misc.imresize(temp, [k1,k2])
    temp = array(temp, dtype='double')
    temp = temp.flatten()
    train_img = vstack([train_img, temp])

#form the observation matrix.
obs_matrix = train_img.T

train_features = RealFeatures(obs_matrix)
preprocessor = PCA(AUTO)
preprocessor.set_target_dim(100)
preprocessor.init(train_features)

mean = preprocessor.get_mean()

#get the required eigenvectors corresponding to the top 100 eigenvalues.
E = preprocessor.get_transformation_matrix()

#let's see what these eigenfaces/eigenvectors look like:
fig1 = pyplot.figure()
title('Top 20 Eigenfaces')
for i in range(20):
    a = fig1.add_subplot(5, 4, i+1)
    eigen_faces = E[:,i].reshape([k1,k2])
    showfig(eigen_faces)

#we perform the required dot product and add the mean back to reconstruct the images.
yn = preprocessor.apply_to_feature_matrix(train_features)
re = tile(mean,[n,1]).T + dot(E, yn)   #each column of tile(mean,[n,1]).T is the mean face
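# --- Aside (not part of the original walkthrough): a minimal check of how faithful
# the 100-eigenface reconstruction is, assuming obs_matrix and re from above. The
# RMS per-pixel error should be small compared to the 0-255 grey-level range.
recon_error = sqrt(square(obs_matrix - re).mean())
print('RMS per-pixel reconstruction error: %.2f grey levels' % recon_error)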
#let's plot the reconstructed images.
fig2 = pyplot.figure()
title('Reconstructed Images from 100 eigenfaces')
for i in range(1,50):
    re1 = re[:,i].reshape([k1,k2])
    fig2.add_subplot(7, 7, i)
    showfig(re1)

#set the path of the testing images
path_train='../../../data/att_dataset/testing/'
test_files = get_imlist(path_train)
test_img = array(Image.open(test_files[0]).convert('L'))

rcParams.update({'figure.figsize': (3, 3)})

#we plot the test image for which we have to identify a good match among the training images.
fig = pyplot.figure()
title('The Test Image')
showfig(test_img)

#we flatten the test image just the way we did for the training images.
test_img = misc.imresize(test_img, [k1,k2])
test_img = array(test_img, dtype='double')
test_img = test_img.flatten()

#we centralize the test image by subtracting the mean from it.
test_f = test_img - mean

#we have already projected our training images into the PCA subspace as yn.
train_proj = yn

#project the test image into the PCA subspace.
test_proj = dot(E.T, test_f)

#to use Euclidean distance as the distance measure, use EuclideanDistance.
workfeat = RealFeatures(mat(train_proj))
testfeat = RealFeatures(mat(test_proj).T)
RaRb = EuclideanDistance(testfeat, workfeat)

#the distances between the test image and all the training images are stacked in matrix d.
d = empty([n,1])
for i in range(n):
    d[i] = RaRb.distance(0,i)

#the training image with the minimum distance is the best match.
min_distance_index = d.argmin()
iden = array(Image.open(filenames[min_distance_index]))
title('Identified Image')
showfig(iden)
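# --- Aside (not part of the original walkthrough): the same nearest-neighbour search
# written directly in numpy, as a cross-check of the EuclideanDistance result above,
# assuming train_proj (100 x n) and test_proj (length 100) from above.
diffs = train_proj - test_proj.reshape(-1, 1)   #broadcast the test projection against every column
dists = sqrt(square(diffs).sum(axis=0))         #Euclidean distance to each training image
assert dists.argmin() == min_distance_index     #agrees with Shogun's answer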