# import all shogun classes
from modshogun import *
# numpy/matplotlib names used throughout (random, sqrt, rcParams, subplots, ...);
# the notebook style assumes a pylab-like environment, made explicit here:
from numpy import *
from matplotlib import pyplot
from matplotlib.pyplot import *
from matplotlib.patches import Rectangle

#number of data points.
n=100

#generate a random 2d line (y1 = m*x1 + c)
m = random.randint(1,10)
c = random.randint(1,10)
x1 = random.random_integers(-20,20,n)
y1 = m*x1 + c

#generate the noise.
noise = random.random_sample([n]) * random.random_integers(-35,35,n)

#make the noise orthogonal to the line y=mx+c and add it.
x = x1 + noise*m/sqrt(1+square(m))
y = y1 + noise/sqrt(1+square(m))

twoD_obsmatrix = array([x,y])

#to visualise the data we must plot it.
rcParams['figure.figsize'] = 7, 7
figure, axis = subplots(1,1)
xlim(-50,50)
ylim(-50,50)
axis.plot(twoD_obsmatrix[0,:], twoD_obsmatrix[1,:], 'o', color='green', markersize=6)

#the line from which we generated the data is plotted in red
axis.plot(x1[:], y1[:], linewidth=0.3, color='red')
title('One-dimensional subspace with noise')
xlabel("x axis")
_=ylabel("y axis")

#convert the observation matrix into a dense feature matrix.
train_features = RealFeatures(twoD_obsmatrix)

#PCA(EVD) is chosen since N=100 and D=2 (N>D).
#However, we can also use PCA(AUTO), which will automagically choose the appropriate method.
preprocessor = PCA(EVD)

#since we are projecting down the 2d data, the target dimension is 1. Here, however,
#the exhaustive method is detailed by setting the target dimension to 2 so that both
#eigenvectors can be visualized. In later examples we will get rid of this step and project directly.
preprocessor.set_target_dim(2)

#centralize the data by subtracting its mean from it.
preprocessor.init(train_features)

#get the mean for the respective dimensions.
mean_datapoints = preprocessor.get_mean()
mean_x = mean_datapoints[0]
mean_y = mean_datapoints[1]

#get the eigenvectors (we get two of these since we set the target dimension to 2).
E = preprocessor.get_transformation_matrix()

#get all the eigenvalues returned by PCA.
eig_value = preprocessor.get_eigenvalues()

e1 = E[:,0]
e2 = E[:,1]
eig_value1 = eig_value[0]
eig_value2 = eig_value[1]

#find the M eigenvectors corresponding to the top M eigenvalues and store them in E. Here M=1.
#slopes of e1 & e2
m1 = e1[1]/e1[0]
m2 = e2[1]/e2[0]

#generate the two lines
x1 = range(-50,50)
x2 = x1
y1 = multiply(m1,x1)
y2 = multiply(m2,x2)

#plot the data along with the two eigenvectors
figure, axis = subplots(1,1)
xlim(-50, 50)
ylim(-50, 50)
axis.plot(x[:], y[:], 'o', color='green', markersize=5, label="green")
axis.plot(x1[:], y1[:], linewidth=0.7, color='black')
axis.plot(x2[:], y2[:], linewidth=0.7, color='blue')
p1 = Rectangle((0, 0), 1, 1, fc="black")
p2 = Rectangle((0, 0), 1, 1, fc="blue")
legend([p1,p2], ["1st eigenvector","2nd eigenvector"], loc='center left', bbox_to_anchor=(1, 0.5))
title('Eigenvector selection')
xlabel("x axis")
_=ylabel("y axis")

#the eigenvector corresponding to the larger eigenvalue (eig_value2) is chosen, i.e. e2.
#E is the feature vector.
E = e2

#transform all 2-dimensional feature matrices into their target-dimensional approximations.
yn = preprocessor.apply_to_feature_matrix(train_features)
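# --- Aside (not part of the original walkthrough): a minimal plain-numpy sketch to
# cross-check Shogun's eigendecomposition, assuming only the arrays x, y and n from
# above. numpy.linalg.eigh returns eigenvalues in ascending order, so the last
# column of V is the leading eigenvector and should match e2 up to sign.
from numpy import linalg
data = array([x, y])
centered = data - data.mean(axis=1).reshape(2, 1)
covariance = dot(centered, centered.T) / (n - 1)
w, V = linalg.eigh(covariance)
print(V[:, -1], e2)   #equal up to a possible sign flip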
#since here we are manually picking the eigenvector corresponding to the top eigenvalue,
#the 2nd row of yn is chosen, as it corresponds to the required eigenvector e2.
yn1 = yn[1,:]
x_new = (yn1 * E[0]) + tile(mean_x,[n,1]).T[0]
y_new = (yn1 * E[1]) + tile(mean_y,[n,1]).T[0]

figure, axis = subplots(1,1)
xlim(-50, 50)
ylim(-50, 50)
axis.plot(x[:], y[:], 'o', color='green', markersize=5, label="green")
axis.plot(x_new, y_new, 'o', color='blue', markersize=5, label="red")
title('PCA Projection of 2D data into 1D subspace')
xlabel("x axis")
ylabel("y axis")

#add a legend for information
p1 = Rectangle((0, 0), 1, 1, fc="r")
p2 = Rectangle((0, 0), 1, 1, fc="g")
p3 = Rectangle((0, 0), 1, 1, fc="b")
legend([p1,p2,p3], ["normal projection","2d data","1d projection"], loc='center left', bbox_to_anchor=(1, 0.5))

#plot the projections in red:
for i in range(n):
    axis.plot([x[i],x_new[i]], [y[i],y_new[i]], color='red')

rcParams['figure.figsize'] = 8, 8

#number of points
n=100

#generate the data
a = random.randint(1,20)
b = random.randint(1,20)
c = random.randint(1,20)
d = random.randint(1,20)
x1 = random.random_integers(-20,20,n)
y1 = random.random_integers(-20,20,n)
z1 = -(a*x1 + b*y1 + d)/c

#generate the noise
noise = random.random_sample([n]) * random.random_integers(-30,30,n)

#the normal unit vector is [a,b,c]/magnitude
magnitude = sqrt(square(a) + square(b) + square(c))
normal_vec = array([a,b,c]/magnitude)

#add the noise orthogonally
x = x1 + noise*normal_vec[0]
y = y1 + noise*normal_vec[1]
z = z1 + noise*normal_vec[2]
threeD_obsmatrix = array([x,y,z])

#to visualize the data, we must plot it.
from mpl_toolkits.mplot3d import Axes3D
fig = pyplot.figure()
ax = fig.add_subplot(111, projection='3d')

#plot the noisy data generated by distorting a plane
ax.scatter(x, y, z, marker='o', color='g')
ax.set_xlabel('x label')
ax.set_ylabel('y label')
ax.set_zlabel('z label')
legend([p2], ["3d data"], loc='center left', bbox_to_anchor=(1, 0.5))
title('Two-dimensional subspace with noise')
xx, yy = meshgrid(range(-30,30), range(-30,30))
zz = -(a*xx + b*yy + d)/c

#convert the observation matrix into a dense feature matrix.
train_features = RealFeatures(threeD_obsmatrix)

#PCA(EVD) is chosen since N=100 and D=3 (N>D).
#However, we can also use PCA(AUTO), which will automagically choose the appropriate method.
preprocessor = PCA(EVD)

#if we set the target dimension to 2, Shogun automagically preserves the required
#2 eigenvectors (out of 3) according to their eigenvalues.
preprocessor.set_target_dim(2)
preprocessor.init(train_features)

#get the mean for the respective dimensions.
mean_datapoints = preprocessor.get_mean()
mean_x = mean_datapoints[0]
mean_y = mean_datapoints[1]
mean_z = mean_datapoints[2]

#get the required eigenvectors corresponding to the top 2 eigenvalues.
E = preprocessor.get_transformation_matrix()

#the projection can be performed by Shogun's PCA preprocessor as follows:
yn = preprocessor.apply_to_feature_matrix(train_features)
new_data = dot(E, yn)
x_new = new_data[0,:] + tile(mean_x,[n,1]).T[0]
y_new = new_data[1,:] + tile(mean_y,[n,1]).T[0]
z_new = new_data[2,:] + tile(mean_z,[n,1]).T[0]
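# --- Aside (not part of the original walkthrough): a minimal sanity check that the
# reconstructed points really lie in a 2-dimensional subspace, assuming x_new, y_new,
# z_new from above. After centering, the point cloud should have only two
# non-negligible singular values.
from numpy import linalg
proj = array([x_new, y_new, z_new])
proj_centered = proj - proj.mean(axis=1).reshape(3, 1)
print(linalg.svd(proj_centered, compute_uv=False))   #third value should be ~0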
#all the above points lie on the same plane. To make this clearer, we also plot the projections.
fig = pyplot.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, marker='o', color='g')
ax.set_xlabel('x label')
ax.set_ylabel('y label')
ax.set_zlabel('z label')
legend([p1,p2,p3], ["normal projection","3d data","2d projection"], loc='center left', bbox_to_anchor=(1, 0.5))
title('PCA Projection of 3D data into 2D subspace')

for i in range(100):
    ax.scatter(x_new[i], y_new[i], z_new[i], marker='o', color='b')
    ax.plot([x[i],x_new[i]], [y[i],y_new[i]], [z[i],z_new[i]], color='r')

rcParams['figure.figsize'] = 10, 10

import os

def get_imlist(path):
    """Returns a list of filenames for all .pgm images in a directory."""
    return [os.path.join(path,f) for f in os.listdir(path) if f.endswith('.pgm')]

#set the path of the training images
path_train='../../../data/att_dataset/training/'

#number of rows to which each image will be resized.
k1=100
#number of columns to which each image will be resized.
k2=100

filenames = get_imlist(path_train)
filenames = array(filenames)

#n is the total number of images to be analysed.
n = len(filenames)

#we will use this often to visualize the images.
def showfig(image):
    imgplot = imshow(image, cmap='gray')
    imgplot.axes.get_xaxis().set_visible(False)
    imgplot.axes.get_yaxis().set_visible(False)

from PIL import Image
from scipy import misc

#to get a feel for the data, let's look at part of the dataset.
fig = pyplot.figure()
title('The Training Dataset')
for i in range(49):
    fig.add_subplot(7, 7, i+1)   #subplot indices start at 1
    train_img = array(Image.open(filenames[i]).convert('L'))
    train_img = misc.imresize(train_img, [k1,k2])
    showfig(train_img)

#to form the observation matrix obs_matrix:
#read the 1st image.
train_img = array(Image.open(filenames[0]).convert('L'))

#resize it to k1 rows and k2 columns.
train_img = misc.imresize(train_img, [k1,k2])

#since RealFeatures accepts only float64 data, we do a type conversion.
train_img = array(train_img, dtype='double')

#flatten it to make it a row vector.
train_img = train_img.flatten()

#repeat the above for all images and stack the vectors together in a matrix.
for i in range(1,n):
    temp = array(Image.open(filenames[i]).convert('L'))
    temp = misc.imresize(temp, [k1,k2])
    temp = array(temp, dtype='double')
    temp = temp.flatten()
    train_img = vstack([train_img, temp])

#form the observation matrix.
obs_matrix = train_img.T

train_features = RealFeatures(obs_matrix)
preprocessor = PCA(AUTO)
preprocessor.set_target_dim(100)
preprocessor.init(train_features)

mean = preprocessor.get_mean()

#get the required eigenvectors corresponding to the top 100 eigenvalues.
E = preprocessor.get_transformation_matrix()

#let's see what these eigenfaces/eigenvectors look like:
fig1 = pyplot.figure()
title('Top 20 Eigenfaces')
for i in range(20):
    a = fig1.add_subplot(5, 4, i+1)
    eigen_faces = E[:,i].reshape([k1,k2])
    showfig(eigen_faces)

#we perform the required dot product and add the mean back to reconstruct the images.
yn = preprocessor.apply_to_feature_matrix(train_features)
re = tile(mean,[n,1]).T + dot(E, yn)   #each column of tile(mean,[n,1]).T is the mean face
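# --- Aside (not part of the original walkthrough): a minimal check of how faithful
# the 100-eigenface reconstruction is, assuming obs_matrix and re from above. The
# RMS per-pixel error should be small compared to the 0-255 grey-level range.
recon_error = sqrt(square(obs_matrix - re).mean())
print('RMS per-pixel reconstruction error: %.2f grey levels' % recon_error)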
#let's plot the reconstructed images.
fig2 = pyplot.figure()
title('Reconstructed Images from 100 eigenfaces')
for i in range(1,50):
    re1 = re[:,i].reshape([k1,k2])
    fig2.add_subplot(7, 7, i)
    showfig(re1)

#set the path of the testing images
path_train='../../../data/att_dataset/testing/'
test_files = get_imlist(path_train)
test_img = array(Image.open(test_files[0]).convert('L'))

rcParams.update({'figure.figsize': (3, 3)})

#we plot the test image for which we have to identify a good match among the training images.
fig = pyplot.figure()
title('The Test Image')
showfig(test_img)

#we flatten the test image just the way we did for the training images.
test_img = misc.imresize(test_img, [k1,k2])
test_img = array(test_img, dtype='double')
test_img = test_img.flatten()

#we centralize the test image by subtracting the mean from it.
test_f = test_img - mean

#we have already projected our training images into the PCA subspace as yn.
train_proj = yn

#project the test image into the PCA subspace.
test_proj = dot(E.T, test_f)

#to use Euclidean distance as the distance measure, use EuclideanDistance.
workfeat = RealFeatures(mat(train_proj))
testfeat = RealFeatures(mat(test_proj).T)
RaRb = EuclideanDistance(testfeat, workfeat)

#the distances between the test image and all the training images are stacked in matrix d.
d = empty([n,1])
for i in range(n):
    d[i] = RaRb.distance(0,i)

#the training image with the minimum distance is the best match.
min_distance_index = d.argmin()
iden = array(Image.open(filenames[min_distance_index]))
title('Identified Image')
showfig(iden)
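# --- Aside (not part of the original walkthrough): the same nearest-neighbour search
# written directly in numpy, as a cross-check of the EuclideanDistance result above,
# assuming train_proj (100 x n) and test_proj (length 100) from above.
diffs = train_proj - test_proj.reshape(-1, 1)   #broadcast the test projection against every column
dists = sqrt(square(diffs).sum(axis=0))         #Euclidean distance to each training image
assert dists.argmin() == min_distance_index     #agrees with Shogun's answer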