%pylab inline
import pandas as pd
import matplotlib.pylab as plt
from sklearn.decomposition import PCA
# Load the iris measurements from a CSV file (path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv('iris.csv')
df.head()
# df.info()  # uncomment to inspect column dtypes and non-null counts
Populating the interactive namespace from numpy and matplotlib
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Feature matrix: the first four columns of `df`, selected by position.
# `.iloc` replaces the `.ix` indexer, which was deprecated in pandas 0.20
# and removed in pandas 1.0; with integer slices on string-labelled columns
# `.ix` was positional, so the selected columns are unchanged.
X = df.iloc[:, :4]
# n_components='mle' lets PCA choose the number of components automatically.
pca = PCA(n_components='mle')
pca.fit(X)
PCA(copy=True, n_components='mle', whiten=False)
pca.components_ # principal component loadings (one row per component)
pca.components_[0]# loadings of the first principal component
array([ 0.36158968, -0.08226889, 0.85657211, 0.35884393])
# Variance explained by each retained principal component.
pca.explained_variance_
array([ 4.19667516, 0.24062861, 0.07800042])
# Variance contribution: scree plot.
variance = pca.explained_variance_ratio_
# Rescale so the first component equals 1.0; the remaining points then read
# directly as a fraction of the first component's explained variance.
readable_variance = variance / variance[0]
component = pca.n_components_
# Number components from 1 so the x-axis matches the usual "PC1, PC2, ..."
# convention instead of showing a component "0".
component_numbers = range(1, component + 1)
plt.plot(component_numbers, readable_variance, marker='o')
plt.xticks(list(component_numbers))
plt.xlabel('Principal component')
plt.ylabel('Explained variance ratio (relative to PC1)')
plt.title('Scree plot')
plt.show()
# Principal component scores: project X onto the fitted components, keep the
# first two score columns and attach the species label for plotting.
score = pd.concat(
    [pd.DataFrame(pca.transform(X)[:, :2]), df['Species']],
    axis=1,
)
score.columns = ['var1', 'var2', 'species']
score.head()
var1 | var2 | species | |
---|---|---|---|
0 | -2.684207 | -0.326607 | setosa |
1 | -2.715391 | 0.169557 | setosa |
2 | -2.889820 | 0.137346 | setosa |
3 | -2.746437 | 0.311124 | setosa |
4 | -2.728593 | -0.333925 | setosa |
from ggplot import *
# Scatter plot of the first two score columns, colored by species.
(
    ggplot(score, aes('var1', 'var2', color='species'))
    + geom_point()
)
<ggplot: (282231425)>
# Pairwise correlation matrix of the feature columns in X.
X.corr()
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | |
---|---|---|---|---|
Sepal_Length | 1.000000 | -0.109369 | 0.871754 | 0.817954 |
Sepal_Width | -0.109369 | 1.000000 | -0.420516 | -0.356544 |
Petal_Length | 0.871754 | -0.420516 | 1.000000 | 0.962757 |
Petal_Width | 0.817954 | -0.356544 | 0.962757 | 1.000000 |
from sklearn.decomposition import KernelPCA
# KernelPCA has no automatic 'mle' component selection (that option exists
# only on PCA); its n_components must be an int or None. Passing 'mle' is
# rejected by current scikit-learn, so use None, which keeps all components
# with non-zero eigenvalues. Only the first two are used downstream.
kpca = KernelPCA(n_components=None)
kpca.fit(X)
KernelPCA(alpha=1.0, coef0=1, degree=3, eigen_solver='auto', fit_inverse_transform=False, gamma=None, kernel='linear', kernel_params=None, max_iter=None, n_components='mle', remove_zero_eig=False, tol=0)
# KernelPCA scores: project X with the fitted model, keep the first two score
# columns and attach the species label for plotting.
score = pd.concat(
    [pd.DataFrame(kpca.transform(X)[:, :2]), df['Species']],
    axis=1,
)
score.columns = ['var1', 'var2', 'species']
score.head()
var1 | var2 | species | |
---|---|---|---|
0 | -2.684207 | 0.326607 | setosa |
1 | -2.715391 | -0.169557 | setosa |
2 | -2.889820 | -0.137346 | setosa |
3 | -2.746437 | -0.311124 | setosa |
4 | -2.728593 | 0.333925 | setosa |
from ggplot import *
# Scatter plot of the first two KernelPCA score columns, colored by species.
(
    ggplot(score, aes('var1', 'var2', color='species'))
    + geom_point()
)
<ggplot: (279554953)>