# special IPython command to prepare the notebook for matplotlib
%matplotlib inline
from __future__ import division  # true division everywhere; must come before other code
import numpy as np
import scipy
import scipy.linalg  # needed below for scipy.linalg.svd
import scipy.stats as stats
from scipy.stats import binom
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt # module for plotting
from mpl_toolkits.mplot3d import Axes3D # 3D plotting
import datetime as dt # module for manipulating dates and times
import requests
import statsmodels.api as sm
import re
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
import sklearn
import sklearn.preprocessing
import sklearn.datasets
#nice defaults for matplotlib
from matplotlib import rcParams
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
(0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv"
X = pd.read_csv(url_exprs, index_col=0)
url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv"
sampleid = pd.read_csv(url_sampleinfo)
a = list(sampleid.filename)
b = list(X.columns)
matchIndex = [b.index(x) for x in a]
X = X.iloc[:, matchIndex]  # reorder columns positionally to match sampleinfo
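As a quick sanity check (a minimal sketch; it assumes every filename in `sampleinfo` appears exactly once among the expression columns), the columns should now follow the sampleinfo order:
#True if the columns match sampleinfo's filename order
print(list(X.columns) == list(sampleid.filename))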
Our data are usually stored in matrices.
In the gene expression example, the data matrix $X$ is $8793 \times 208$ (genes by samples).
Say you want to compute the average of each column:
A = np.repeat(1/float(X.shape[0]), X.shape[0]).reshape(1, X.shape[0])
colmeans = np.dot(A, X)
print "Column averages:", np.round(colmeans, 2)[0]
pd.DataFrame(np.array([X.columns.values, np.round(colmeans, 2)[0]]).transpose(),
             columns = ["Sample", "Average"]).head(15)
Column averages: [ 5.74 5.75 5.69 5.73 5.72 ..., 5.76 5.74 5.73 5.74 5.73]
| | Sample | Average |
|---|---|---|
| 0 | GSM25349.CEL.gz | 5.74 |
| 1 | GSM25350.CEL.gz | 5.75 |
| 2 | GSM25356.CEL.gz | 5.69 |
| 3 | GSM25357.CEL.gz | 5.73 |
| 4 | GSM25358.CEL.gz | 5.72 |
| 5 | GSM25359.CEL.gz | 5.72 |
| 6 | GSM25360.CEL.gz | 5.72 |
| 7 | GSM25361.CEL.gz | 5.75 |
| 8 | GSM25377.CEL.gz | 5.62 |
| 9 | GSM25378.CEL.gz | 5.73 |
| 10 | GSM25385.CEL.gz | 5.72 |
| 11 | GSM25386.CEL.gz | 5.72 |
| 12 | GSM25399.CEL.gz | 5.62 |
| 13 | GSM25400.CEL.gz | 5.71 |
| 14 | GSM25401.CEL.gz | 5.68 |

15 rows × 2 columns
Plot the distribution of the averages:
plt.hist(colmeans[0], bins = np.arange(5.60, 5.86, 0.02))
plt.xlim(5.60, 5.84)
plt.xticks(np.arange(5.60, 5.86, 0.04))
plt.xlabel("Average")
plt.ylabel("Frequency")
plt.title("Histogram of colmeans")
plt.show()
Let $X$ be a data matrix with $N$ rows and $p$ columns.
The vector of column averages can be written as $AX$, with $A=(\frac{1}{N},\dots,\frac{1}{N})$.
Here are the matrix operations: $$AX = \begin{pmatrix} \frac{1}{N} & \frac{1}{N} & \dots & \frac{1}{N} \end{pmatrix} \begin{pmatrix} X_{1,1}&\dots & X_{1,p} \\ X_{2,1}&\dots & X_{2,p} \\ & \vdots & \\ X_{N,1}&\dots & X_{N,p} \\ \end{pmatrix}$$
$$ = \begin{pmatrix} \bar{X}_1 & \bar{X}_2 & \dots &\bar{X}_p \end{pmatrix} $$

Here are the **row averages**:
$$ XB = \begin{pmatrix} X_{1,1}&\dots & X_{1,p} \\ X_{2,1}&\dots & X_{2,p} \\ & \vdots & \\ X_{N,1}&\dots & X_{N,p} \\ \end{pmatrix} \begin{pmatrix} 1/p\\ 1/p\\ \vdots\\ 1/p\\ \end{pmatrix}= \begin{pmatrix} \bar{X}_1 \\ \bar{X}_2 \\ \vdots\\ \bar{X}_N\\ \end{pmatrix} $$

This notation is convenient and fast to code.
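To mirror the column-average computation above, here is a minimal sketch of the row averages as the matrix product $XB$ (`B_row` is a name introduced here for illustration):
#row averages as one matrix product: B_row is p x 1 with entries 1/p
p = X.shape[1]
B_row = np.repeat(1/float(p), p).reshape(p, 1)
rowmeans = np.dot(X, B_row)
#should agree with the row means pandas computes directly
print(np.allclose(rowmeans[:, 0], X.mean(axis=1)))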
sex = sampleid['sex'] == 'M'
A = np.repeat(1/float(X.shape[1]), X.shape[1]).reshape(1, X.shape[1])[0]
B = [1./sum(sex) if x else -1./sum(1 - sex) for x in sex]
pd.DataFrame([A, B], columns = sampleid.filename.values)
| | GSM25349.CEL.gz | GSM25350.CEL.gz | GSM25356.CEL.gz | GSM25357.CEL.gz | ... |
|---|---|---|---|---|---|
| 0 | 0.004808 | 0.004808 | 0.004808 | 0.004808 | ... |
| 1 | 0.008197 | 0.008197 | 0.008197 | 0.008197 | ... |

2 rows × 208 columns
with
$$ a_j = \begin{cases} -1/N_F & \text{if sample } j \text{ is female} \\ 1/N_M & \text{if sample } j \text{ is male} \end{cases} $$
where $N_F$ is the number of females and $N_M$ is the number of males.
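Applying the contrast $B$ defined above to the expression matrix gives, in a single matrix product, the male-minus-female average difference for every gene (a sketch using the `X` and `B` already in memory):
#one multiplication yields all 8793 group differences at once
diff = np.dot(X, B)
print(np.round(diff[:5], 3))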
We used regression with our Baseball data. For example:
$$ Y_i= \beta_0 + X_{i}\beta_1 + \varepsilon_i, \quad i=1,\dots,N $$
with $Y_i$ = runs for team $i$ and $X_{i}$ = walks (BB) for team $i$.
We need to find the $\beta_0,\beta_1$ pair that minimizes
$$\sum_{i=1}^N (Y_i- \beta_0 - X_i\beta_1)^2$$
How do we do this?
We can write it like this:
$$ \begin{pmatrix} Y_1\\ Y_2\\ \vdots\\ Y_N\\ \end{pmatrix} = \begin{pmatrix} 1\\ 1\\ \vdots \\ 1\\ \end{pmatrix} \beta_0 + \begin{pmatrix} X_{1}\\ X_{2}\\ \vdots \\ X_{N}\\ \end{pmatrix} \beta_1+ \begin{pmatrix} \varepsilon_1\\ \varepsilon_2\\ \vdots\\ \varepsilon_N\\ \end{pmatrix} $$
The vector $Y$ is a point in $\mathbb{R}^N$.
The $1$ and $X$ vectors span a subspace; each of its points is determined by a pair $(\beta_0,\beta_1)\in \mathbb{R}^2$.
Start with the simplest case: $\vec{y} \in \mathbb{R}^N$, and $L \subset \mathbb{R}^N$ is the space spanned by
$$\vec{v}=\begin{pmatrix} 1\\ \vdots \\ 1\end{pmatrix}; \quad L = \{ c \vec{v}; c \in \mathbb{R}\}$$
What $c$ minimizes the distance between $c\vec{v}$ and $\vec{y}$?
Answer: The projection of $\vec{y}$ onto $L$: $$ \mbox{Proj}_L(\vec{y}) = \hat{c} \vec{v} $$
So how do we find $\hat{c}$?
The projection is orthogonal to the space so:
$$ (\vec{y}-\hat{c}\vec{v}) \cdot \vec{v} = 0 $$
$$ \vec{y}\cdot\vec{v} - \hat{c}\,\vec{v}\cdot\vec{v} = 0 $$
$$ \hat{c} = \frac{\vec{y}\cdot\vec{v}}{\vec{v}\cdot\vec{v}} $$
Going back to our original notation, this means $\hat{\beta}_0 = \bar{Y}$.
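A quick numeric sketch (with hypothetical random data) that the projection coefficient onto the all-ones vector is just the sample mean:
#c_hat = (y . v)/(v . v); with v the all-ones vector this is the mean
y_demo = np.random.normal(size=10)
v = np.ones(10)
c_hat = np.dot(y_demo, v) / np.dot(v, v)
print(np.allclose(c_hat, y_demo.mean()))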
In this case calculus would have been just as easy:
$$\frac{\partial}{\partial\beta_0}\sum_{i=1}^N (Y_i - \beta_0)^2 = 0 \implies -2 \sum_{i=1}^N (Y_i - \beta_0) = 0 \implies$$
$$ N \beta_0 = \sum_{i=1}^N Y_i \implies \hat{\beta}_0=\bar{Y}$$
Now let's make it slightly more complicated.
The space is now
$$ L = \{ \beta_0 \vec{v}_0 + \beta_1 \vec{v}_1 ;\ \vec{\beta}=(\beta_0,\beta_1) \in \mathbb{R}^2 \}$$
with
$$ \vec{v}_0= \begin{pmatrix} 1\\ 1\\ \vdots \\ 1\\ \end{pmatrix} \mbox{ and } \vec{v}_1= \begin{pmatrix} X_{1}\\ X_{2}\\ \vdots \\ X_{N}\\ \end{pmatrix} $$
Next, the transpose: the columns become the rows.
$$X = \begin{pmatrix} X_{1,1}&\dots & X_{1,p} \\ X_{2,1}&\dots & X_{2,p} \\ & \vdots & \\ X_{N,1}&\dots & X_{N,p} \\ \end{pmatrix} , \, X^\top = \begin{pmatrix} X_{1,1}&\dots & X_{N,1} \\ X_{1,2}&\dots & X_{N,2} \\ & \vdots & \\ X_{1,p}&\dots & X_{N,p} \\ \end{pmatrix} $$
This helps us define projections, but also:
If $X$ is an $N\times p$ matrix of features whose columns have average 0, the sample covariance matrix is
$$ S = \frac{1}{N} X^\top X $$
Now let $X = [ \vec{v}_0 \,\, \vec{v}_1]$. It's an $N\times 2$ matrix.
Any point in $L$ can be written as $X\vec{\beta}$.
The orthogonal projection must satisfy
$X^\top (\vec{y}-X\vec{\beta}) = 0$
$$X^\top X \vec{\beta}= X^\top \vec{y} $$
$$\vec{\beta}= (X^\top X)^{-1}X^\top \vec{y}$$
In general, we can write this
$$ Y_i= \beta_0 + X_{i,1}\beta_1 + \dots + X_{i,p} \beta_p + \varepsilon_i, \quad i=1,\dots,N $$
as
$$ \begin{pmatrix} Y_1\\ Y_2\\ \vdots\\ Y_N\\ \end{pmatrix} = \begin{pmatrix} 1\\ 1\\ \vdots \\ 1\\ \end{pmatrix} \beta_0 + \begin{pmatrix} X_{1,1}\\ X_{2,1}\\ \vdots \\ X_{N,1}\\ \end{pmatrix} \beta_1+ \dots + \begin{pmatrix} X_{1,p}\\ X_{2,p}\\ \vdots \\ X_{N,p}\\ \end{pmatrix}\beta_p + \begin{pmatrix} \varepsilon_1\\ \varepsilon_2\\ \vdots\\ \varepsilon_N\\ \end{pmatrix} $$
We can simplify to:
$$Y = X\beta + \varepsilon$$
with
$$ Y = \begin{pmatrix} Y_1\\ \vdots\\ Y_N\\ \end{pmatrix}, \, X=\begin{pmatrix} 1&X_{1,1}&\dots&X_{1,p}\\ 1&X_{2,1}&\dots&X_{2,p}\\ & \vdots & \\ 1&X_{N,1}&\dots&X_{N,p} \end{pmatrix}, \, \beta = \begin{pmatrix} \beta_0\\ \beta_1\\ \vdots\\ \beta_p \end{pmatrix}, \, \varepsilon = \begin{pmatrix} \varepsilon_1\\ \vdots\\ \varepsilon_N \end{pmatrix} $$
The solution is still
$$\hat{\beta} = (X^\top X)^{-1} X^\top Y$$
Read in the data:
zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content
zip_files = StringIO()
zip_files.write(zip_folder)
csv_files = ZipFile(zip_files)
teams = read_csv(csv_files.open('Teams.csv'))
dat = teams[(teams['G']==162) & (teams['yearID'] < 2002)].copy()
dat['Singles'] = dat['H']-dat['2B']-dat['3B']-dat['HR']
dat = dat[['R','Singles', 'HR', 'BB']]
dat.head(5)
| | R | Singles | HR | BB |
|---|---|---|---|---|
| 437 | 505 | 997 | 11 | 344 |
| 1366 | 744 | 902 | 189 | 681 |
| 1367 | 683 | 989 | 90 | 580 |
| 1377 | 817 | 1041 | 199 | 584 |
| 1379 | 718 | 973 | 137 | 602 |

5 rows × 4 columns
Get X and Y:
y = dat.R
X = np.column_stack((np.repeat(1, dat.shape[0]) , dat.BB))
print X
[[ 1 344] [ 1 681] [ 1 580] ..., [ 1 519] [ 1 678] [ 1 511]]
Solve for $\beta$:
beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y)
print beta
[ 326.82416279 0.71264016]
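Explicitly forming the inverse works at this size, but it is usually numerically safer to let the library solve the least-squares problem directly; np.linalg.lstsq should recover the same coefficients:
#cross-check: solve least squares without forming the inverse
beta_ls = np.linalg.lstsq(X, y)[0]
print(beta_ls)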
The projection:
proj = X.dot(beta)
print proj
[ 571.97237855 812.13211317 740.1554568 ..., 696.68440691 809.99419269 690.98328562]
plt.scatter(X[:,1], y)
plt.plot(X[:,1],proj)
plt.xlabel('BB')
plt.ylabel('R')
plt.show()
Now let's add Home Runs:
y = dat.R
X = np.column_stack((np.repeat(1, dat.shape[0]) , dat.BB, dat.HR))
print X
[[ 1 344 11] [ 1 681 189] [ 1 580 90] ..., [ 1 519 206] [ 1 678 161] [ 1 511 213]]
beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y)
print beta
beta.shape
[ 287.72267563 0.38971779 1.52204481]
(3,)
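Since statsmodels is already imported, we can cross-check the same fit with its OLS routine (a sketch; `X` already contains the intercept column we built by hand):
#the OLS parameters should match beta above
ols_fit = sm.OLS(np.asarray(y), X).fit()
print(np.round(np.asarray(ols_fit.params), 8))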
new_proj = X.dot(beta)
len(new_proj)
len(y)
663
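As a hedged check that the extra predictor helps, adding HR should not increase the residual sum of squares relative to the BB-only fit (`proj`, from above):
#compare residual sums of squares: BB-only vs BB + HR
rss_bb = np.sum((np.asarray(y) - proj) ** 2)
rss_bb_hr = np.sum((np.asarray(y) - new_proj) ** 2)
print(rss_bb > rss_bb_hr)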
#this requires some 3D plotting
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,1], X[:,2], y)
ax.set_xlabel('BB')
ax.set_ylabel('HR')
ax.set_zlabel('R')
plt.show()
We have a data matrix with $N$ observations (rows) and $p$ columns.
We can write our data matrix as
$$X = U D V^\top , \mbox{ with } U^\top U = I, \; V^\top V = I, \mbox{ and } D \mbox{ diagonal.}$$
We are changing basis: $XV = UD$.
And because $V$ is orthogonal we can change back: $XVV^\top = X$.
As a toy example, build a matrix whose five columns are copies of the same random vector; it has rank 1:
Y1 = np.random.normal(0, size = (100,1))
Y = np.column_stack((Y1,Y1,Y1,Y1,Y1))
Y.shape
(100, 5)
If $Y$ really has rank 1, only the first singular value will be nonzero, and $Y$ can be rebuilt from the first components alone (`U[:,0] * d[0] * Vh[0,:]`); we verify this below.
#Singular Value Decomposition
#U are the left singular vectors
#d are the singular values
#U*d gives PCA scores
#Vh is the transpose V^T; its rows are the right singular vectors (PCA loadings)
U, d, Vh = scipy.linalg.svd(Y, full_matrices=False)
Print the singular values:
print np.round(d, 6)
[ 22.872792 0. 0. 0. 0. ]
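Only one singular value is nonzero, consistent with $Y$ having rank 1; numpy's rank routine (itself SVD-based) should agree:
#expected output: 1
print(np.linalg.matrix_rank(Y))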
U1 = U[:,0].reshape(100,1)
dV1 = (d[0] * Vh[0,:].reshape(1,5))
newY = np.dot(U1, dV1)
print dV1
[[-10.22902343 -10.22902343 -10.22902343 -10.22902343 -10.22902343]]
np.round(np.abs(Y-newY).max(),6)
0.0
Consider this simple linear combination:
$$ \Delta_j = (Y_{1,j}, \dots, Y_{N,j}) \cdot (\underbrace{1/N_1,\dots,1/N_1}_{N_1},\underbrace{-1/N_2,\dots,-1/N_2}_{N_2}) = \frac{1}{N_1}\sum_{i=1}^{N_1} Y_{i,j} - \frac{1}{N_2}\sum_{i=N_1+1}^{N} Y_{i,j}, \quad N_2=N-N_1$$
Try to find the $N_1$ that maximizes the variance
$$ \frac{1}{p}\sum_{j=1}^p \Delta_j^2 $$
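Before returning to the expression data, here is a minimal sketch of that search on synthetic data (everything below is hypothetical: the first 8 rows of `Y_demo` are shifted upward, so the split $N_1 = 8$ should score highest):
#score a split point N1 by the average squared contrast across columns
def contrast_score(Y, N1):
    delta = Y[:N1].mean(axis=0) - Y[N1:].mean(axis=0)
    return np.mean(delta ** 2)
np.random.seed(1)
Y_demo = np.random.normal(size=(20, 100))
Y_demo[:8] += 3  #hypothetical group structure
scores = [contrast_score(Y_demo, N1) for N1 in range(1, 20)]
print(np.argmax(scores) + 1)  #expect 8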
url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv"
exprs = pd.read_csv(url_exprs, index_col=0)
url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv"
sampleinfo = pd.read_csv(url_sampleinfo)
## Re-order the columns in the `exprs` DataFrame to match the order of the file names in the `sampleinfo` DataFrame.
fns = list(sampleinfo.filename)
cns = list(exprs.columns)
matchIndex = [cns.index(x) for x in fns]
exprs = exprs.iloc[:, matchIndex]  # reorder columns positionally to match sampleinfo
##Add a column to the `sampleinfo` DataFrame titled `days` containing the days since October 31, 2002.
sampleinfo["date"] = pd.to_datetime(sampleinfo.date)
oct31 = dt.datetime(2002,10,31,0,0)
sampleinfo["days"] = map(lambda x: (x - oct31).days, sampleinfo.date)
##Subset to CEU samples only.
sampleinfoCEU = sampleinfo[sampleinfo.ethnicity == "CEU"]
exprsCEU = exprs[sampleinfoCEU.filename]
###Remove the mean
X = exprsCEU.apply(lambda x: x - exprsCEU.mean(axis=1), axis=0)
X=X.T
X.shape
(102, 8793)
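Sanity check (a small sketch): after removing the mean, every gene, now a column of the transposed X, should average to numerically zero:
#largest absolute column mean; should be ~0 up to floating point error
print(np.abs(np.asarray(X.mean(axis=0))).max())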
**SVD**
#Singular Value Decomposition
#U are the left singular vectors
#d are the singular values
#U*d gives PCA scores
#Vh is the transpose V^T; its rows are the right singular vectors (PCA loadings)
U, d, Vh = scipy.linalg.svd(X,full_matrices=False)
Vh.shape
(102, 8793)
**First PC correlates with batch**
plt.scatter(sampleinfoCEU.days, U.transpose()[0,:])
plt.xlabel('Date sample was processed (number of days since Oct 31, 2002)')
plt.ylabel('PC1')
plt.title('Relationship between PC1 and the date the samples were processed')
plt.show()
rankDays = scipy.stats.mstats.rankdata(sampleinfoCEU.days)
plt.scatter(rankDays, U.transpose()[0,:])
plt.xlabel('Ranked date (number of days since Oct 31, 2002)')
plt.ylabel('PC1')
plt.title('Relationship between the PC1 and the ranked date')
plt.show()
plt.hist(U.transpose()[0,:], bins = np.arange(-0.2,0.3,0.05))
plt.xlabel('PC1')
plt.ylabel('Frequency')
plt.title('Distribution of the values from PC1')
plt.show()
#Load the dataset from sklearn's library of example datasets
iris = sklearn.datasets.load_iris()
#data are the features (here, flower measurements)
X = iris.data
#target are the classes we wish to recover (here, flower species)
Y = iris.target
Take only the petal length and petal width features and scale them.
X = X[:,2:4]
X = sklearn.preprocessing.scale(X, axis = 0)
plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.gca().set_aspect('equal')
plt.show()
The plot shows the data in the standard basis, i.e., projected onto $(1,0)$ and $(0,1)$. After scaling, each column has standard deviation 1:
np.apply_along_axis(np.std, 0, X)
array([ 1., 1.])
The line represents the space spanned by
$$\vec{v}=\begin{pmatrix}1/\sqrt{2}&-1/\sqrt{2}\end{pmatrix}$$
plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.plot([-2,2], [2, -2], c='black')
plt.gca().set_aspect('equal')
plt.show()
V = np.array([-1, 1]) / np.sqrt(2)
V = V.reshape(2,1)
dat = np.dot(np.dot(X, V), V.T)
plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.plot([-2,2], [2, -2], c="black")
plt.scatter(dat[:,0], dat[:,1], c='red')
plt.gca().set_aspect('equal')
plt.show()
np.std(np.dot(X, V))
0.19298420388475782
V = np.array([1, 1]) / np.sqrt(2)
V = V.reshape(2,1)
dat = np.dot(np.dot(X, V), V.T)
plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.plot([-2,2], [-2, 2], c="black")
plt.scatter(dat[:,0], dat[:,1], c='red')
plt.gca().set_aspect('equal')
plt.show()
np.std(np.dot(X, V))
1.4009843314794663
What is $\vec{v}$ here?
Note these two directions are orthogonal, so let
$$V=\frac{1}{\sqrt{2}}\begin{pmatrix} 1&1\\ 1&-1\\ \end{pmatrix} $$
$XV$ reports something proportional to the average and the difference.
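A quick check (a sketch using the scaled two-column iris `X` from above; `V2` is a name introduced here) that $XV$ yields scaled per-flower averages, and that the transformation inverts:
#first column of XV: sqrt(2) * average of the two features
V2 = np.array([[1, 1], [1, -1]]) / np.sqrt(2)
XV = np.dot(X, V2)
print(np.allclose(XV[:, 0], np.sqrt(2) * X.mean(axis=1)))
#change of basis inverts: X V V^T = X
print(np.allclose(np.dot(XV, V2.T), X))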
The orthogonality is convenient because we can change back and forth:
$$XVV^\top = X$$
**SVD gives us that transformation**
#Singular Value Decomposition
#U are the left singular vectors
#d are the singular values
#U*d gives PCA scores
#scipy returns the transpose V^T here; its rows are the right singular vectors (PCA loadings)
U, d, V = scipy.linalg.svd(X, full_matrices=False)
print V
[[ 0.70710678 0.70710678] [-0.70710678 0.70710678]]
#standard deviation of the data along each PC; matches the two stds computed above
np.sqrt((d ** 2) / X.shape[0])
array([ 1.40098433, 0.1929842 ])
#Plot components of left singular vectors
plt.scatter(U.transpose()[0],U.transpose()[1])
plt.xlabel('Projection onto first PC')
plt.ylabel('Projection onto second PC')
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.show()
Now drop one dimension and recover the data from the first component alone: $X \approx X\vec{v}_1\vec{v}_1^\top$, computed below as `U[:,0] * d[0] * V[0]`.
xnew = np.dot(U.transpose()[0][None].T, [d[0]*V[0]])
plt.scatter(xnew[:, 0], X[:, 0])
plt.plot(xnew[:, 0],xnew[:, 0])
plt.xlabel("Original 1st column")
plt.ylabel("Approximation")
plt.show()
plt.scatter(xnew[:, 1], X[:, 1])
plt.plot(xnew[:, 1],xnew[:, 1])
plt.xlabel("Original 2nd column")
plt.ylabel("Approximation")
plt.show()
#d^2 is proportional to the variance explained by the dimension
#For variance explained, normalize d^2
var_exp = d**2/sum(d**2)
print "The first component explains %0.2f%% of the variance."%(var_exp[0]*100)
The first component explains 98.14% of the variance.
We take all four features now and apply SVD:
#Load the dataset from sklearn's library of example datasets
iris = sklearn.datasets.load_iris()
#data are the features (here, flower measurements)
X = iris.data
#target are the classes we wish to recover (here, flower species)
Y = iris.target
train = np.array(range(0,35)+range(50,85)+range(100,135))
X = X[:, 0:4]
#column means of the training rows, written as a matrix product
colmeans = np.dot(np.matrix([np.zeros(len(train)) + 1/len(train)]), X[train])
colmeans = np.array(colmeans)[0]
#center every row (train and test) by the training column means
X = X - colmeans
names = {0:'setosa', 1:'versicolor', 2:'virginica'}
U, d, V = scipy.linalg.svd(X[train,], full_matrices=False)
plt.scatter(range(1,5), d**2/np.sum(d**2), s=70)
plt.xlabel("Dimension")
plt.ylabel("Percent explained")
plt.xlim(0.9, 4.1)
plt.ylim(-0.1,1)
plt.show()
colors = [dark2_colors[i] for i in Y[train]]
plt.scatter(U[:,0], U[:,1], color = colors, s=50)
x = np.array([-0.15, 0.15])
plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1)
plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.ylim(-0.3,0.3)
plt.show()
newd = np.diag(1/d)  #D^{-1}, used to scale projections onto the PCs
a = list(set(range(0,len(X))) - set(train))
a.sort()
newu = np.dot(X[a, ],V.T).dot(newd)
Obtain $V$ from $X_{train}=UDV^\top$, then project the test data onto the same PCs: $U_{test} = X_{test} V D^{-1}$.
colors = [dark2_colors[i] for i in Y[train]]
plt.scatter(U[:,0], U[:,1], color = colors, s=50)
colors = [dark2_colors[i] for i in Y[a]]
plt.scatter(newu[:,0], newu[:,1], s=70, marker = '^', facecolors=colors, edgecolors='k')
x = np.array([-0.15, 0.15])
plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1)
plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.ylim(-0.3,0.3)
plt.show()