In this lesson students will …
Principal components analysis (PCA) is often used as a dimension reduction technique because it uses linear algebra to find a relatively small number of linear combinations of variables that explain most of the variability.
If we have \(p\) variables \(x_1, x_2, \dots, x_p\), a PCA produces \(z_1, z_2, \dots, z_k\), linear combinations of the original variables:
\[z_1 = \phi_{11}x_1+\phi_{12}x_2+\dots +\phi_{1p}x_p\] \[z_2 = \phi_{21}x_1+\phi_{22}x_2+\dots +\phi_{2p}x_p\] \[\vdots\]
\[z_k = \phi_{k1}x_1+\phi_{k2}x_2+\dots +\phi_{kp}x_p\] where \(k=\min(n,p)\). These components are ordered by the amount of variability that they explain, and the hope is that the first few components explain a majority of the variability. The big idea here is that if all the variables were independent, we would need information from every variable; however, if there is correlation between the variables, we can leverage it to describe the data with fewer components.
We will assume that the variables \(x_1, x_2, \dots, x_p\) are scaled so that each has mean zero and standard deviation one. Our goal is to find the component loading vectors \((\phi_{i1}, \phi_{i2}, \dots, \phi_{ip})\).
Recall that for a square \(n\times n\) matrix \(\mathbf{A}\), \[\mathbf{A} \mathbf{v}=\lambda \mathbf{v},\] where \(\mathbf{v}\) is an eigenvector and \(\lambda\) is its corresponding eigenvalue.
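To make this concrete, here is a minimal check in R (the matrix values below are just an illustration, not tied to the mouse example that follows):
# A small symmetric matrix, chosen only to illustrate A v = lambda v
A <- matrix(c(2, 1,
              1, 2), nrow = 2)
eA <- eigen(A)
v1 <- eA$vectors[, 1]   # first eigenvector
l1 <- eA$values[1]      # first eigenvalue
A %*% v1                # these two results should match,
l1 * v1                 # confirming A v = lambda v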
“The eigenvectors and eigenvalues of a covariance (or correlation) matrix represent the “core” of a PCA: The eigenvectors (principal components) determine the directions of the new feature space, and the eigenvalues determine their magnitude. In other words, the eigenvalues explain the variance of the data along the new feature axes.”
In this pretend example inspired by StatQuest, imagine that we have measurements on two genes for six lab mice.
##### GENES AND MICE
sq<-data.frame(mouse=1:6,
gene1=c(10, 11, 8, 3, 2, 1),
gene2=c(6, 4, 5, 3, 2.8, 1))
library(tidyverse)
ggplot(sq, aes(x=gene1, y=gene2))+
geom_point()+
xlim(c(0, 11))+
ylim(c(0, 6))+
coord_equal()
Data is centered (and sometimes scaled) for PCA.
## SCALED
mouse.scaled <- apply(sq[,2:3], 2, scale)
scaledDF<-as.data.frame(mouse.scaled )
ggplot(scaledDF, aes(x=gene1, y=gene2))+
geom_point()
### COVARIANCE MAT
cov.mouse <- cov(mouse.scaled)
head(cov.mouse)
## gene1 gene2
## gene1 1.0000000 0.8423358
## gene2 0.8423358 1.0000000
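Because the variables were standardized first, this covariance matrix matches the correlation matrix of the original gene measurements; a quick check:
# With standardized variables, covariance equals correlation
cor(sq[, c("gene1", "gene2")])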
#eigenvalues and eigenvectors
#of the covariance matrix
ev.mouse <- eigen(cov.mouse)
head(ev.mouse)
## $values
## [1] 1.8423358 0.1576642
##
## $vectors
## [,1] [,2]
## [1,] 0.7071068 -0.7071068
## [2,] 0.7071068 0.7071068
The ratio between gene1 and gene2 in the first principal component is 1:1, so the first component lies along the line \(y=x\). The ratio in the second principal component is -1:1, so the second component lies along \(y=-x\).
ggplot(scaledDF, aes(x=gene1, y=gene2))+
geom_point()+
geom_abline(intercept=0, slope=1)+
geom_abline(intercept=0, slope=-1)
pc12 <- ev.mouse$vectors[,1:2] %>%
  as.data.frame() %>%
  cbind(variable=c("gene1", "gene2"))

dim12 <- mouse.scaled %>%
  as.data.frame() %>%
  cbind(ID=1:dim(mouse.scaled)[1]) %>%
  gather("variable", "scaledVal", -ID) %>%
  left_join(pc12) %>%
  mutate(dim1=scaledVal*V1,
         dim2=scaledVal*V2) %>%
  group_by(ID) %>%
  summarise(z1=sum(dim1),
            z2=sum(dim2))
## Joining, by = "variable"
ggplot(dim12, aes(x=z1, y=z2))+
geom_point()+
geom_hline(yintercept = 0)+
geom_vline(xintercept = 0)
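As a sanity check (not part of the original walkthrough), prcomp() on the scaled data should reproduce the hand-built scores in dim12, possibly with flipped signs, since eigenvectors are only determined up to sign:
# Compare the eigen-based scores with prcomp()
mouse.pca <- prcomp(mouse.scaled)   # data already centered and scaled
mouse.pca$rotation                  # compare with ev.mouse$vectors
head(mouse.pca$x)                   # compare with dim12$z1 and dim12$z2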
From https://bookdown.org/tpinto_home/Unsupervised-learning/#dataset-used-in-the-examples
The dataset bdiag.csv contains quantitative information from digitized images of a diagnostic test (fine needle aspirate (FNA) test on breast mass) for the diagnosis of breast cancer. The variables describe characteristics of the cell nuclei present in the image.
Variable information: the data include a diagnosis label (malignant or benign) and ten real-valued features computed for each cell nucleus: radius, texture, perimeter, area, smoothness, compactness, concavity, concave points, symmetry, and fractal dimension.
set.seed(1974)
bdiag<- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/bdiag.csv", stringsAsFactors = TRUE)
library(GGally)
bdiagGG<-bdiag[,c("radius_mean", "texture_mean",
"perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean",
"concavity_mean", "concave.points_mean", "symmetry_mean", "fractal_dimension_mean", "diagnosis")]
ggpairs(bdiagGG,
ggplot2::aes(color=factor(diagnosis)))
bdiag.pca.data <- bdiag[,c("radius_mean", "texture_mean",
"perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean",
"concavity_mean", "concave.points_mean", "symmetry_mean", "fractal_dimension_mean")]
bdiag.pca <- prcomp(bdiag.pca.data, scale=TRUE)
#install.packages("factoextra")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
summary(bdiag.pca) #variance explained
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3406 1.5870 0.93841 0.7064 0.61036 0.35234 0.28299
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241 0.00801
## Cumulative Proportion 0.5479 0.7997 0.88779 0.9377 0.97495 0.98736 0.99537
## PC8 PC9 PC10
## Standard deviation 0.18679 0.10552 0.01680
## Proportion of Variance 0.00349 0.00111 0.00003
## Cumulative Proportion 0.99886 0.99997 1.00000
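The proportions reported above can also be recovered by hand from the standard deviations stored in the prcomp object; a quick sketch:
# Proportion of variance explained, computed from the component sds
pve <- bdiag.pca$sdev^2 / sum(bdiag.pca$sdev^2)
round(pve, 4)          # matches "Proportion of Variance"
round(cumsum(pve), 4)  # matches "Cumulative Proportion"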
#install.packages("tidyr")
library(tidyr)
fviz_eig(bdiag.pca) #scree plot
#loading vectors
fviz_pca_var(bdiag.pca, col.var = "steelblue") # comp 1 vs 2
fviz_pca_var(bdiag.pca, col.var = "steelblue",axes=c(1,3))
fviz_pca_var(bdiag.pca, col.var = "steelblue",axes=c(2,3))
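If you want the numbers behind these variable plots, the loadings are stored in the rotation matrix; for example, for the first two components:
# Loadings (rotation matrix) for the first two principal components
round(bdiag.pca$rotation[, 1:2], 3)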
groups <- bdiag$diagnosis
fviz_pca_ind(bdiag.pca,
col.ind = groups, # color by groups
palette = c("blue", "red"),
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
repel = FALSE,
label="none",
axes=c(1,2)
)
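Equivalently, you can build this scores plot yourself from the score matrix returned by prcomp (a sketch using ggplot2, with colors left to the defaults):
# Scores on the first two components, colored by diagnosis
scores <- as.data.frame(bdiag.pca$x[, 1:2])
scores$diagnosis <- bdiag$diagnosis
ggplot(scores, aes(x = PC1, y = PC2, color = diagnosis)) +
  geom_point(alpha = 0.6)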
#install.packages("jpeg")
#install.packages("graphics")
library(jpeg)
library(graphics)
bearcat <- readJPEG("/cloud/project/bearcat.jpeg")[,,2]
par(mfrow=c(1,1))
plot(1:2, type='n', axes=F, ann=F)
rasterImage(bearcat, 1, 1, 2, 2)
dim(bearcat)
## [1] 250 400
#install.packages("gplots")
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
#color for the heatmap
col.correlation <- colorRampPalette(c("red","yellow","darkgreen"),
space = "rgb")(30)
heatmap.2(cor(bearcat),
Rowv = F, Colv = F,
dendrogram = "none",
trace="none",
col=col.correlation)
library(factoextra)
#PCA
bearcat.pca <- prcomp(bearcat, center = FALSE)
fviz_eig(bearcat.pca)
#The intensities given by the first 5 components
bearcat.pca5 <- bearcat.pca$x[,1:5] %*% t(bearcat.pca$rotation[,1:5])
#this is just to make sure all the values
#remain within 0 and 1 (due to rounding in the
#calculation, sometimes the values go slightly higher than 1)
bearcat.pca5[bearcat.pca5>1] <-1
bearcat.pca5[bearcat.pca5<0] <-0
#You can have a look at the image
par(mfrow=c(1,1))
plot(1:2, type='n', axes=F, ann=F)
title ("original")
rasterImage(bearcat, 1, 1, 2, 2)
plot(1:2, type='n', axes=F, ann=F)
title("Image with 5 components from PCA")
rasterImage(bearcat.pca5, 1, 1, 2, 2)
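A rough sense of the compression this buys (a back-of-the-envelope count, assuming we store only the first five score and rotation columns; no column means are needed since center = FALSE was used):
# Values needed for a 5-component reconstruction versus the full image
n_scores   <- nrow(bearcat.pca$x) * 5        # 250 x 5 score matrix
n_loadings <- nrow(bearcat.pca$rotation) * 5 # 400 x 5 rotation matrix
c(compressed = n_scores + n_loadings, original = length(bearcat))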
More principal components
# Intensities with 10, 20, 50 and 250 components
bearcat.pca.j <- lapply(c(10, 20, 50, 250), function(j) {
jcomp <- bearcat.pca$x[,1:j] %*% t(bearcat.pca$rotation[,1:j])
jcomp[jcomp>1] <-1
jcomp[jcomp<0] <-0
return(jcomp)
}
)
par(mfrow=c(1,1))
plot(1:2, type='n', axes=F, ann=F)
title ("10 components")
rasterImage(bearcat.pca.j[[1]], 1, 1, 2, 2)
plot(1:2, type='n', axes=F, ann=F)
title("20 components")
rasterImage(bearcat.pca.j[[2]], 1, 1, 2, 2)
plot(1:2, type='n', axes=F, ann=F)
title("50 components")
rasterImage(bearcat.pca.j[[3]], 1, 1, 2, 2)
plot(1:2, type='n', axes=F, ann=F)
title("250 components (original image)")
rasterImage(bearcat.pca.j[[4]], 1, 1, 2, 2)
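To connect these reconstructions back to variance explained, you can check the cumulative proportion captured at each cutoff (a quick sketch; note that with center = FALSE this is a proportion of total sum of squares rather than variance in the strict sense):
# Cumulative proportion captured by the first j components
pve.img <- bearcat.pca$sdev^2 / sum(bearcat.pca$sdev^2)
round(cumsum(pve.img)[c(10, 20, 50, 250)], 4)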