PCA
#############################################################
###################### #
# R example code for principal components analysis: #
###################### #
#############################################################
## Data set 1:
# The built-in R data set called state contains a multivariate data matrix.
# For information, type:
help(state)
## starting httpd help server ... done
# To see the data matrix (containing variables for the 50 states, as of 1977):
state.x77[1:5,]
## Population Income Illiteracy Life Exp Murder HS Grad Frost Area
## Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
## California 21198 5114 1.1 71.71 10.3 62.6 20 156361
# Creating a data frame out of a matrix:
state.x77.df <- data.frame(state.x77)
names(state.x77.df) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area")
attach(state.x77.df)
## Data set 2:
# We can read the Bumpus bird data from the Internet:
bumpbird <- read.table("http://www.stat.sc.edu/~hitchcock/bumpusbird.txt", header=T)
# If not connected to the Internet, we could save the file and read it
# similarly to the following:
# bumpbird <- read.table("Z:/stat_530/bumpusbird.txt", header=T)
# Changing the names of the variables to something more descriptive:
names(bumpbird) <- c("ID", "tot.length", "alar.length", "beak.head.length", "humerus.length", "keel.stern.length")
attach(bumpbird)
############################################
############################################
# Principal Components Analysis of the State data:
state.pc <- princomp(state.x77.df, cor=T)
# Showing the coefficients of the components:
summary(state.pc,loadings=T)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.8970755 1.2774659 1.0544862 0.84113269 0.62019488
## Proportion of Variance 0.4498619 0.2039899 0.1389926 0.08843803 0.04808021
## Cumulative Proportion 0.4498619 0.6538519 0.7928445 0.88128252 0.92936273
## Comp.6 Comp.7 Comp.8
## Standard deviation 0.55449226 0.3800642 0.33643379
## Proportion of Variance 0.03843271 0.0180561 0.01414846
## Cumulative Proportion 0.96779544 0.9858515 1.00000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## Popul 0.126 0.411 0.656 0.409 0.406 0.219
## Income -0.299 0.519 0.100 -0.638 -0.462
## Illit 0.468 -0.353 -0.387 0.620 0.339
## LifeExp -0.412 0.360 -0.443 0.327 -0.219 0.256 -0.527
## Murder 0.444 0.307 -0.108 0.166 -0.128 0.325 0.295 -0.678
## HSGrad -0.425 0.299 -0.232 0.645 0.393 0.307
## Frost -0.357 -0.154 -0.387 0.619 0.217 -0.213 0.472
## Area 0.588 -0.510 -0.201 0.499 -0.148 -0.286
# Showing the eigenvalues of the correlation matrix:
(state.pc$sdev)^2
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## 3.5988956 1.6319192 1.1119412 0.7075042 0.3846417 0.3074617 0.1444488 0.1131877
# A scree plot:
plot(1:(length(state.pc$sdev)), (state.pc$sdev)^2, type='b',
main="Scree Plot", xlab="Number of Components", ylab="Eigenvalue Size")
# Where does the "elbow" occur?
# What seems to be a reasonable number of PCs to use?
# Plotting the PC scores for the sample data in the space of the first two principal components:
par(pty="s")
plot(state.pc$scores[,1], state.pc$scores[,2], ylim=range(state.pc$scores[,1]),
xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with state abbreviations:
text(state.pc$scores[,1], state.pc$scores[,2], labels=state.abb, cex=0.7, lwd=2)
# We see the Southeastern states grouped in the bottom left
# and several New England states together in the bottom right.
# The same plot, but without restricting the y axis limits:
par(pty="s")
plot(state.pc$scores[,1], state.pc$scores[,2],
xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with state abbreviations:
text(state.pc$scores[,1], state.pc$scores[,2], labels=state.abb, cex=0.7, lwd=2)
# The biplot can add information about the variables to the plot of the first two PC scores:
biplot(state.pc,xlabs=state.abb)
# Note the Area vector (arrow) goes almost totally in the direction of the second PC.
############################################
############################################
# Principal Components Analysis of the Bird data:
# We ignore the first column (the IDs)
bird.pc <- princomp(bumpbird[,-1], cor=T)
# Showing the coefficients of the components:
summary(bird.pc,loadings=T)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.9015726 0.7290433 0.62163056 0.5491498 0.4056199
## Proportion of Variance 0.7231957 0.1063008 0.07728491 0.0603131 0.0329055
## Cumulative Proportion 0.7231957 0.8294965 0.90678139 0.9670945 1.0000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## tot.length 0.452 0.690 0.420 0.374
## alar.length 0.462 -0.300 0.341 -0.548 -0.530
## beak.head.length 0.451 -0.325 -0.454 0.606 -0.343
## humerus.length 0.471 -0.185 -0.411 -0.388 0.652
## keel.stern.length 0.398 0.876 -0.178 -0.192
# The first component seems to be a very general size component,
# but the second component really picks up the variability in keel and sternum length.
# A scree plot:
plot(1:(length(bird.pc$sdev)), (bird.pc$sdev)^2, type='b',
main="Scree Plot", xlab="Number of Components", ylab="Eigenvalue Size")
# Where does the "elbow" occur?
# What seems to be a reasonable number of PCs to use?
# Plotting the PC scores for the sample data in the space of the first two principal components:
par(pty="s")
plot(bird.pc$scores[,1], bird.pc$scores[,2], ylim=range(bird.pc$scores[,1]),
xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with IDs for birds:
text(bird.pc$scores[,1], bird.pc$scores[,2], labels=ID, cex=0.7, lwd=2)
# Interestingly, birds 1 to 21 survived a storm in RI
# while birds 22 to 49 died.
# See the survivors (red) and the deceased (blue) separated by color.
par(pty="s")
plot(bird.pc$scores[,1], bird.pc$scores[,2], ylim=range(bird.pc$scores[,1]),
xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with IDs for birds:
text(bird.pc$scores[,1], bird.pc$scores[,2], labels=ID, cex=0.7, lwd=2,
col=c(rep("red", times = 21), rep("blue", times=28)) )
# The medium-sized birds seem to have had the best survival chances!
# The biplot can add information about the variables to the plot of the first two PC scores:
biplot(bird.pc,xlabs=ID)
# Expanding the plotting window a bit:
# Don't run this:
#biplot(bird.pc,xlabs=ID, xlim=c(-0.4,0.4), ylim=c(-0.4,0.4))
############################################
############################################
# Inference about Principal Components:
# CIs for eigenvalue(s) of Sigma:
# (Note these eigenvalues will be different from the eigenvalues in the PCA done above,
# which was based on the *correlation* matrix R, not on S.)
# Copy this function into R first:
evalue.CI <- function(datamatrix, labelvec, m=length(labelvec), conf.level=0.95){
alpha<- 1-conf.level
n<-nrow(datamatrix)
z<-qnorm(alpha/(2*m),lower=F)
lambdas<- princomp(datamatrix)$sdev^2
print(lambdas)
LCL<-lambdas[labelvec]/(1+z*sqrt(2/n))
UCL<-lambdas[labelvec]/(1-z*sqrt(2/n))
CIs<-cbind(LCL,UCL)
return(CIs)
}
# 95% CI for variance of the first population PC:
evalue.CI(bumpbird[,-1],1)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## 34.60482313 4.52812344 0.61804191 0.30640029 0.07593901
## LCL UCL
## Comp.1 24.78904 57.29015
# Joint 95% CIs for variances of the first two population PCs:
evalue.CI(bumpbird[,-1],c(1,2))
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## 34.60482313 4.52812344 0.61804191 0.30640029 0.07593901
## LCL UCL
## Comp.1 23.818879 63.243476
## Comp.2 3.116757 8.275559
# Note the first interval is wider because the FAMILYWISE confidence coefficient is 95% here.
# 99% CI for variance of the first population PC:
evalue.CI(bumpbird[,-1],1, conf.level=0.99)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## 34.60482313 4.52812344 0.61804191 0.30640029 0.07593901
## LCL UCL
## Comp.1 22.7604 72.15292