PCA

#############################################################
######################                                      #           
# R example code for principal components analysis:         #
######################                                      #
#############################################################

## Data set 1:

# The built-in R data set called state contains a multivariate data matrix.
# For information, type:

help(state)
## starting httpd help server ... done
# To see the data matrix (containing variables for the 50 states, as of 1977):

state.x77[1:5,]
##            Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
## Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708
## Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432
## Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417
## Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945
## California      21198   5114        1.1    71.71   10.3    62.6    20 156361
# Creating a data frame out of a matrix:

state.x77.df <- data.frame(state.x77)

names(state.x77.df) <- c("Popul", "Income", "Illit", "LifeExp", "Murder", "HSGrad", "Frost", "Area")

attach(state.x77.df)

## Data set 2:

# We can read the Bumpus bird data from the Internet:

bumpbird <- read.table("http://www.stat.sc.edu/~hitchcock/bumpusbird.txt", header=T)

# If not connected to the Internet, we could save the file and read it 
# similarly to the following:

# bumpbird <- read.table("Z:/stat_530/bumpusbird.txt", header=T)

# Changing the names of the variables to something more descriptive:

names(bumpbird) <- c("ID", "tot.length", "alar.length", "beak.head.length", "humerus.length", "keel.stern.length")

attach(bumpbird)

############################################
############################################

# Principal Components Analysis of the State data:

state.pc <- princomp(state.x77.df, cor=T)

# Showing the coefficients of the components:

summary(state.pc,loadings=T)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
## Standard deviation     1.8970755 1.2774659 1.0544862 0.84113269 0.62019488
## Proportion of Variance 0.4498619 0.2039899 0.1389926 0.08843803 0.04808021
## Cumulative Proportion  0.4498619 0.6538519 0.7928445 0.88128252 0.92936273
##                            Comp.6    Comp.7     Comp.8
## Standard deviation     0.55449226 0.3800642 0.33643379
## Proportion of Variance 0.03843271 0.0180561 0.01414846
## Cumulative Proportion  0.96779544 0.9858515 1.00000000
## 
## Loadings:
##         Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## Popul    0.126  0.411  0.656  0.409  0.406                0.219
## Income  -0.299  0.519  0.100        -0.638 -0.462              
## Illit    0.468               -0.353        -0.387  0.620  0.339
## LifeExp -0.412         0.360 -0.443  0.327 -0.219  0.256 -0.527
## Murder   0.444  0.307 -0.108  0.166 -0.128  0.325  0.295 -0.678
## HSGrad  -0.425  0.299        -0.232         0.645  0.393  0.307
## Frost   -0.357 -0.154 -0.387  0.619  0.217 -0.213  0.472       
## Area            0.588 -0.510 -0.201  0.499 -0.148 -0.286
# Showing the eigenvalues of the correlation matrix:

(state.pc$sdev)^2
##    Comp.1    Comp.2    Comp.3    Comp.4    Comp.5    Comp.6    Comp.7    Comp.8 
## 3.5988956 1.6319192 1.1119412 0.7075042 0.3846417 0.3074617 0.1444488 0.1131877
# A scree plot:

plot(1:(length(state.pc$sdev)),  (state.pc$sdev)^2, type='b', 
     main="Scree Plot", xlab="Number of Components", ylab="Eigenvalue Size")

# Where does the "elbow" occur?
# What seems to be a reasonable number of PCs to use?

# Plotting the PC scores for the sample data in the space of the first two principal components:

par(pty="s")
plot(state.pc$scores[,1], state.pc$scores[,2], ylim=range(state.pc$scores[,1]), 
     xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with state abbreviations:
text(state.pc$scores[,1], state.pc$scores[,2], labels=state.abb, cex=0.7, lwd=2)

# We see the Southeastern states grouped in the bottom left 
# and several New England states together in the bottom right.

# The same plot, but without restricting the y axis limits:

par(pty="s")
plot(state.pc$scores[,1], state.pc$scores[,2], 
     xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with state abbreviations:
text(state.pc$scores[,1], state.pc$scores[,2], labels=state.abb, cex=0.7, lwd=2)

# The biplot can add information about the variables to the plot of the first two PC scores:

biplot(state.pc,xlabs=state.abb)

# Note the Area vector (arrow) goes almost totally in the direction of the second PC.

############################################
############################################

# Principal Components Analysis of the Bird data:
# We ignore the first column (the IDs)

bird.pc <- princomp(bumpbird[,-1], cor=T)

# Showing the coefficients of the components:

summary(bird.pc,loadings=T)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3    Comp.4    Comp.5
## Standard deviation     1.9015726 0.7290433 0.62163056 0.5491498 0.4056199
## Proportion of Variance 0.7231957 0.1063008 0.07728491 0.0603131 0.0329055
## Cumulative Proportion  0.7231957 0.8294965 0.90678139 0.9670945 1.0000000
## 
## Loadings:
##                   Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## tot.length         0.452         0.690  0.420  0.374
## alar.length        0.462 -0.300  0.341 -0.548 -0.530
## beak.head.length   0.451 -0.325 -0.454  0.606 -0.343
## humerus.length     0.471 -0.185 -0.411 -0.388  0.652
## keel.stern.length  0.398  0.876 -0.178        -0.192
# The first component seems to be a very general size component,
# but the second component really picks up the variability in keel and sternum length.

# A scree plot:

plot(1:(length(bird.pc$sdev)),  (bird.pc$sdev)^2, type='b',
     main="Scree Plot", xlab="Number of Components", ylab="Eigenvalue Size")

# Where does the "elbow" occur?
# What seems to be a reasonable number of PCs to use?

# Plotting the PC scores for the sample data in the space of the first two principal components:

par(pty="s")
plot(bird.pc$scores[,1], bird.pc$scores[,2], ylim=range(bird.pc$scores[,1]), 
     xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with IDs for birds:
text(bird.pc$scores[,1], bird.pc$scores[,2], labels=ID, cex=0.7, lwd=2)

# Interestingly, birds 1 to 21 survived a storm in RI
# while birds 22 to 49 died.
# See the survivors (red) and the deceased (blue) separated by color.

par(pty="s")
plot(bird.pc$scores[,1], bird.pc$scores[,2], ylim=range(bird.pc$scores[,1]), 
     xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
# labeling points with IDs for birds:
text(bird.pc$scores[,1], bird.pc$scores[,2], labels=ID, cex=0.7, lwd=2,
     col=c(rep("red", times = 21), rep("blue", times=28)) )

# The medium-sized birds seem to have had the best survival chances!

# The biplot can add information about the variables to the plot of the first two PC scores:

biplot(bird.pc,xlabs=ID)

# Expanding the plotting window a bit:
# Don't run this:
#biplot(bird.pc,xlabs=ID, xlim=c(-0.4,0.4), ylim=c(-0.4,0.4))

############################################
############################################

# Inference about Principal Components:

# CIs for eigenvalue(s) of Sigma:

# (Note these eigenvalues will be different from the eigenvalues in the PCA done above,
# which was based on the *correlation* matrix R, not on S.)

# Copy this function into R first:

evalue.CI <- function(datamatrix, labelvec, m=length(labelvec), conf.level=0.95){
alpha<- 1-conf.level
n<-nrow(datamatrix)
z<-qnorm(alpha/(2*m),lower=F)
lambdas<- princomp(datamatrix)$sdev^2
print(lambdas)
LCL<-lambdas[labelvec]/(1+z*sqrt(2/n))
UCL<-lambdas[labelvec]/(1-z*sqrt(2/n))
CIs<-cbind(LCL,UCL)
return(CIs)
}


# 95% CI for variance of the first population PC:

evalue.CI(bumpbird[,-1],1)
##      Comp.1      Comp.2      Comp.3      Comp.4      Comp.5 
## 34.60482313  4.52812344  0.61804191  0.30640029  0.07593901
##             LCL      UCL
## Comp.1 24.78904 57.29015
# Joint 95% CIs for variances of the first two population PCs:

evalue.CI(bumpbird[,-1],c(1,2))
##      Comp.1      Comp.2      Comp.3      Comp.4      Comp.5 
## 34.60482313  4.52812344  0.61804191  0.30640029  0.07593901
##              LCL       UCL
## Comp.1 23.818879 63.243476
## Comp.2  3.116757  8.275559
# Note the first interval is wider because the FAMILYWISE confidence coefficient is 95% here.

# 99% CI for variance of the first population PC:

evalue.CI(bumpbird[,-1],1, conf.level=0.99)
##      Comp.1      Comp.2      Comp.3      Comp.4      Comp.5 
## 34.60482313  4.52812344  0.61804191  0.30640029  0.07593901
##            LCL      UCL
## Comp.1 22.7604 72.15292