library(ChemometricsWithRData)
## Warning: package 'ChemometricsWithRData' was built under R version 3.2.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.2.3
library(car)
library(maptools)
## Warning: package 'maptools' was built under R version 3.2.5
## Warning: package 'sp' was built under R version 3.2.3
library(rgeos)
## Warning: package 'rgeos' was built under R version 3.2.5
# if these pacakges are not installed in your library, then use the following code to install the packages
#if (length(setdiff(c("ChemometricsWithRData", "plyr", "car","maptools","rgeos"), rownames(installed.packages()))) > 0) {
# install.packages(setdiff(c("ChemometricsWithRData", "plyr", "car","maptools","rgeos"), rownames(installed.packages())))
#}
We will use the wines data set from the ChemometricsWithRData package
# load data
data(wines)
# Data
vino <- wines
# Wine classes
vint <- vintages
# Look at data
head(vino)
## alcohol malic acid ash ash alkalinity magnesium tot. phenols
## [1,] 13.20 1.78 2.14 11.2 100 2.65
## [2,] 13.16 2.36 2.67 18.6 101 2.80
## [3,] 14.37 1.95 2.50 16.8 113 3.85
## [4,] 13.24 2.59 2.87 21.0 118 2.80
## [5,] 14.20 1.76 2.45 15.2 112 3.27
## [6,] 14.39 1.87 2.45 14.6 96 2.50
## flavonoids non-flav. phenols proanth col. int. col. hue OD ratio
## [1,] 2.76 0.26 1.28 4.38 1.05 3.40
## [2,] 3.24 0.30 2.81 5.68 1.03 3.17
## [3,] 3.49 0.24 2.18 7.80 0.86 3.45
## [4,] 2.69 0.39 1.82 4.32 1.04 2.93
## [5,] 3.39 0.34 1.97 6.75 1.05 2.85
## [6,] 2.52 0.30 1.98 5.25 1.02 3.58
## proline
## [1,] 1050
## [2,] 1185
## [3,] 1480
## [4,] 735
## [5,] 1450
## [6,] 1290
# log transform, pseudo scale - not entirely necessary
vino_log <- log(vino)
# Auto scale
vino_log_scale <- scale(vino_log)
head(vino_log_scale)
## alcohol malic acid ash ash alkalinity magnesium
## [1,] 0.2828779 -0.3758597 -0.7755685 -3.0780917 0.09983581
## [2,] 0.2343540 0.2491415 1.0609081 -0.1889435 0.17316526
## [3,] 1.6407254 -0.1737308 0.5148955 -0.7686706 1.00052461
## [4,] 0.3312550 0.4552154 1.6604185 0.5022955 1.31960253
## [5,] 1.4504491 -0.4008988 0.3472210 -1.3387193 0.93501708
## [6,] 1.6629627 -0.2665585 0.3472210 -1.5681086 -0.20100386
## tot. phenols flavonoids non-flav. phenols proanth col. int.
## [1,] 0.6406245 0.7593406 -0.763312956 -0.3678704 -0.08626281
## [2,] 0.8316405 1.0170422 -0.360413881 1.6333298 0.47725234
## [3,] 1.9364357 1.1365023 -0.988672444 0.9872556 1.16493909
## [4,] 0.8316405 0.7180526 0.378270229 0.5279117 -0.11616920
## [5,] 1.3699658 1.0897783 -0.008018233 0.7294686 0.85146111
## [6,] 0.4384752 0.6131314 -0.360413881 0.7423547 0.30656645
## col. hue OD ratio proline
## [1,] 0.4864741 1.0260875 1.0332086
## [2,] 0.4106187 0.7930185 1.3246951
## [3,] -0.3008711 1.0746646 1.8604193
## [4,] 0.4487288 0.5310494 0.1736495
## [5,] 0.4864741 0.4389335 1.8110676
## [6,] 0.3721369 1.1977431 1.5292962
vino_PCA <- prcomp(vino_log_scale, center=FALSE)
summary(vino_PCA)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.1354 1.6540 1.2209 0.94009 0.93071 0.77096
## Proportion of Variance 0.3508 0.2104 0.1147 0.06798 0.06663 0.04572
## Cumulative Proportion 0.3508 0.5612 0.6759 0.74384 0.81047 0.85619
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.70445 0.59697 0.55542 0.48845 0.44637 0.38858
## Proportion of Variance 0.03817 0.02741 0.02373 0.01835 0.01533 0.01161
## Cumulative Proportion 0.89437 0.92178 0.94551 0.96386 0.97919 0.99080
## PC13
## Standard deviation 0.3458
## Proportion of Variance 0.0092
## Cumulative Proportion 1.0000
PCAcolors <- c("#66c2a5","#fc8d62","#8da0cb")[as.integer(vint)]
PCAscores <- vino_PCA$x
PCAloadings <- vino_PCA$rotation
par(mfrow=c(2,1))
plot(PCAscores[,1:2], # x and y data
pch=21, # point shape
col=PCAcolors, # point border color
bg=PCAcolors, # point color
cex=1.5, # point size
main="Scores" # title of plot
)
legend("topright", # position of legend
legend=levels(vint), # legend display
pch=21, # point shape
pt.bg=c("#66c2a5","#fc8d62","#8da0cb"), # point colors
pt.cex=1.5, # point size
col = c("#66c2a5","#fc8d62","#8da0cb") # point border color
)
plot(PCAloadings[,1:2], # x and y data
pch=21, # point shape
bg="black", # point color
cex=1, # point size
main="Loadings" # title of plot
)
text(PCAloadings[,1:2], # sets position of labels
labels=rownames(PCAloadings) # print labels
)
We see that the text is on top of the points, which makes the labels a little unreadable. We can fix this using the maptools package. Also, we can add 95% confidence ellipses to the groups in the score plot and clean up the plots using a custom function.
## Cut and paste into console
## Customize yaxis range to makes sure axis ticks cover data
## Axes ticks do not always cover data range in R plots - reviewer did not like!
plotat <- function(RANGE) {
if(length(RANGE) != 2) stop("RANGE argument must have a length of 2")
if(RANGE[1] > RANGE[2]) stop("First element in RANGE must be smaller then second element")
prettyres <- pretty(sprintf("%.2f",RANGE[1]):sprintf("%.2f",RANGE[2]), 7)
while((min(prettyres) < RANGE[1]) == FALSE) {
prdiff <- prettyres[2] - prettyres[1]
prettyres[length(prettyres) + 1] <- prettyres[1] - prdiff
prettyres <- sort(prettyres)
}
while((max(prettyres) > RANGE[2]) == FALSE) {
prdiff <- prettyres[2] - prettyres[1]
prettyres[length(prettyres) + 1] <- prettyres[length(prettyres)] + prdiff
prettyres <- sort(prettyres)
}
plotticks <- as.numeric(sprintf("%.2f",prettyres))
plotticks
}
## ellipseplot function
ellipseplot <- function(x, y, factr,
elev=0.95, # Ellipse probability level
legpos=c("topright","topleft","bottomleft","bottomleft"), # Legend position
pcol=NULL, # manual addition of colors, must meet length of factors
cexsize=1, # point size
ppch=21, # Point type, must meet length of factors
legcexsize=2, # legend font size
legptsize=2, # legend point size
pbgcol=TRUE,
axissize=1,
linewidth=1,
font=1) {
require(plyr)
require(car)
## Set factor levels
if(is.factor(factr)) {
f <- factr
} else {
f <- factor(factr, levels=unique(as.character(factr)))
}
intfactr <- as.integer(f) # Set integer vector that matches factor levels
# Checking to make sure length of ppch equals number of factor levels
if((length(ppch) > 1 & length(unique(intfactr)) != length(ppch))) stop("Can only increase point shape if equal to factor levels")
## Get data for ellipses
edf <- data.frame(LV1 = x, LV2=y, factr = f) # create data frame with data and factor
ellipses <- dlply(edf, .(factr), function(x) {
LV1 <- x[,1]
LV2 <- x[,2]
dataEllipse(LV1, LV2, levels=elev, robust=TRUE, draw=FALSE) # Get confidence ellipse points from dataEllipse() function by factor level
})
## Get range of x and y data
xrange <- plotat(range(c(as.vector(sapply(ellipses, function(x) x[,1])), min(x), max(x))))
yrange <- plotat(range(c(as.vector(sapply(ellipses, function(x) x[,2])), min(y), max(y))))
## Set colors for plots
if(is.null(pcol) != TRUE) { # If colors are supplied by user
ptcol <- pcol
pgcol <- paste(pcol, "7e", sep="") # adds opaqueness
} else { # Default
pgcol <- c("#e41a1c7e","#377eb87e","#4daf4a7e","#984ea37e","#807f7d7e") # Defaults at 5 colors
ptcol <- c("#e41a1c","#377eb8","#4daf4a","#984ea3","#807f7d") # For opaqueness
}
# Plotting graphic
plot(x,y, type="n", xlab="", ylab="", main="", xlim=range(xrange), ylim=range(yrange), axes=FALSE)
axis(1, at=xrange, labels=xrange, cex.axis=axissize,lwd=linewidth, font=font)
axis(2, las=2, cex.axis=axissize,lwd=linewidth, font=font)
box(lwd=linewidth, font=font)
abline(h=0, v=0, col="gray", lty=2) # Adds lines at 0
legpch <- c() # vector to collect legend pch data
legcol <- c() # vector to collect legend col data
## Not sure why I split this up, might have been an artifact of an older version.
## Adds points, ellipse, and determines color specifications for legend
if(pbgcol==TRUE) {
for(i in 1:length(unique(intfactr))){
points(x[intfactr==i], y[intfactr==i], pch=ppch[i], col=ptcol[i], bg=ptcol[i],cex=cexsize)
polygon(ellipses[[i]], col=pgcol[i], border=ptcol[i])
legpch[i] <- ppch[i]
legcol[i] <- ptcol[i]
}
} else {
for(i in 1:length(unique(intfactr))){
points(x[intfactr==i], y[intfactr==i], pch=ppch[i], col="black", bg=ptcol[i],cex=cexsize)
polygon(ellipses[[i]], col=pgcol[i], border=ptcol[i])
legpch[i] <- ppch[i]
legcol[i] <- ptcol[i]
}
}
## Legend
legend(x=legpos, legend=levels(f), pch=legpch,
pt.bg=legcol, col=legcol, bty="n", border=FALSE, pt.cex=legptsize, cex=legcexsize)
}
## Axis legends for PCA output using prcomp() function
PCAvarAxis <- function(PCA, decimal=1) {
pcavar <- round((PCA$sdev^2)/sum((PCA$sdev^2)),3)*100 #Calculate % variance explained
PC1var <- paste("Principal Component 1 (", pcavar[1], "%)", sep="")
PC2var <- paste("Principal Component 2 (", pcavar[2], "%)", sep="")
PC3var <- paste("Principal Component 3 (", pcavar[3], "%)", sep="")
PC4var <- paste("Principal Component 4 (", pcavar[4], "%)", sep="")
PC5var <- paste("Principal Component 5 (", pcavar[5], "%)", sep="")
return(list(PC1=PC1var, PC2=PC2var, PC3=PC3var, PC4=PC4var, PC5=PC5var))
}
## Capture % variance explained from PCA
explainPCAvar <- PCAvarAxis(vino_PCA)
par(mfrow=c(2,1))
ellipseplot(PCAscores[,1], # data for x-axis
PCAscores[,2], # data for y-axis
vint, # factor with classes
pcol=c("#66c2a5","#fc8d62","#8da0cb"), # colors for plotting (must match # of factors)
pbgcol=FALSE, # point borders black?
cexsize=1.5, # size of points
ppch=c(21:23), # shape of points (must match # of factors)
legpos="bottomright", # position of legend
legcexsize=1.5, # legend text size
legptsize=1.5, # legend point size
axissize=1.5, # Set axis text size
linewidth=1.5 # Set axis line size
)
title(xlab=explainPCAvar[["PC1"]], # % variance explained on PC1
ylab=explainPCAvar[["PC2"]], # % variance explained on PC2
main="Scores", # Title
cex.lab=1.5, # size of label text
cex.main=1.5 # size of title text
)
plot(PCAloadings[,1:2], # x and y data
pch=21, # point shape
bg="black", # point color
cex=1.5, # point size
# type="n", # does not plot points
axes=FALSE, # does not print axes
xlab="", # removes x label
ylab="" # removes y label
)
pointLabel(PCAloadings[,1:2], # set position of labels
labels=rownames(PCAloadings), # print labels
cex=1.5 # set size of label
) # pointLabel will try to position the text around the points
axis(1, # display x-axis
cex.axis=1.5, # set size of text
lwd=1.5 # set size of axis line
)
axis(2, # display y-axis
las=2, # argument sets direction of text, 2 is perpendicular
cex.axis=1.5, # set size of text
lwd=1.5 # set size of axis line
)
box(lwd=1.5 # line width of box surrounding plot
)
title(xlab=explainPCAvar[["PC1"]], # % variance explained on PC1
ylab=explainPCAvar[["PC2"]], # % variance explained on PC2
main="Loading", # Title
cex.lab=1.5, # size of label text
cex.main=1.5 # size of title text
)
Author: Brian Piccolo
Highly recommend Chemometrics with R Multivariate Data Analysis in the Natural Sciences and Life Sciences by Ron Wehrens for those interested in PCA and Multivariate Analysis.
Created with knitr & RMarkdown and knitrBootstrap.