This is a Principal Component Analysis and Partial Least Squares analysis, drawing heavily from mixOmics’ Case Studies:
In the liver toxicity case study, 64 observations of 3116 gene expression values were used in an effort to discover which genes accounted for the variance seen in clinical variables, and could thus be used to predict the values of those clinical variables.
Unfortunately, I couldn’t find another data set from either Scientific Data or Bioconductor that had genomic and associated phenotypic data, so we will use the liver.toxicity
data set that comes with mixOmics
.
if (!require("mixOmics")) install.packages("mixOmics")
#Note: rgl installation failed-- installed XQuartz for X11
#https://www.xquartz.org/
rmse()
will accept the actual
and predicted
numeric values, and will return the root mean squared error.
rmse <- function(actual, predicted){
return(sqrt(mean((actual-predicted)^2)))
}
metrics
will iterate over the variables in the predicted data frame to generate root mean squared error stats for each variable.
metrics <- function(actual, predicted){
#Hold the data to return
res_frame <- data.frame()
#Create vector for holding values
res_vec <- vector()
for (i in 1:ncol(actual)){
res_vec[length(res_vec)+1] <- rmse(actual[,i], predicted[,i])
}
#Bind
res_frame <- rbind(res_frame, res_vec)
colnames(res_frame) <- colnames(actual)
return(res_frame)
}
data("liver.toxicity")
#Structure of liver.toxicity, but not gene because it's >3k variables
str(liver.toxicity$clinic)
## 'data.frame': 64 obs. of 10 variables:
## $ BUN.mg.dL. : num 12 15 17 19 20 17 16 14 12 11 ...
## $ Creat.mg.dL. : num 0.6 0.7 0.8 0.7 0.7 0.7 0.7 0.6 0.7 0.7 ...
## $ TP.g.dL. : num 7.3 7.5 7.6 7.5 7.8 7.7 7.6 7.4 7.5 7.3 ...
## $ ALB.g.dL. : num 4.9 5 5 5 5.1 5.1 5.1 4.9 5.1 4.9 ...
## $ ALT.IU.L. : int 73 84 54 369 59 57 67 57 54 97 ...
## $ SDH.IU.L. : int 23 23 14 81 16 10 21 19 20 26 ...
## $ AST.IU.L. : int 87 110 129 450 75 80 93 90 76 135 ...
## $ ALP.IU.L. : int 320 325 381 401 359 335 324 309 317 251 ...
## $ TBA.umol.L. : int 9 8 4 13 3 13 8 11 6 7 ...
## $ Cholesterol.mg.dL.: int 85 87 84 82 85 88 82 92 86 77 ...
str(liver.toxicity$treatment)
## 'data.frame': 64 obs. of 4 variables:
## $ Animal.Number : int 202 203 204 206 208 209 210 211 212 213 ...
## $ Treatment.Group: Factor w/ 16 levels "1500MG/K18 HR",..: 16 16 13 13 14 14 15 15 15 16 ...
## $ Dose.Group : num 50 50 50 50 50 50 50 50 50 50 ...
## $ Time.Group : int 6 6 18 18 24 24 48 48 48 6 ...
In this instance, we will be using genomic data to predict clinical phenotypes, so our predictors, x
will be analyzed alongside our clinical outcomes, y
.
x <- liver.toxicity$gene
y <- liver.toxicity$clinic
Because x
contains more than 3,000 variables, I am only interested in ensuring that the data are complete. Otherwise, I will summarize y
.
all(complete.cases(x))
## [1] TRUE
summary(y)
## BUN.mg.dL. Creat.mg.dL. TP.g.dL. ALB.g.dL.
## Min. :11.00 Min. :0.6000 Min. :6.300 Min. :4.400
## 1st Qu.:14.00 1st Qu.:0.6750 1st Qu.:7.300 1st Qu.:4.900
## Median :16.50 Median :0.7000 Median :7.500 Median :5.000
## Mean :17.04 Mean :0.6877 Mean :7.475 Mean :5.014
## 3rd Qu.:20.00 3rd Qu.:0.7000 3rd Qu.:7.725 3rd Qu.:5.200
## Max. :28.00 Max. :0.8000 Max. :8.400 Max. :5.600
## ALT.IU.L. SDH.IU.L. AST.IU.L. ALP.IU.L.
## Min. : 48.0 Min. : 2.0 Min. : 75.0 Min. :141.0
## 1st Qu.: 63.0 1st Qu.: 15.0 1st Qu.: 89.0 1st Qu.:303.0
## Median : 94.5 Median : 22.5 Median : 151.5 Median :332.5
## Mean : 1677.8 Mean : 79.8 Mean : 3375.3 Mean :338.2
## 3rd Qu.: 1137.0 3rd Qu.: 61.0 3rd Qu.: 1502.8 3rd Qu.:380.2
## Max. :15180.0 Max. :995.0 Max. :27075.0 Max. :500.0
## TBA.umol.L. Cholesterol.mg.dL.
## Min. : 3.00 Min. : 42.00
## 1st Qu.: 5.00 1st Qu.: 78.00
## Median : 9.00 Median : 84.50
## Mean :20.27 Mean : 82.62
## 3rd Qu.:26.00 3rd Qu.: 91.25
## Max. :98.00 Max. :111.00
We will run a PCA on both the predictors, the genes, and outcomes, the clinical values. The data will be normalized using centered
and scaled
. For the number of components, we’ll use 10.
x_pca <- pca(x, ncomp = 10, center = TRUE, scale = TRUE)
x_pca$explained_variance
## PC1 PC2 PC3 PC4 PC5 PC6
## 0.28073776 0.14862034 0.07963801 0.05377996 0.04555745 0.03925671
## PC7 PC8 PC9 PC10
## 0.03490533 0.02495056 0.02380736 0.01811799
y_pca <- pca(y, ncomp = 10, center = TRUE, scale = TRUE)
y_pca$explained_variance
## PC1 PC2 PC3 PC4 PC5
## 0.4083260500 0.2219951947 0.1247262301 0.0894716350 0.0675342967
## PC6 PC7 PC8 PC9 PC10
## 0.0483596842 0.0261923034 0.0089045216 0.0036440620 0.0008460224
In the case of the predictors, the majority of variance is explained by the first three principal components, while the variance in outcomes is explained by its first two principal components.
plot(x_pca)
plotVar(x_pca, comp = c(1, 2), var.names = TRUE, title = "Genomic Values, PCA comp 1 - 2")
plot(y_pca)
plotVar(y_pca, comp = c(1, 2), var.names = TRUE, title = "Clincal Values, PCA comp 1- 2")
plotIndiv(x_pca, comp = c(1, 2), group = liver.toxicity$treatment$Time.Group, ind.names = liver.toxicity$treatment$Dose.Group,
legend = TRUE, title = "Genomic Values, PCA comp 1 - 2")
plotIndiv(y_pca, comp = c(1, 2), group = liver.toxicity$treatment$Time.Group, ind.names = liver.toxicity$treatment$Dose.Group,
legend = TRUE, title = "Clinical Values, PCA comp 1 - 2")
Initially, we will use ten components.
set.seed(1337)
clin_pls <- pls(x, y, ncomp = 10, mode = "regression")
set.seed(1337)
clin_spls <- spls(x, y, ncomp = 10, keepX = c(10,10,10), keepY= c(10,10,10), mode = "regression")
## Note: Results are in terms of unit variance-- the further away from the center, the stronger the association
#plotVar(clin_pls, overlap = F)
#plotVar(clin_spls, overlap = F)
plotIndiv(clin_pls, group = liver.toxicity$treatment$Time.Group, ind.names = liver.toxicity$treatment$Dose.Group, legend = T, ellipse = T)
plotIndiv(clin_spls, group = liver.toxicity$treatment$Time.Group, ind.names = liver.toxicity$treatment$Dose.Group, legend = T, ellipse = T)
Note: CIMs are not rendering to markdown.
##Flush graphics device
#graphics.off()
#par(mar=c(1,1,1,1))
#cim(clin_pls, comp = 1:3, xlab = "clinic", ylab = "genes", margins = c(10, 10), title = "Clinical PLS Clustered Image Map")
#cim(clin_spls, comp = 1:3, xlab = "clinic", ylab = "genes", margins = c(10, 10), title = "Clinical sPLS Clustered Image Map")
perf()
will be used to compute evaluation critera for PLS and sPLS models. For cross-validation, Mfold
will be used rather than leave-one-out
. nrepeat
will be set to 50. We will set the progressBar
to false, because the calculations aren’t large enough to warrant its use.
tune_pls <- perf(clin_pls, validation = "Mfold", folds = 10, nrepeat = 50, progressBar = FALSE)
tune_spls <- perf(clin_spls, validation = "Mfold", folds = 10, nrepeat = 50, progressBar = FALSE)
plot(tune_pls$Q2.total, ylim = c(-0.5, 0.5),
main = "PLS Q^2 Values by Principal Component",
xlab = "Principal Component",
ylab = "Q^2 Value")
abline(h = 0.0975)
plot(tune_spls$Q2.total, ylim = c(-0.5, 0.5),
main = "sPLS Q^2 Values by Principal Component",
xlab = "Principal Component",
ylab = "Q^2 Value")
abline(h = 0.0975)
Tenenhaus, M. (1998) suggests using a cuttoff of 0.0975 for Q2 values, where any component with values lower than the cutoff are not included in the model.
#Note: to grab component names: attr(tune_pls$Q2.total, "dimnames")[[1]]
print(tune_pls$R2)
## 1 comp 2 comp 3 comp 4 comp
## BUN.mg.dL. 0.5010501736 5.533059e-01 0.541979011 0.546431310
## Creat.mg.dL. 0.0104128500 1.130104e-06 0.007529732 0.004108325
## TP.g.dL. 0.1621763577 3.106748e-02 0.204434306 0.161673397
## ALB.g.dL. 0.0129306670 1.525449e-01 0.346642550 0.318270237
## ALT.IU.L. 0.5908381965 7.192351e-01 0.723715929 0.728600956
## SDH.IU.L. 0.0002296715 3.709830e-03 0.037301067 0.069389524
## AST.IU.L. 0.5476274469 6.864532e-01 0.691048848 0.696700542
## ALP.IU.L. 0.1070668271 9.883115e-02 0.092646188 0.095964584
## TBA.umol.L. 0.5928074189 5.625907e-01 0.704913555 0.701690184
## Cholesterol.mg.dL. 0.2052163681 3.406464e-01 0.328877337 0.497029050
## 5 comp 6 comp 7 comp 8 comp
## BUN.mg.dL. 0.557108882 0.5580134918 0.6141905547 0.6304584533
## Creat.mg.dL. 0.001535764 0.0004712176 0.0001002794 0.0002140002
## TP.g.dL. 0.191612400 0.2013152159 0.2008858918 0.2057602307
## ALB.g.dL. 0.361406289 0.3785894818 0.3481212686 0.3458423687
## ALT.IU.L. 0.825358230 0.8388844896 0.8388257015 0.8635916112
## SDH.IU.L. 0.099250623 0.1224836936 0.1287803210 0.1453694367
## AST.IU.L. 0.792456114 0.7969327361 0.7941605173 0.8314073175
## ALP.IU.L. 0.161500900 0.1643902634 0.1567516748 0.1004379349
## TBA.umol.L. 0.714657619 0.7160075439 0.7162251906 0.7350029798
## Cholesterol.mg.dL. 0.564063105 0.5470620592 0.5576590574 0.5250811013
## 9 comp 10 comp
## BUN.mg.dL. 6.228067e-01 6.168478e-01
## Creat.mg.dL. 5.733002e-05 5.094242e-05
## TP.g.dL. 2.055860e-01 1.706926e-01
## ALB.g.dL. 3.299031e-01 2.789352e-01
## ALT.IU.L. 8.821954e-01 8.898601e-01
## SDH.IU.L. 1.573624e-01 1.709252e-01
## AST.IU.L. 8.541277e-01 8.594644e-01
## ALP.IU.L. 7.807649e-02 9.614649e-02
## TBA.umol.L. 7.250511e-01 7.215680e-01
## Cholesterol.mg.dL. 5.280646e-01 5.313970e-01
print(tune_spls$R2)
## 1 comp 2 comp 3 comp 4 comp
## BUN.mg.dL. 0.5593292411 0.5709933981 5.285063e-01 0.54077595
## Creat.mg.dL. 0.0051112429 0.0003599984 6.198234e-05 0.01483872
## TP.g.dL. 0.0179072602 0.2895135065 2.827782e-01 0.29782297
## ALB.g.dL. 0.0278577698 0.4816249221 5.101652e-01 0.53227983
## ALT.IU.L. 0.7389432249 0.7392490835 8.418223e-01 0.86025610
## SDH.IU.L. 0.0001294534 0.1096043018 1.392788e-01 0.16116359
## AST.IU.L. 0.7321526166 0.7332178099 8.375168e-01 0.85062574
## ALP.IU.L. 0.1117629794 0.0947503579 4.393165e-02 0.04154122
## TBA.umol.L. 0.6277592495 0.7586689914 7.714992e-01 0.77279800
## Cholesterol.mg.dL. 0.3830545500 0.4256502709 4.115514e-01 0.40650831
## 5 comp 6 comp 7 comp 8 comp
## BUN.mg.dL. 0.53073977 0.516002304 0.509055888 0.523225678
## Creat.mg.dL. 0.01312890 0.004191419 0.001185317 0.002630369
## TP.g.dL. 0.29004159 0.342261629 0.352651636 0.377256264
## ALB.g.dL. 0.51746630 0.543615439 0.544735274 0.545344821
## ALT.IU.L. 0.86573392 0.890115087 0.890111439 0.888047145
## SDH.IU.L. 0.16811515 0.106682337 0.102531987 0.106143500
## AST.IU.L. 0.84926556 0.863890631 0.858909682 0.856567092
## ALP.IU.L. 0.08193683 0.091171524 0.087073328 0.026046661
## TBA.umol.L. 0.76367316 0.772354134 0.760692161 0.755793525
## Cholesterol.mg.dL. 0.58128935 0.589427393 0.602389991 0.566424653
## 9 comp 10 comp
## BUN.mg.dL. 0.5307574921 5.349843e-01
## Creat.mg.dL. 0.0002127933 6.845561e-06
## TP.g.dL. 0.3937182065 3.714473e-01
## ALB.g.dL. 0.5557123920 5.332098e-01
## ALT.IU.L. 0.8929678692 8.984436e-01
## SDH.IU.L. 0.1106091464 1.361759e-01
## AST.IU.L. 0.8686789112 8.740647e-01
## ALP.IU.L. 0.0297329711 2.884399e-02
## TBA.umol.L. 0.7500899307 7.504276e-01
## Cholesterol.mg.dL. 0.5628222678 5.660352e-01
### PLS
#I only want the predictions, and in data frame format
pls_pred <- as.data.frame(predict(clin_pls, x)$predict)
pls_pred_one <- pls_pred[, grepl("dim 1", colnames(pls_pred))]
pls_pred_two <- pls_pred[, grepl("dim 2", colnames(pls_pred))]
### sPLS
spls_pred <- as.data.frame(predict(clin_spls, x)$predict)
spls_pred_one <- spls_pred[, grepl("dim 1", colnames(spls_pred))]
spls_pred_two <- spls_pred[, grepl("dim 2", colnames(spls_pred))]
performance_10comp <- rbind(metrics(y, pls_pred_one),
metrics(y, pls_pred_two),
metrics(y, spls_pred_one),
metrics(y, spls_pred_two))
row.names(performance_10comp) <- c("PLS Comp One",
"PLS Comp Two",
"sPLS Comp One",
"sPLS Comp Two")
For both models, we will keep only two components.
clin_pls <- pls(x, y, ncomp = 2, mode = "regression")
clin_spls <- spls(x, y, ncomp = 2, keepX = c(10,10,10), keepY= c(10,10,10), mode = "regression")
plotIndiv(clin_pls, group = liver.toxicity$treatment$Time.Group, ind.names = liver.toxicity$treatment$Dose.Group, legend = T, ellipse = T)
plotIndiv(clin_spls, group = liver.toxicity$treatment$Time.Group, ind.names = liver.toxicity$treatment$Dose.Group, legend = T, ellipse = T)
tune_pls <- perf(clin_pls, validation = "Mfold", folds = 10, nrepeat = 50, progressBar = FALSE)
tune_spls <- perf(clin_spls, validation = "Mfold", folds = 10, nrepeat = 50, progressBar = FALSE)
tune_pls$R2
## 1 comp 2 comp
## BUN.mg.dL. 0.496389004 5.393365e-01
## Creat.mg.dL. 0.013822791 4.862696e-05
## TP.g.dL. 0.143952350 1.348909e-02
## ALB.g.dL. 0.008143928 1.025431e-01
## ALT.IU.L. 0.608783895 7.225729e-01
## SDH.IU.L. 0.011599466 2.135409e-03
## AST.IU.L. 0.570868463 6.972753e-01
## ALP.IU.L. 0.118927380 1.083447e-01
## TBA.umol.L. 0.562102039 5.171175e-01
## Cholesterol.mg.dL. 0.210938921 3.564370e-01
tune_spls$R2
## 1 comp 2 comp
## BUN.mg.dL. 0.506889742 0.514231307
## Creat.mg.dL. 0.006593085 0.002243226
## TP.g.dL. 0.144963026 0.253371077
## ALB.g.dL. 0.002583555 0.418627305
## ALT.IU.L. 0.764310951 0.758689163
## SDH.IU.L. 0.001060766 0.120233397
## AST.IU.L. 0.752117821 0.750245885
## ALP.IU.L. 0.119551252 0.114109905
## TBA.umol.L. 0.613836204 0.749953144
## Cholesterol.mg.dL. 0.357968911 0.408439470
plotVar(clin_pls, overlap = F)
plotVar(clin_spls, overlap = F)
At this point, the sparse PLS model appears to be outperforming the the PLS model in terms of its coefficients of determination.
### PLS
#I only want the predictions, and in data frame format
pls_pred <- as.data.frame(predict(clin_pls, x)$predict)
pls_pred_one <- pls_pred[, grepl("dim 1", colnames(pls_pred))]
pls_pred_two <- pls_pred[, grepl("dim 2", colnames(pls_pred))]
### sPLS
spls_pred <- as.data.frame(predict(clin_spls, x)$predict)
spls_pred_one <- pls_pred[, grepl("dim 1", colnames(spls_pred))]
spls_pred_two <- pls_pred[, grepl("dim 2", colnames(spls_pred))]
performance_2comp <- rbind(metrics(y, pls_pred_one),
metrics(y, pls_pred_two),
metrics(y, spls_pred_one),
metrics(y, spls_pred_two))
row.names(performance_2comp) <- c("PLS Comp One",
"PLS Comp Two",
"sPLS Comp One",
"sPLS Comp Two")
print(performance_10comp)
## BUN.mg.dL. Creat.mg.dL. TP.g.dL. ALB.g.dL. ALT.IU.L.
## PLS Comp One 2.880435 0.05771321 0.3870113 0.2545776 1924.443
## PLS Comp Two 2.730955 0.05759611 0.3588442 0.2188154 1551.441
## sPLS Comp One 2.779389 0.05864911 0.3858104 0.2507651 1487.405
## sPLS Comp Two 2.737753 0.05824218 0.3089602 0.1778778 1478.146
## SDH.IU.L. AST.IU.L. ALP.IU.L. TBA.umol.L. Cholesterol.mg.dL.
## PLS Comp One 155.9711 4309.313 53.73584 14.44731 11.894906
## PLS Comp Two 153.6316 3492.627 53.65465 14.43709 10.438546
## sPLS Comp One 156.8447 3266.191 54.21545 13.51068 10.488472
## sPLS Comp Two 141.3324 3243.873 53.88277 10.25237 9.919381
print(performance_2comp)
## BUN.mg.dL. Creat.mg.dL. TP.g.dL. ALB.g.dL. ALT.IU.L.
## PLS Comp One 2.880435 0.05771321 0.3870113 0.2545776 1924.443
## PLS Comp Two 2.730955 0.05759611 0.3588442 0.2188154 1551.441
## sPLS Comp One 2.880435 0.05771321 0.3870113 0.2545776 1924.443
## sPLS Comp Two 2.730955 0.05759611 0.3588442 0.2188154 1551.441
## SDH.IU.L. AST.IU.L. ALP.IU.L. TBA.umol.L. Cholesterol.mg.dL.
## PLS Comp One 155.9711 4309.313 53.73584 14.44731 11.89491
## PLS Comp Two 153.6316 3492.627 53.65465 14.43709 10.43855
## sPLS Comp One 155.9711 4309.313 53.73584 14.44731 11.89491
## sPLS Comp Two 153.6316 3492.627 53.65465 14.43709 10.43855
Note: when components were reduced to 2, both sPLS and PLS had identical performance.