library(AppliedPredictiveModeling)
library(caret)
library(caTools)
library(corrplot)
library(tidyverse)
library(pls)
library(arm)   # loads MASS, which masks dplyr::select(); hence the explicit dplyr:: calls below
Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug.
data(permeability)
ncol(fingerprints)
## [1] 1107
# Flag and drop near-zero-variance fingerprint columns
nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
num_predictors_left <- ncol(fingerprints_filtered)
num_predictors_left
## [1] 388
We started with 1107 predictors, but after applying nearZeroVar, we reduced this number to 388.
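For reference, nearZeroVar() flags columns whose most common value dominates overwhelmingly (by default, a frequency ratio above 95/5 coupled with few distinct values), which is common with sparse binary fingerprints. The per-column diagnostics behind those flags can be inspected with saveMetrics (a quick sketch):
# Sketch: inspect the frequency-ratio and unique-value diagnostics that
# nearZeroVar() uses to flag degenerate fingerprint columns
nzv_metrics <- nearZeroVar(fingerprints, saveMetrics = TRUE)
head(nzv_metrics[order(-nzv_metrics$freqRatio), ])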
set.seed(334)
index <- createDataPartition(permeability, p = .8, list = FALSE)
# Split variables
train_fp <- fingerprints_filtered[index, ]
test_fp <- fingerprints_filtered[-index, ]
train_perm <- permeability[index]
test_perm <- permeability[-index]
# Cross Validation using 10 fold
ctrl <- trainControl(method = "cv", number = 10)
plsmodel <- train(train_fp, train_perm, method = "pls",
                  metric = "Rsquared", tuneLength = 20, trControl = ctrl,
                  preProc = c("center", "scale"))
plot(plsmodel)
plsmodel
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 121, 120, 121, 120, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.99280 0.3070361 10.009069
## 2 12.16128 0.4339740 8.759500
## 3 11.91646 0.4313142 9.091931
## 4 11.71985 0.4403444 8.995034
## 5 11.52004 0.4849718 8.545651
## 6 11.22189 0.5120605 8.435937
## 7 11.12635 0.5352629 8.365518
## 8 11.15961 0.5243175 8.378692
## 9 11.33492 0.5060529 8.610821
## 10 11.29378 0.5167202 8.546452
## 11 11.25400 0.5231956 8.490934
## 12 11.29238 0.5252548 8.633259
## 13 11.36673 0.5138357 8.731267
## 14 11.42145 0.5074234 8.706800
## 15 11.67288 0.5035852 8.991768
## 16 12.14120 0.4803830 9.278428
## 17 12.49861 0.4605289 9.580962
## 18 12.77110 0.4494579 9.787425
## 19 12.81140 0.4481352 9.894780
## 20 13.24251 0.4282701 10.141927
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 7.
Both the plot and the resampling table point to 7 components as optimal, with a cross-validated R-squared of 0.535.
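As a quick programmatic check, the chosen tuning value and its resampled performance can be read off the fitted object:
# Confirm the selected number of components and its resampled performance
plsmodel$bestTune
subset(plsmodel$results, ncomp == plsmodel$bestTune$ncomp)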
# Prediction of value
predictions <- predict(plsmodel, test_fp)
# Combination of observation + prediction
pls1 <- data.frame(obs = test_perm, pred = predictions)
# calculation of prediction accuracy
defaultSummary(pls1)
## RMSE Rsquared MAE
## 12.0828371 0.4807966 9.5900950
The test-set R-squared is 0.481, somewhat below the cross-validated estimate of 0.535; a modest drop of this kind is common, since resampling estimates tend to be slightly optimistic.
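As a quick diagnostic (a sketch reusing the pls1 data frame built above), plotting observed against predicted permeability shows how the test-set predictions scatter around the identity line:
# Diagnostic: observed vs. predicted permeability on the test set
plot(pls1$obs, pls1$pred,
     xlab = "Observed permeability", ylab = "Predicted permeability")
abline(0, 1, lty = 2)  # points on the dashed line would be perfect predictions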
# LARS Model
set.seed(17)
larsmodel <- train(train_fp, train_perm, method = "lars",
                   metric = "Rsquared", tuneLength = 20, trControl = ctrl,
                   preProc = c("center", "scale"))
plot(larsmodel)
larsmodel
## Least Angle Regression
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 119, 121, 120, 119, 119, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.05 11.89877 0.4596729 8.650788
## 0.10 11.57994 0.4676499 8.659622
## 0.15 11.49042 0.4660516 8.684607
## 0.20 11.79188 0.4556552 8.757311
## 0.25 12.00700 0.4480268 8.894531
## 0.30 12.20094 0.4307403 9.035129
## 0.35 12.39225 0.4231629 9.200676
## 0.40 12.81794 0.4049887 9.494958
## 0.45 13.36303 0.3879590 9.887695
## 0.50 13.88195 0.3741034 10.243365
## 0.55 14.29867 0.3672552 10.537459
## 0.60 14.73194 0.3560844 10.849888
## 0.65 15.12604 0.3521315 11.093740
## 0.70 15.70629 0.3482580 11.399051
## 0.75 16.32444 0.3448372 11.727394
## 0.80 16.75558 0.3440126 11.966841
## 0.85 17.32837 0.3354593 12.340370
## 0.90 18.08390 0.3248463 12.693740
## 0.95 18.86889 0.3122171 13.191606
## 1.00 19.69152 0.3021566 13.722607
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was fraction = 0.1.
set.seed(18)
# Elastic Net Model
enetGrid <- expand.grid(.lambda = c(0, 0.01, .1), .fraction = seq(.05, 1, length = 20))
enetmodel <- train(train_fp, train_perm, method = "enet",
                   tuneGrid = enetGrid, trControl = ctrl,
                   preProc = c("center", "scale"))
## Warning: model fit failed for Fold04: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
The failure occurs at the lambda = 0, fraction = 1 corner of the grid, which corresponds to an unpenalized lasso fit; with 388 predictors and roughly 120 training samples per fold, that solution is ill-defined, and caret simply drops the affected resample.
plot(enetmodel)
enetmodel
## Elasticnet
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 121, 119, 118, 120, 120, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.05 12.26077 0.4302238 9.068764
## 0.00 0.10 12.06994 0.4461138 8.710435
## 0.00 0.15 12.33234 0.4336162 9.037856
## 0.00 0.20 12.99665 0.4173619 9.596072
## 0.00 0.25 13.58559 0.4062614 10.104331
## 0.00 0.30 14.12676 0.4000653 10.474480
## 0.00 0.35 14.62540 0.3978212 10.870962
## 0.00 0.40 15.02091 0.3928948 11.235684
## 0.00 0.45 15.72952 0.3852346 11.744823
## 0.00 0.50 16.50410 0.3766445 12.313339
## 0.00 0.55 17.21365 0.3713162 12.801739
## 0.00 0.60 17.95253 0.3679022 13.261248
## 0.00 0.65 18.38448 0.3703363 13.563239
## 0.00 0.70 18.90872 0.3742150 13.855196
## 0.00 0.75 19.53553 0.3737620 14.108055
## 0.00 0.80 20.19605 0.3718859 14.397302
## 0.00 0.85 20.71126 0.3666779 14.514537
## 0.00 0.90 22.45772 0.3597702 15.061962
## 0.00 0.95 24.67305 0.3518519 15.646127
## 0.00 1.00 27.00508 0.3414503 16.498961
## 0.01 0.05 11.73387 0.4305112 8.383810
## 0.01 0.10 11.35687 0.4685121 8.118469
## 0.01 0.15 10.84106 0.4909361 7.763605
## 0.01 0.20 10.79683 0.4830988 7.779883
## 0.01 0.25 10.86258 0.4805777 7.764118
## 0.01 0.30 10.95105 0.4830138 7.798469
## 0.01 0.35 11.09094 0.4798167 7.935408
## 0.01 0.40 11.28013 0.4731556 8.093791
## 0.01 0.45 11.48214 0.4652692 8.259275
## 0.01 0.50 11.69816 0.4576207 8.454800
## 0.01 0.55 11.86757 0.4520983 8.600172
## 0.01 0.60 12.00163 0.4463469 8.706146
## 0.01 0.65 12.09158 0.4421529 8.812382
## 0.01 0.70 12.19029 0.4387677 8.946471
## 0.01 0.75 12.27026 0.4379888 9.095215
## 0.01 0.80 12.39657 0.4369456 9.264022
## 0.01 0.85 12.56291 0.4334777 9.415054
## 0.01 0.90 12.76677 0.4284080 9.586584
## 0.01 0.95 12.97475 0.4228666 9.745072
## 0.01 1.00 13.18892 0.4173654 9.898766
## 0.10 0.05 12.67053 0.3520824 9.588713
## 0.10 0.10 11.65835 0.4299693 8.303430
## 0.10 0.15 11.42173 0.4544492 8.092523
## 0.10 0.20 11.31504 0.4691684 8.070741
## 0.10 0.25 11.06593 0.4837197 7.914023
## 0.10 0.30 10.90718 0.4890784 7.819914
## 0.10 0.35 10.87891 0.4887349 7.786409
## 0.10 0.40 10.89849 0.4890789 7.779867
## 0.10 0.45 10.89071 0.4929433 7.785470
## 0.10 0.50 10.94018 0.4926868 7.838433
## 0.10 0.55 10.99183 0.4920893 7.917916
## 0.10 0.60 11.07382 0.4906018 8.015487
## 0.10 0.65 11.15629 0.4886977 8.093059
## 0.10 0.70 11.19853 0.4892970 8.145665
## 0.10 0.75 11.23936 0.4901289 8.196103
## 0.10 0.80 11.30112 0.4891838 8.268567
## 0.10 0.85 11.34713 0.4882780 8.328482
## 0.10 0.90 11.37836 0.4881447 8.367509
## 0.10 0.95 11.40881 0.4883520 8.403798
## 0.10 1.00 11.46064 0.4872150 8.456577
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.2 and lambda = 0.01.
Examining the alternative models: LARS (least-angle regression) selected a fraction of 0.10, with a cross-validated R-squared of 0.468. The elastic net, which was tuned on RMSE (caret's default metric), selected lambda = 0.01 and fraction = 0.20, with a cross-validated R-squared of 0.483. Both fall short of the PLS model's 0.535.
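Because all three models were resampled with 10-fold cross-validation, caret's resamples() can lay their fold-level results side by side (a sketch; the models were fit with different seeds, so the folds themselves are not identical across models):
# Fold-by-fold comparison of the three fitted models
summary(resamples(list(PLS = plsmodel, LARS = larsmodel, ENET = enetmodel)))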
I would not suggest using any of the models tested here as a substitute for the laboratory permeability assay. However, I would consider trying XGBoost or a support vector machine (SVM) to see whether either could reach a higher R-squared with lower RMSE and MAE, since both tend to cope better with a large number of predictors than the linear methods used above. SVMs in particular can be effective when the number of predictors exceeds the number of samples, because the kernel computation scales with the number of samples rather than the number of predictors.
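As a sketch of that follow-up, the same caret pipeline could be reused with method = "svmRadial" (this assumes the kernlab package is installed; the model was not run for this report):
# Radial-kernel SVM with the same resampling setup as the earlier models
svmmodel <- train(train_fp, train_perm, method = "svmRadial",
                  metric = "Rsquared", tuneLength = 10, trControl = ctrl,
                  preProc = c("center", "scale"))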
data(ChemicalManufacturingProcess)
# Verifying quantity of missing data
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
library(RANN)
# Impute missing data by applying KNN
impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
imputed <- predict(impute, ChemicalManufacturingProcess)
# Verify remaining quantity of missing data
sum(is.na(imputed))
## [1] 0
Using KNN imputation, we filled in all 106 missing values. I chose KNN because, in data like these, observations that are close together in predictor space tend to take similar values, which is exactly the assumption KNN imputation exploits. I opted not to drop the incomplete rows: given how many values are missing, deletion would risk discarding real relationships in the data.
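One side effect of this choice is worth flagging: caret's knnImpute works on distances, so preProcess() automatically centers and scales every column, including Yield, and the imputed data are therefore standardized. The recorded steps can be checked on the preProcess object (a quick sketch; impute$method holds the column-to-method mapping):
# The preProcess object records which steps were applied to which columns
str(impute$method)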
# Drop near-zero-variance columns, then separate predictors and response
imputed <- imputed[, -nearZeroVar(imputed)]
X <- dplyr::select(imputed, -Yield)  # dplyr:: prefix needed because MASS masks select()
y <- imputed$Yield
set.seed(22)
index <- createDataPartition(y, p = .8, list = FALSE)
# Split Variables
train_X <- X[index, ] %>% as.matrix()
test_X <- X[-index, ] %>% as.matrix()
train_y <- y[index]
test_y <- y[-index]
# Cross Validation with 10 Fold
ctrl <- trainControl(method = "cv", number = 10)
plsmodel <- train(x = train_X, y = train_y, method = "pls",
                  metric = "Rsquared", tuneLength = 20, trControl = ctrl,
                  preProc = c("center", "scale"))
plot(plsmodel)
plsmodel$results
The optimal PLS model used 3 components, with a cross-validated R-squared of 0.531.
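As a quick check against the full results table, the winning row can be pulled out directly (same plsmodel object as above):
# Best row of the resampling table by cross-validated R-squared
plsmodel$results[which.max(plsmodel$results$Rsquared),
                 c("ncomp", "RMSE", "Rsquared", "MAE")]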
# Predict the value
prediction2 <- predict(plsmodel, test_X)
# Combine the Observation and Prediction
pls2 <- data.frame(obs = test_y, pred = prediction2)
# Calculate prediction accuracy
defaultSummary(pls2)
## RMSE Rsquared MAE
## 0.7758733 0.4299477 0.5180942
I chose PLS because it performed best in question 6.2. However, the test-set R-squared (0.430) is lower than the cross-validated estimate (0.531), suggesting the resampling estimate is somewhat optimistic and the model could benefit from further tuning.
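Because knnImpute standardized every column, Yield included, the RMSE and MAE above are in standard-deviation units rather than raw yield units. A rough back-conversion (a sketch, multiplying by the standard deviation of the original Yield column):
# Express the standardized test RMSE in original Yield units
0.7758733 * sd(ChemicalManufacturingProcess$Yield, na.rm = TRUE)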
# Top 10 predictors contained in the model
plsImp <- varImp(plsmodel, scale = FALSE)
Imp2 <- data.frame(
  Variable = rownames(plsImp$importance),
  Score = plsImp$importance$Overall
)
# Sort the dataframe in descending order
sorted_Imp2 <- Imp2[order(-Imp2$Score), ]
# Extract names of the Top 10 Predictors
top10_pred <- head(sorted_Imp2$Variable, 10)
top10_pred
## [1] "ManufacturingProcess32" "ManufacturingProcess13" "ManufacturingProcess36"
## [4] "ManufacturingProcess09" "ManufacturingProcess17" "ManufacturingProcess33"
## [7] "BiologicalMaterial02" "BiologicalMaterial06" "ManufacturingProcess06"
## [10] "BiologicalMaterial08"
top10_pred_df <- dplyr::select(X, all_of(top10_pred))
top10_pred_df$Yield <- y
Among the top 10 predictors, manufacturing process variables dominate, taking 7 of the 10 spots, with biological material variables making up the remaining 3.
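caret can also draw this ranking straight from the importance object computed above (plot() has a method for varImp results):
# Dotplot of the ten most important predictors
plot(plsImp, top = 10)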
# Correlation Data
corr_data <- cor(dplyr::select_if(top10_pred_df, is.numeric), use = "complete.obs")
# Correlation Plot
corrplot::corrplot(corr_data, method = 'number', type = 'lower', number.cex = 0.75, tl.cex= 0.75)
An intriguing aspect of the correlation analysis is that the three predictors negatively correlated with yield are all manufacturing process variables, with ManufacturingProcess36 showing the weakest of the negative correlations. In contrast, ManufacturingProcess32 has the strongest positive correlation with yield, at 0.61. All of the biological material predictors correlate positively with yield, suggesting a consistent positive relationship between the biological inputs and yield levels. The correlations among the predictors themselves are also noteworthy: the biological materials are all positively correlated with one another, and the strongest negative correlation among predictors is -0.79, occurring both between ManufacturingProcess36 and ManufacturingProcess32 and between ManufacturingProcess13 and ManufacturingProcess09.
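Those strongest pairs can be pulled out programmatically as a sanity check (a sketch reusing corr_data; note the plot rounds to two decimals, so the two pairs shown as -0.79 may separate slightly here):
# Locate the most negative pairwise correlation among the top-10 predictors
cd <- corr_data
diag(cd) <- NA  # blank the trivial self-correlations
which(cd == min(cd, na.rm = TRUE), arr.ind = TRUE)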