library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.94 loaded
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pls)
## Warning: package 'pls' was built under R version 4.3.3
## 
## Attaching package: 'pls'
## 
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## 
## The following object is masked from 'package:caret':
## 
##     R2
## 
## The following object is masked from 'package:stats':
## 
##     loadings
library(arm)
## Warning: package 'arm' was built under R version 4.3.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 4.3.3
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## 
## arm (Version 1.14-4, built: 2024-4-1)
## 
## Working directory is C:/Users/Owner/Desktop/DATA 624
## 
## 
## Attaching package: 'arm'
## 
## The following objects are masked from 'package:pls':
## 
##     coefplot, corrplot
## 
## The following object is masked from 'package:corrplot':
## 
##     corrplot

6.2

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug.

data(permeability)
ncol(fingerprints)
## [1] 1107
nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
num_predictors_left <- ncol(fingerprints_filtered)
num_predictors_left
## [1] 388

We started with 1107 predictors, but after applying nearZeroVar, we reduced this number to 388.

set.seed(334)
index <- createDataPartition(permeability, p = .8, list = FALSE)

# Split variables
train_fp <- fingerprints_filtered[index, ]
test_fp <- fingerprints_filtered[-index, ]
train_perm <- permeability[index]
test_perm <- permeability[-index]


# Cross Validation using 10 fold
ctrl <- trainControl(method = "cv", number = 10)

plsmodel <- train(train_fp, train_perm, method = "pls", 
                  metric = "Rsquared",tuneLength = 20, trControl = ctrl,
                  preProc = c("center", "scale"))

plot(plsmodel)

plsmodel
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 121, 120, 121, 120, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     12.99280  0.3070361  10.009069
##    2     12.16128  0.4339740   8.759500
##    3     11.91646  0.4313142   9.091931
##    4     11.71985  0.4403444   8.995034
##    5     11.52004  0.4849718   8.545651
##    6     11.22189  0.5120605   8.435937
##    7     11.12635  0.5352629   8.365518
##    8     11.15961  0.5243175   8.378692
##    9     11.33492  0.5060529   8.610821
##   10     11.29378  0.5167202   8.546452
##   11     11.25400  0.5231956   8.490934
##   12     11.29238  0.5252548   8.633259
##   13     11.36673  0.5138357   8.731267
##   14     11.42145  0.5074234   8.706800
##   15     11.67288  0.5035852   8.991768
##   16     12.14120  0.4803830   9.278428
##   17     12.49861  0.4605289   9.580962
##   18     12.77110  0.4494579   9.787425
##   19     12.81140  0.4481352   9.894780
##   20     13.24251  0.4282701  10.141927
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 7.

Based on the graph and table, it appears that the optimal number of components is 7, which yields an R-squared value of 0.5352.

# Prediction of value
predictions <- predict(plsmodel, test_fp)
# Combination of observation + prediction
pls1 <- data.frame(obs = test_perm, pred = predictions)
# calculation of prediction accuracy
defaultSummary(pls1)
##       RMSE   Rsquared        MAE 
## 12.0828371  0.4807966  9.5900950

The R-squared value for the test set appears to be 0.4808.

# LARS Model
set.seed(17)
larsmodel <- train(train_fp, train_perm, method = "lars", metric = "Rsquared",
                  tuneLength = 20, trControl = ctrl, preProc = c("center", "scale"))
plot(larsmodel)

larsmodel
## Least Angle Regression 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 119, 121, 120, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   MAE      
##   0.05      11.89877  0.4596729   8.650788
##   0.10      11.57994  0.4676499   8.659622
##   0.15      11.49042  0.4660516   8.684607
##   0.20      11.79188  0.4556552   8.757311
##   0.25      12.00700  0.4480268   8.894531
##   0.30      12.20094  0.4307403   9.035129
##   0.35      12.39225  0.4231629   9.200676
##   0.40      12.81794  0.4049887   9.494958
##   0.45      13.36303  0.3879590   9.887695
##   0.50      13.88195  0.3741034  10.243365
##   0.55      14.29867  0.3672552  10.537459
##   0.60      14.73194  0.3560844  10.849888
##   0.65      15.12604  0.3521315  11.093740
##   0.70      15.70629  0.3482580  11.399051
##   0.75      16.32444  0.3448372  11.727394
##   0.80      16.75558  0.3440126  11.966841
##   0.85      17.32837  0.3354593  12.340370
##   0.90      18.08390  0.3248463  12.693740
##   0.95      18.86889  0.3122171  13.191606
##   1.00      19.69152  0.3021566  13.722607
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was fraction = 0.1.
set.seed(18)
# Elastic Net Model
enetGrid <- expand.grid(.lambda = c(0, 0.01, .1), .fraction = seq(.05, 1, length = 20))
enetmodel <- train(train_fp, train_perm, method = "enet",
                  tuneGrid = enetGrid, trControl = ctrl, preProc = c("center", "scale"))
## Warning: model fit failed for Fold04: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
plot(enetmodel)

enetmodel
## Elasticnet 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 121, 119, 118, 120, 120, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE      Rsquared   MAE      
##   0.00    0.05      12.26077  0.4302238   9.068764
##   0.00    0.10      12.06994  0.4461138   8.710435
##   0.00    0.15      12.33234  0.4336162   9.037856
##   0.00    0.20      12.99665  0.4173619   9.596072
##   0.00    0.25      13.58559  0.4062614  10.104331
##   0.00    0.30      14.12676  0.4000653  10.474480
##   0.00    0.35      14.62540  0.3978212  10.870962
##   0.00    0.40      15.02091  0.3928948  11.235684
##   0.00    0.45      15.72952  0.3852346  11.744823
##   0.00    0.50      16.50410  0.3766445  12.313339
##   0.00    0.55      17.21365  0.3713162  12.801739
##   0.00    0.60      17.95253  0.3679022  13.261248
##   0.00    0.65      18.38448  0.3703363  13.563239
##   0.00    0.70      18.90872  0.3742150  13.855196
##   0.00    0.75      19.53553  0.3737620  14.108055
##   0.00    0.80      20.19605  0.3718859  14.397302
##   0.00    0.85      20.71126  0.3666779  14.514537
##   0.00    0.90      22.45772  0.3597702  15.061962
##   0.00    0.95      24.67305  0.3518519  15.646127
##   0.00    1.00      27.00508  0.3414503  16.498961
##   0.01    0.05      11.73387  0.4305112   8.383810
##   0.01    0.10      11.35687  0.4685121   8.118469
##   0.01    0.15      10.84106  0.4909361   7.763605
##   0.01    0.20      10.79683  0.4830988   7.779883
##   0.01    0.25      10.86258  0.4805777   7.764118
##   0.01    0.30      10.95105  0.4830138   7.798469
##   0.01    0.35      11.09094  0.4798167   7.935408
##   0.01    0.40      11.28013  0.4731556   8.093791
##   0.01    0.45      11.48214  0.4652692   8.259275
##   0.01    0.50      11.69816  0.4576207   8.454800
##   0.01    0.55      11.86757  0.4520983   8.600172
##   0.01    0.60      12.00163  0.4463469   8.706146
##   0.01    0.65      12.09158  0.4421529   8.812382
##   0.01    0.70      12.19029  0.4387677   8.946471
##   0.01    0.75      12.27026  0.4379888   9.095215
##   0.01    0.80      12.39657  0.4369456   9.264022
##   0.01    0.85      12.56291  0.4334777   9.415054
##   0.01    0.90      12.76677  0.4284080   9.586584
##   0.01    0.95      12.97475  0.4228666   9.745072
##   0.01    1.00      13.18892  0.4173654   9.898766
##   0.10    0.05      12.67053  0.3520824   9.588713
##   0.10    0.10      11.65835  0.4299693   8.303430
##   0.10    0.15      11.42173  0.4544492   8.092523
##   0.10    0.20      11.31504  0.4691684   8.070741
##   0.10    0.25      11.06593  0.4837197   7.914023
##   0.10    0.30      10.90718  0.4890784   7.819914
##   0.10    0.35      10.87891  0.4887349   7.786409
##   0.10    0.40      10.89849  0.4890789   7.779867
##   0.10    0.45      10.89071  0.4929433   7.785470
##   0.10    0.50      10.94018  0.4926868   7.838433
##   0.10    0.55      10.99183  0.4920893   7.917916
##   0.10    0.60      11.07382  0.4906018   8.015487
##   0.10    0.65      11.15629  0.4886977   8.093059
##   0.10    0.70      11.19853  0.4892970   8.145665
##   0.10    0.75      11.23936  0.4901289   8.196103
##   0.10    0.80      11.30112  0.4891838   8.268567
##   0.10    0.85      11.34713  0.4882780   8.328482
##   0.10    0.90      11.37836  0.4881447   8.367509
##   0.10    0.95      11.40881  0.4883520   8.403798
##   0.10    1.00      11.46064  0.4872150   8.456577
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.2 and lambda = 0.01.

Examining alternative models like LARS (Least-Angle Regression), we find that the optimal model was achieved at a 0.10 fraction, with an R-squared value of 0.467. Another model tested, the Elastic Net, had its best performance with a lambda of 0.00 and a fraction of 0.10, resulting in an R-squared value of 0.44611. Both models performed worse than the Partial Least Squares regression.

I wouldn’t suggest using any of the models I tested as a substitute for the permeability laboratory experiment. However, I would consider trying XGBoost or Support Vector Machines (SVM) to see if they could achieve a higher R-squared value and lower RMSE and MAE. I believe these methods might better manage a large number of components or predictors compared to those previously used. In particular, SVM could be especially effective when the number of predictors exceeds the number of samples.

6.3

data(ChemicalManufacturingProcess)
# Verifying quantity of missing data
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
library(RANN)
## Warning: package 'RANN' was built under R version 4.3.3
# Impute missing data by applying KNN
impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
imputed <- predict(impute, ChemicalManufacturingProcess)
# Verify remaining quantity of missing data
sum(is.na(imputed))
## [1] 0

Using KNN imputation, we replaced 106 missing values with imputed values. I chose KNN because biological data often follows a predictable density function, where observations with similar features tend to have similar outcomes. I opted not to remove the missing data, as doing so in a biological dataset—especially given the number of missing samples—could risk losing potential relationships in the data.

imputed <- imputed[, -nearZeroVar(imputed)]
X <- dplyr::select(imputed, -Yield)
y <- imputed$Yield

set.seed(22)
index <- createDataPartition(y, p = .8, list = FALSE)

# Split Variables
train_X <- X[index, ] %>% as.matrix()
test_X <- X[-index, ] %>% as.matrix()
train_y <- y[index]
test_y <- y[-index]


# Cross Validation with 10 Fold
ctrl <- trainControl(method = "cv", number = 10)
plsmodel <- train(x = train_X, y = train_y, method = "pls", metric = "Rsquared",
                  tuneLength = 20, trControl = ctrl, preProc = c("center", "scale"))
plot(plsmodel)

plsmodel$results

The optimal model for PLS was achieved with 3 components, resulting in an R-squared value of 0.5314297.

# Predict the value
prediction2 <- predict(plsmodel, test_X)
# Combine the Observation and Prediction
pls2 <- data.frame(obs = test_y, pred = prediction2)
# Calculate prediction accuracy
defaultSummary(pls2)
##      RMSE  Rsquared       MAE 
## 0.7758733 0.4299477 0.5180942

I chose PLS because it performed best in question 6.2. However, the resampled data shows a lower R-squared value than the training set, indicating that the model requires further tuning.

# Top 10 Predictors contained in the Model
varImpPlot <- varImp(plsmodel, scale = FALSE)

Imp2 <- data.frame(
  Variable = rownames(varImpPlot$importance),
  Score = varImpPlot$importance$Overall
)

# Sort the dataframe in descending order
sorted_Imp2 <- Imp2[order(-Imp2$Score), ]

# Extract names of the Top 10 Predictors
top10_pred <- head(sorted_Imp2$Variable, 10)
top10_pred
##  [1] "ManufacturingProcess32" "ManufacturingProcess13" "ManufacturingProcess36"
##  [4] "ManufacturingProcess09" "ManufacturingProcess17" "ManufacturingProcess33"
##  [7] "BiologicalMaterial02"   "BiologicalMaterial06"   "ManufacturingProcess06"
## [10] "BiologicalMaterial08"
top10_pred_df <- dplyr::select(X, all_of(top10_pred))
top10_pred_df$Yield <- y

It appears that among the top 10 predictors for the model, Manufacturing Process predictors are predominant, accounting for 7 of the top 10, while Biological Material predictors make up the remaining 3.

# Correlation Data
corr_data <- cor(dplyr::select_if(top10_pred_df, is.numeric), use = "complete.obs")
# Correlation Plot
corrplot::corrplot(corr_data, method = 'number', type = 'lower', number.cex = 0.75, tl.cex= 0.75)

An intriguing aspect of the correlation analysis is that all three predictors showing negative correlations with yield are from Manufacturing Process, with Manufacturing Process 36 exhibiting the weakest negative correlation. In contrast, Manufacturing Process 32 displays the highest positive correlation at 0.61. All Biological Material predictors have positive correlations with yield, suggesting a strong relationship between biological components and yield levels. Additionally, the correlations among predictors are noteworthy, as all biological materials are positively correlated with one another. The lowest negative correlation among predictors is -0.79, which occurs between Manufacturing Process 36 and Manufacturing Process 32, as well as between Manufacturing Process 13 and Manufacturing Process 9.