Homework 7

library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RANN)

Question 6.2

Part A

Developing a model to predict permeability (see Sect. 1.4) could save sig- nificant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

data(permeability)
str(fingerprints)
##  num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:165] "1" "2" "3" "4" ...
##   ..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
str(permeability)
##  num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:165] "1" "2" "3" "4" ...
##   ..$ : chr "permeability"

Part B

nzv <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv]
cat("Number of predictors remaining:", ncol(filtered_fingerprints), "\n")
## Number of predictors remaining: 388

Originally there were roughly 720 near zero variance fingerprints, with 388 predictors remaining for modeling. This is a significant reduction.

Part C

set.seed(624)

trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)

trainX <- filtered_fingerprints[trainIndex, ]
testX  <- filtered_fingerprints[-trainIndex, ]

trainY <- permeability[trainIndex]
testY  <- permeability[-trainIndex]
set.seed(624)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

pls_model <- train(
  x = trainX, y = trainY,
  method = "pls",
  preProcess = c("center", "scale"),
  tuneLength = 20,
  trControl = ctrl
)
print(pls_model)
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 121, 118, 119, 121, 119, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.52338  0.3040039  10.322412
##    2     12.38787  0.4200082   8.930705
##    3     12.47117  0.4257174   9.482308
##    4     12.45276  0.4360255   9.403024
##    5     12.32778  0.4500968   9.058762
##    6     12.22362  0.4540961   9.022149
##    7     12.04225  0.4536415   9.064597
##    8     11.84075  0.4687327   9.137174
##    9     11.82483  0.4737577   9.141689
##   10     11.89296  0.4682978   9.147988
##   11     11.85878  0.4721590   9.082044
##   12     11.85413  0.4742938   9.088010
##   13     12.04888  0.4630905   9.106178
##   14     12.27021  0.4463921   9.256883
##   15     12.51958  0.4286859   9.370305
##   16     12.62154  0.4233383   9.427397
##   17     12.90120  0.4112245   9.593511
##   18     12.96692  0.4104097   9.642338
##   19     12.99103  0.4120062   9.624802
##   20     12.87135  0.4203440   9.534101
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 9.
plot(pls_model)

cat("Optimal number of latent variables:", pls_model$bestTune$ncomp, "\n")
## Optimal number of latent variables: 9
cat("Resampled R^2 estimate:", max(pls_model$results$Rsquared), "\n")
## Resampled R^2 estimate: 0.4742938

Part D

pls_pred <- predict(pls_model, testX)
pls_R2 <- cor(pls_pred, testY)^2
cat("Test set R^2:", pls_R2, "\n")
## Test set R^2: 0.530305

Part E

set.seed(624)
rf_model <- train(
  x = trainX, y = trainY,
  method = "rf",
  trControl = ctrl,
  importance = TRUE
)
rf_pred <- predict(rf_model, testX)
rf_R2 <- cor(rf_pred, testY)^2

cat("Random Forest test R^2:", rf_R2, "\n")
## Random Forest test R^2: 0.353759

Part F

I would recommend the PLS model since it has R^2 value of 0.53 which is better then the other models I had tried.

Question 6.3

6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect.1.4. In this problem, the objective is to understand the re- lationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing pro- cess. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

Part A

data("ChemicalManufacturingProcess")
processPredictors <- subset(ChemicalManufacturingProcess,select = -Yield)
yield <- subset(ChemicalManufacturingProcess, select = "Yield")

Part B and C

sum(is.na(processPredictors))
## [1] 106
set.seed(624)

trainingRows <- createDataPartition(yield$Yield,
                                    p = 0.8,
                                    list = FALSE)

trainPredictors <- processPredictors[trainingRows,]
testPredictors <- processPredictors[-trainingRows,]

trainYield <- yield[trainingRows,]
testYield <- yield[-trainingRows,]
preprocess <- preProcess(trainPredictors,method=c("medianImpute", "center", "scale"))
preprocessTrainPredictors <- predict(preprocess,trainPredictors)
preprocessTestPredictors <- predict(preprocess,testPredictors)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(123)
rfModel <- train(x = preprocessTrainPredictors, y = trainYield, method = "rf", trControl = ctrl, tuneLength = 5, metric = "RMSE")
rfModel
## Random Forest 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    1.222643  0.6222791  1.0063736
##   15    1.117762  0.6531000  0.9001873
##   29    1.115297  0.6446640  0.8984263
##   43    1.126134  0.6293753  0.8995005
##   57    1.136023  0.6178496  0.9042034
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
rfModel$results[which.min(rfModel$results$RMSE), ]
##   mtry     RMSE Rsquared       MAE    RMSESD RsquaredSD     MAESD
## 3   29 1.115297 0.644664 0.8984263 0.2456929  0.1280491 0.1939157

Part D

rfPred <- predict(rfModel, newdata = preprocessTestPredictors)

testResults <- postResample(pred = rfPred, obs = testYield)
testResults
##      RMSE  Rsquared       MAE 
## 1.2199526 0.7359082 0.8711505
resampled <- rfModel$results[which.min(rfModel$results$RMSE), ]
comparison <- data.frame(
Data = c("Training (CV)", "Test"),
RMSE = c(resampled$RMSE, testResults["RMSE"]),
Rsquared = c(resampled$Rsquared, testResults["Rsquared"])
)
comparison
##               Data     RMSE  Rsquared
##      Training (CV) 1.115297 0.6446640
## RMSE          Test 1.219953 0.7359082

Part E

varImpRF <- varImp(rfModel)
plot(varImpRF, top = 15)

topVars <- varImpRF$importance %>%
arrange(desc(Overall)) %>%
head(10) %>%
rownames()
topVars
##  [1] "ManufacturingProcess32" "BiologicalMaterial12"   "BiologicalMaterial06"  
##  [4] "BiologicalMaterial03"   "ManufacturingProcess09" "ManufacturingProcess31"
##  [7] "ManufacturingProcess13" "ManufacturingProcess06" "ManufacturingProcess36"
## [10] "BiologicalMaterial11"
bio_predictors <- colnames(processPredictors)[1:12]
process_predictors <- colnames(processPredictors)[13:57]

data.frame(
Predictor = topVars,
Type = ifelse(topVars %in% bio_predictors, "Biological", "Process")
)
##                 Predictor       Type
## 1  ManufacturingProcess32    Process
## 2    BiologicalMaterial12 Biological
## 3    BiologicalMaterial06 Biological
## 4    BiologicalMaterial03 Biological
## 5  ManufacturingProcess09    Process
## 6  ManufacturingProcess31    Process
## 7  ManufacturingProcess13    Process
## 8  ManufacturingProcess06    Process
## 9  ManufacturingProcess36    Process
## 10   BiologicalMaterial11 Biological

Part F

library(tidyr)

topData <- processPredictors[, topVars]
yield <- unlist(yield)

topData$Yield <- as.numeric(yield)

top6 <- topVars[1:6]

topData_long <- topData %>%
  pivot_longer(cols = all_of(top6), names_to = "Predictor", values_to = "Value")

ggplot(topData_long, aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  facet_wrap(~ Predictor, scales = "free", ncol = 3) +
  labs(
    title = "Relationships Between Top Predictors and Product Yield",
    x = "Predictor Value",
    y = "Percent Yield"
  ) + theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).