6.2 (a) Start R and use these commands to load the data:library(AppliedPredictiveModeling) data(permeability)The matrix fingerprints contains the 1,107 binary molecular predic- tors for the 165 compounds, while permeability contains permeability response.

data(permeability) 

6.2 (b) The fingerprint predictors indicate the presence or absence of substruc- tures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

low_freq <- nearZeroVar(fingerprints, saveMetrics = TRUE)
fingerprints_filtered <- fingerprints[, !low_freq$nzv]
ncol(fingerprints_filtered)
## [1] 388

6.2 (c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is

\(R^2\)

set.seed(123)
train_idx <- createDataPartition(permeability, p = 0.8, list = FALSE)

train_data <- fingerprints_filtered[train_idx, ]
test_data <- fingerprints_filtered[-train_idx, ]
train_labels <- permeability[train_idx]
test_labels <- permeability[-train_idx]

preProc <- preProcess(train_data, method = c("center", "scale"))
train_trans <- predict(preProc, train_data)
test_trans <- predict(preProc, test_data)

plsFit <- train(train_trans, train_labels, method = "pls", tuneLength = 10)
plsFit$bestTune
##   ncomp
## 3     3

6.2 (d) Predict the response for the test set. What is the test set estimate of

\(R^2\) ?

pls_preds <- predict(plsFit, test_trans)
pls_r2 <- cor(test_labels, pls_preds)^2
paste("PLS R^2:", round(pls_r2, 4))
## [1] "PLS R^2: 0.2695"

6.2(e) Try building other models discussed in this chapter. Do any have better predictive performance?

Tried to do a Linear model for this but I failed to get it working.

lm_model <- lm(train_labels ~ ., data = as.data.frame(train_trans))
lm_preds <- predict(lm_model, as.data.frame(test_trans))
lm_r2 <- cor(test_labels, lm_preds)^2

pcr_model <- train(train_trans, train_labels, method = "pcr", tuneLength = 10)
pcr_preds <- predict(pcr_model, test_trans)
pcr_r2 <- cor(test_labels, pcr_preds)^2

cat("PLS R^2:", round(pls_r2, 4), "\n")
## PLS R^2: 0.2695
cat("PCR R^2:", round(pcr_r2, 4), "\n")
## PCR R^2: 0.2687
cat("Linear Model R^2:", round(lm_r2, 4), "\n")
## Linear Model R^2: 0.0785

6.2 (f) Would you recommend any of your models to replace the permeability laboratory experiment?

No, Although the Partial Least Squares model shows the best performance with an \(R^2\) of about 0.296, which is less than 30% of the variance in permeability data. This level of predictive accuracy is not enough to fully replace the detailed and highly accurate data obtained from laboratory experiments.

6.3 Chemical Manufacturing Process

data(ChemicalManufacturingProcess)
predictors <- ChemicalManufacturingProcess[, 2:58]
yield <- ChemicalManufacturingProcess$Yield

6.3 (b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values

predictors <- predictors %>%
  mutate(across(everything(), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

zero_var_cols <- nearZeroVar(predictors)
predictors <- predictors[, -zero_var_cols, drop = FALSE]

cat("Number of predictors after preprocessing:", ncol(predictors), "\n")
## Number of predictors after preprocessing: 56
cat("Any NAs? ", any(is.na(predictors)), "\n")
## Any NAs?  FALSE
cat("Any columns with zero variance remaining? ", any(apply(predictors, 2, var) == 0), "\n")
## Any columns with zero variance remaining?  FALSE

6.3 (c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

set.seed(0)
train_idx <- createDataPartition(yield, p = 0.8, list = FALSE)
x_train <- predictors[train_idx, ]
x_test <- predictors[-train_idx, ]
y_train <- yield[train_idx]
y_test <- yield[-train_idx]

preProc <- preProcess(x_train, method = c("center", "scale"))
x_train_scaled <- predict(preProc, x_train)
x_test_scaled <- predict(preProc, x_test)

pls_model <- train(x_train_scaled, y_train, method = "pls",
                   tuneLength = 10,
                   trControl = trainControl(method = "repeatedcv", number = 10, repeats = 3))

best_ncomp <- pls_model$bestTune$ncomp
optimal_r2 <- max(pls_model$results$R2)

cat("Optimal ncomp:", best_ncomp, "\n")
## Optimal ncomp: 4
cat("Optimal R^2 on training set:", round(optimal_r2, 4), "\n")
## Optimal R^2 on training set: -Inf

6.3 (d) Predict the response for the test set. What is the value of the performancemetric and how does this compare with the resampled performance metric on the training set?

pls_test_preds <- predict(pls_model, x_test_scaled)
test_r2 <- cor(pls_test_preds, y_test)^2
test_rmse <- RMSE(pls_test_preds, y_test)

optimal_r2 <- max(pls_model$results$R2, na.rm = TRUE)

if (!is.na(optimal_r2) && !is.infinite(optimal_r2)) {
  r2_diff <- optimal_r2 - test_r2
} else {
  r2_diff <- NA
}

cat("Test R²:", round(test_r2, 4), "\n")
## Test R²: 0.6538
cat("Test RMSE:", round(test_rmse, 4), "\n")
## Test RMSE: 1.0308
cat("Difference in R²:", round(r2_diff, 4), "\n")
## Difference in R²: NA

6.3 (e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

enet_model <- enet(x = as.matrix(x_train), y = y_train, lambda = 0.5, normalize = TRUE)
enet_coefs <- predict(enet_model, s = 0.35, type = "coefficients", mode = "fraction")$coefficients
non_zero_coefs <- enet_coefs[enet_coefs != 0]
sort(non_zero_coefs, decreasing = TRUE)
## ManufacturingProcess34 ManufacturingProcess09 ManufacturingProcess32 
##           6.316599e-01           2.025896e-01           1.104772e-01 
##   BiologicalMaterial06 ManufacturingProcess06   BiologicalMaterial03 
##           2.679143e-02           2.100535e-02           1.435462e-02 
##   BiologicalMaterial02 ManufacturingProcess12 ManufacturingProcess37 
##           9.790877e-03           2.105723e-05          -3.550973e-02 
## ManufacturingProcess17 ManufacturingProcess13 ManufacturingProcess36 
##          -1.732512e-01          -2.830565e-01          -3.955832e+02

6.3 (f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

there is strong positive relationship between Manufacturing Process and yield. This may mean that the variable is essential to output in the chemical manufacturing process. This relationship can help focus on monitoring and optimizing Manufacturing Process during production. This can be maintained at the higher end of its operational range, the yield may be consistently improved.

top_feature <- names(sort(abs(non_zero_coefs), decreasing = TRUE))[1]
value_range <- seq(min(predictors[[top_feature]]), max(predictors[[top_feature]]), length.out = 100)

newdata <- data.frame(matrix(rep(colMeans(predictors), each = 100), nrow = 100))
colnames(newdata) <- colnames(predictors)
newdata[[top_feature]] <- value_range

pred_yield <- predict(enet_model, newx = as.matrix(newdata), s = 0.35, mode = "fraction", type = "fit")$fit

ggplot(data.frame(value_range, pred_yield), aes(x = value_range, y = pred_yield)) +
  geom_line(color = "blue") +
  labs(title = paste("Effect of", top_feature, "on Yield"),
       x = top_feature, y = "Predicted Yield") +
  theme_minimal()