6.2

Taking a look at the data:

data(permeability)

summary(permeability)
##   permeability  
##  Min.   : 0.06  
##  1st Qu.: 1.55  
##  Median : 4.91  
##  Mean   :12.24  
##  3rd Qu.:15.47  
##  Max.   :55.60
head(permeability)
##   permeability
## 1       12.520
## 2        1.120
## 3       19.405
## 4        1.730
## 5        1.680
## 6        0.510
# summary(fingerprints)
hist(permeability)

summary(apply(fingerprints, 2, mean))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.006061 0.024242 0.154767 0.181818 1.000000
  1. The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
initial_num_predictors <- ncol(fingerprints)
initial_num_predictors
## [1] 1107
near_zero_variables <- nearZeroVar(fingerprints)

filtered_fingerprints <- fingerprints[, -near_zero_variables]

num_predictors_left <- ncol(filtered_fingerprints)
num_predictors_left
## [1] 388

It looks like 719 predictors got removed. We have 388 predictors left.

  1. Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?
set.seed(123)

trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
train_fingerprints <- filtered_fingerprints[trainIndex, ]
test_fingerprints <- filtered_fingerprints[-trainIndex, ]
train_permeability <- permeability[trainIndex]
test_permeability <- permeability[-trainIndex]
preProcess_params <- preProcess(train_fingerprints, method = c("center", "scale"))
train_fingerprints <- predict(preProcess_params, train_fingerprints)
test_fingerprints <- predict(preProcess_params, test_fingerprints)
train_control <- trainControl(method = "cv", number = 10)
pls_model <- train(
  x = train_fingerprints,
  y = train_permeability,
  method = "pls",
  tuneLength = 20, 
  trControl = train_control,
  preProcess = c("center", "scale")
)

optimal_components <- pls_model$bestTune$ncomp

The number of optimal components is 6, and the resulting resampled R-squared is 0.53.

We can look at the rmsep as we add components:

best_r2 <- max(pls_model$results$Rsquared)
list(optimal_components = optimal_components, best_r2 = best_r2)
## $optimal_components
## [1] 6
## 
## $best_r2
## [1] 0.5335956
rmsep_values <- pls_model$results

ggplot(rmsep_values, aes(x = ncomp, y = RMSE)) +
  geom_line() +
  geom_point() +
  labs(
    title = "RMSEP vs Number of Components in PLS Model",
    x = "Number of Components",
    y = "RMSEP"
  ) +
  theme_minimal()

  1. Predict the response for the test set. What is the test set estimate of R2?
test_predictions <- predict(pls_model, newdata = test_fingerprints)

test_r2 <- cor(test_predictions, test_permeability)^2
test_r2
## [1] 0.3244542

The test estimate R-squared is 0.3244542.

  1. Try building other models discussed in this chapter. Do any have better predictive performance?
train_control <- trainControl(method = "cv", number = 10)

ridge_model <- train(
  x = train_fingerprints,
  y = train_permeability,
  method = "ridge",
  trControl = train_control,
  preProcess = c("center", "scale")
)
## Warning: model fit failed for Fold02: lambda=0e+00 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
lasso_model <- train(
  x = train_fingerprints,
  y = train_permeability,
  method = "lasso",
  trControl = train_control,
  preProcess = c("center", "scale")
)
## Warning: model fit failed for Fold03: fraction=0.9 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed

## Warning: There were missing values in resampled performance measures.
elastic_net_model <- train(
  x = train_fingerprints,
  y = train_permeability,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = seq(0, 1, length = 10), lambda = seq(0.01, 0.1, length = 10)),
  trControl = train_control,
  preProcess = c("center", "scale")
)
model_results <- resamples(list(
  PLS = pls_model,
  Ridge = ridge_model,
  Lasso = lasso_model,
  ElasticNet = elastic_net_model
))

summary(model_results)
## 
## Call:
## summary.resamples(object = model_results)
## 
## Models: PLS, Ridge, Lasso, ElasticNet 
## Number of resamples: 10 
## 
## MAE 
##                Min.  1st Qu.   Median      Mean   3rd Qu.     Max. NA's
## PLS        5.940748 7.855195 8.101540  8.658301  8.587837 12.53980    0
## Ridge      5.151345 7.914686 9.297293  9.609113 11.384128 15.00864    0
## Lasso      5.580568 8.240237 9.799450 14.693278 11.754338 43.53765    1
## ElasticNet 6.805424 7.517214 7.980207  8.463982  8.901325 11.12721    0
## 
## RMSE 
##                 Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PLS         7.479049 10.66227 11.18215 11.53275 11.66271 16.49145    0
## Ridge       7.919648 10.69651 12.44402 12.91785 15.43402 18.02992    0
## Lasso       7.170588 11.62398 12.19772 20.47861 17.05290 63.21441    1
## ElasticNet 10.030656 10.12274 10.49953 11.44026 11.49829 15.68488    0
## 
## Rsquared 
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## PLS        2.072535e-01 0.3875424 0.5626977 0.5335956 0.6291758 0.8326590    0
## Ridge      1.520028e-01 0.4317830 0.5265775 0.5272785 0.7257137 0.8134886    0
## Lasso      4.994135e-06 0.2185169 0.3885725 0.4089517 0.6773989 0.7810474    1
## ElasticNet 1.828886e-01 0.2909709 0.5741940 0.5211502 0.6512159 0.8751188    0
r2_summary <- summary(model_results)$statistics$Rsquared

r2_means <- r2_summary[, "Mean"]

best_model <- names(which.max(r2_means))
best_model
## [1] "PLS"

PLS is the best model based on R-squared.

  1. Would you recommend any of your models to replace the permeability laboratory experiment?

Based on R-squared the lasso model is the best of the bunch, and that’s with a score of 0.57, which is fairly low indicates weak predictive power. The best model we have won’t capture enough variability in permeability to justify using it over experiments, and being a scientific setting we would need to be as precise and accurate as possible.

6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), 6.5 Computing 139 measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch: a. Start R and use these commands to load the data:

data(ChemicalManufacturingProcess)

# str(ChemicalManufacturingProcess)

# summary(ChemicalManufacturingProcess)


head(ChemicalManufacturingProcess)
##   Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38.00                 6.25                49.58                56.97
## 2 42.44                 8.01                60.97                67.48
## 3 42.03                 8.01                60.97                67.48
## 4 41.42                 8.01                60.97                67.48
## 5 42.49                 7.47                63.33                72.25
## 6 43.57                 6.12                58.36                65.31
##   BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1                12.74                19.51                43.73
## 2                14.65                19.36                53.14
## 3                14.65                19.36                53.14
## 4                14.65                19.36                53.14
## 5                14.02                17.91                54.66
## 6                15.17                21.79                51.23
##   BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1                  100                16.66                11.44
## 2                  100                19.04                12.55
## 3                  100                19.04                12.55
## 4                  100                19.04                12.55
## 5                  100                18.22                12.80
## 6                  100                18.30                12.13
##   BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1                 3.46               138.09                18.83
## 2                 3.46               153.67                21.05
## 3                 3.46               153.67                21.05
## 4                 3.46               153.67                21.05
## 5                 3.05               147.61                21.05
## 6                 3.78               151.88                20.76
##   ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1                     NA                     NA                     NA
## 2                    0.0                      0                     NA
## 3                    0.0                      0                     NA
## 4                    0.0                      0                     NA
## 5                   10.7                      0                     NA
## 6                   12.0                      0                     NA
##   ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1                     NA                     NA                     NA
## 2                    917                 1032.2                  210.0
## 3                    912                 1003.6                  207.1
## 4                    911                 1014.6                  213.3
## 5                    918                 1027.5                  205.7
## 6                    924                 1016.8                  208.9
##   ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1                     NA                     NA                  43.00
## 2                    177                    178                  46.57
## 3                    178                    178                  45.07
## 4                    177                    177                  44.92
## 5                    178                    178                  44.96
## 6                    178                    178                  45.32
##   ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1                     NA                     NA                     NA
## 2                     NA                     NA                      0
## 3                     NA                     NA                      0
## 4                     NA                     NA                      0
## 5                     NA                     NA                      0
## 6                     NA                     NA                      0
##   ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1                   35.5                   4898                   6108
## 2                   34.0                   4869                   6095
## 3                   34.8                   4878                   6087
## 4                   34.8                   4897                   6102
## 5                   34.6                   4992                   6233
## 6                   34.0                   4985                   6222
##   ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1                   4682                   35.5                   4865
## 2                   4617                   34.0                   4867
## 3                   4617                   34.8                   4877
## 4                   4635                   34.8                   4872
## 5                   4733                   33.9                   4886
## 6                   4786                   33.4                   4862
##   ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1                   6049                   4665                    0.0
## 2                   6097                   4621                    0.0
## 3                   6078                   4621                    0.0
## 4                   6073                   4611                    0.0
## 5                   6102                   4659                   -0.7
## 6                   6115                   4696                   -0.6
##   ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1                     NA                     NA                     NA
## 2                      3                      0                      3
## 3                      4                      1                      4
## 4                      5                      2                      5
## 5                      8                      4                     18
## 6                      9                      1                      1
##   ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1                   4873                   6074                   4685
## 2                   4869                   6107                   4630
## 3                   4897                   6116                   4637
## 4                   4892                   6111                   4630
## 5                   4930                   6151                   4684
## 6                   4871                   6128                   4687
##   ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1                   10.7                   21.0                    9.9
## 2                   11.2                   21.4                    9.9
## 3                   11.1                   21.3                    9.4
## 4                   11.1                   21.3                    9.4
## 5                   11.3                   21.6                    9.0
## 6                   11.4                   21.7                   10.1
##   ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1                   69.1                    156                     66
## 2                   68.7                    169                     66
## 3                   69.3                    173                     66
## 4                   69.3                    171                     68
## 5                   69.4                    171                     70
## 6                   68.2                    173                     70
##   ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1                    2.4                    486                  0.019
## 2                    2.6                    508                  0.019
## 3                    2.6                    509                  0.018
## 4                    2.5                    496                  0.018
## 5                    2.5                    468                  0.017
## 6                    2.5                    490                  0.018
##   ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1                    0.5                      3                    7.2
## 2                    2.0                      2                    7.2
## 3                    0.7                      2                    7.2
## 4                    1.2                      2                    7.2
## 5                    0.2                      2                    7.3
## 6                    0.4                      2                    7.2
##   ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1                     NA                     NA                   11.6
## 2                    0.1                   0.15                   11.1
## 3                    0.0                   0.00                   12.0
## 4                    0.0                   0.00                   10.6
## 5                    0.0                   0.00                   11.0
## 6                    0.0                   0.00                   11.5
##   ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1                    3.0                    1.8                    2.4
## 2                    0.9                    1.9                    2.2
## 3                    1.0                    1.8                    2.3
## 4                    1.1                    1.8                    2.1
## 5                    1.1                    1.7                    2.1
## 6                    2.2                    1.8                    2.0
  1. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values
rows_with_na <- ChemicalManufacturingProcess[!complete.cases(ChemicalManufacturingProcess), ]
NROW(rows_with_na)
## [1] 24
data(ChemicalManufacturingProcess)

predictors <- ChemicalManufacturingProcess[, -1] 

preProcess_params <- preProcess(predictors, method = "medianImpute")
predictors_imputed <- predict(preProcess_params, predictors)

sum(is.na(predictors_imputed)) 
## [1] 0
ChemicalManufacturingProcess_imputed <- cbind(Yield = ChemicalManufacturingProcess$Yield, predictors_imputed)

rows_with_na_after <- ChemicalManufacturingProcess_imputed[!complete.cases(ChemicalManufacturingProcess_imputed), ]
NROW(rows_with_na_after) 
## [1] 0
  1. Split the data into a training and a test set, pre-process the data, and tune a PSL model. What is the optimal value of the performance metric?
set.seed(123)

trainIndex <- createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.8, list = FALSE)
train_data <- ChemicalManufacturingProcess[trainIndex, ]
test_data <- ChemicalManufacturingProcess[-trainIndex, ]

train_predictors <- train_data[, -1]
train_yield <- train_data$Yield
test_predictors <- test_data[, -1]
test_yield <- test_data$Yield

preProcess_params <- preProcess(train_predictors, method = c("center", "scale"))
train_predictors <- predict(preProcess_params, train_predictors)
test_predictors <- predict(preProcess_params, test_predictors)

train_control <- trainControl(method = "cv", number = 10)

pls_model <- train(
  x = train_predictors,
  y = train_yield,
  method = "pls",
  tuneLength = 20, 
  trControl = train_control,
  preProcess = c("center", "scale")
)

optimal_components <- pls_model$bestTune$ncomp

optimal_rmse <- min(pls_model$results$RMSE)
optimal_r2 <- max(pls_model$results$Rsquared)

list(
  optimal_components = optimal_components,
  optimal_rmse = optimal_rmse,
  optimal_r2 = optimal_r2
)
## $optimal_components
## [1] 3
## 
## $optimal_rmse
## [1] 1.131543
## 
## $optimal_r2
## [1] 0.6499904

The optimal number of components (3) gives us an RMSE of 1.13.

rmse_values <- pls_model$results

ggplot(rmse_values, aes(x = ncomp, y = RMSE)) +
  geom_line() +
  geom_point() +
  labs(
    title = "RMSEP vs Number of Components in PLS Model",
    x = "Number of Components",
    y = "RMSEP"
  ) +
  theme_minimal()

  1. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
sum(is.na(test_predictions)) 
## [1] 0
sum(is.na(test_yield))       
## [1] 0
complete_cases <- !is.na(test_predictions)
test_predictions <- test_predictions[complete_cases]
test_yield <- test_yield[complete_cases]

test_rmse <- RMSE(test_predictions, test_yield)

ss_total <- sum((test_yield - mean(test_yield))^2)
ss_residual <- sum((test_yield - test_predictions)^2)
test_r2 <- 1 - (ss_residual / ss_total)

train_rmse <- min(pls_model$results$RMSE)
train_r2 <- max(pls_model$results$Rsquared)

list(
  test_rmse = test_rmse,
  test_r2 = test_r2,
  train_rmse = train_rmse,
  train_r2 = train_r2
)
## $test_rmse
## [1] 30.72849
## 
## $test_r2
## [1] -285.1475
## 
## $train_rmse
## [1] 1.131543
## 
## $train_r2
## [1] 0.6499904

The test RMSE is higher than the train RMSE.

  1. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
vip_scores <- vip(pls_model$finalModel)
vip_scores

  1. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
library(ggplot2)

top_predictors <- c("ManufacturingProcess09", "ManufacturingProcess17", 
                    "ManufacturingProcess13", "ManufacturingProcess32", 
                    "ManufacturingProcess36", "BiologicalMaterial03", 
                    "BiologicalMaterial06", "BiologicalMaterial02", 
                    "ManufacturingProcess06", "ManufacturingProcess11")

selected_data <- ChemicalManufacturingProcess[, c("Yield", top_predictors)]

long_data <- melt(selected_data, id.vars = "Yield", variable.name = "Predictor", value.name = "Value")

ggplot(long_data, aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ Predictor, scales = "free_x") +
  labs(title = "Relationships between Top Predictors and Yield",
       x = "Predictor Value",
       y = "Yield") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

I would definitely address some of these- for example ManufacturingProcess36 seems to be categorical. We could also look into removing some of the outliers that might be skering ManufacturingProcess06, 09, 17, and 13.

library(ggplot2)
library(reshape2)

cor_data <- ChemicalManufacturingProcess[, c("Yield", top_predictors)]

cor_matrix <- cor(cor_data, use = "complete.obs")

melted_cor_matrix <- melt(cor_matrix)

ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  labs(title = "Correlation Matrix of Top Predictors and Yield",
       x = "Predictors",
       y = "Predictors") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   hjust = 1))

Some of these also correlate with each other like BiologicalMaterial02 and 3, and 02 and 06.