library(ggplot2)
library(caret)
library(dplyr)
library(pls)
library(RANN)
library(MASS)
library(lars)
library(randomForest)
library(elasticnet)
library(ggcorrplot)
library(tidyr)
library(reshape2)
set.seed(895)
theme_set(theme_bw())

6.2.

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

(a)

Start R and use these commands to load the data:

library(AppliedPredictiveModeling) 
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.1
data(permeability) 

hist(permeability, 
     main = "Histogram of Permeability Response", 
     xlab = "Permeability",
     breaks = 20,
     col = "purple", 
     border = "black")

hist(log(permeability), 
     main = "Histogram of log(Permeability) Response",
     xlab = "log(Permeability)",
     breaks = 20,
     col = "purple", 
     border = "black")

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.

(b)

The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

dim(fingerprints)
## [1]  165 1107
fingerprints <- fingerprints[, -nearZeroVar(fingerprints)]

mod_predictors <- dim(fingerprints)

print(paste("There are", mod_predictors[2], "predictors left for modeling"))
## [1] "There are 388 predictors left for modeling"

(c)

Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

With only 165 samples against 388 predictors, the risk of overfitting due to high dimensionality needs to be addressed. To get a more reliable evaluation, we allocate 25% of the data to the test set, which gives a better estimate of the model’s performance on unseen data.

# set up 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)

# partition the data: 75% training, 25% test
training_index <- createDataPartition(
  permeability, p = .75, list = FALSE
)

train_permeability <- permeability[training_index, ]
train_fingerprints <- fingerprints[training_index, ]

test_fingerprints <- fingerprints[-training_index, ]
test_permeability <- permeability[-training_index, ]

# tune PLS on the log-transformed response, selecting ncomp by R-squared
pf_pse <- train(train_fingerprints, log(train_permeability),
                method = "pls",
                tuneLength = 20,
                trControl = ctrl,
                metric = "Rsquared",
                preProcess = c("center", "scale")
                )
plot(pf_pse)

 

pf_pse
## Partial Least Squares 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 113, 113, 111, 113, 112, 111, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     1.321938  0.2923253  1.0789982
##    2     1.180087  0.4451105  0.9428264
##    3     1.162285  0.4799376  0.9200157
##    4     1.180023  0.4796177  0.9372717
##    5     1.178906  0.4670542  0.9306719
##    6     1.177576  0.4690554  0.9476024
##    7     1.165247  0.4731429  0.9264735
##    8     1.148156  0.4899360  0.9188411
##    9     1.153504  0.5020292  0.9175574
##   10     1.179227  0.4809183  0.9137908
##   11     1.189684  0.4808153  0.9222334
##   12     1.231449  0.4592588  0.9520088
##   13     1.257302  0.4592794  0.9899096
##   14     1.302738  0.4399447  1.0134352
##   15     1.341889  0.4230849  1.0417257
##   16     1.352056  0.4227361  1.0360537
##   17     1.382593  0.4139410  1.0573537
##   18     1.414888  0.3962675  1.0718384
##   19     1.417260  0.3988993  1.0820468
##   20     1.445214  0.3888665  1.1071974
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 9.
The optimal PLS model uses 9 latent variables, with a resampled (cross-validated) R-squared of 0.502.
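
The same values can be read off the train object directly; a minimal sketch using caret's standard bestTune and results fields:

# Selected number of latent variables and its resampled performance
pf_pse$bestTune
subset(pf_pse$results, ncomp == pf_pse$bestTune$ncomp)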

(d)

Predict the response for the test set. What is the test set estimate of R2?

pred_pf_pse <- predict(pf_pse, test_fingerprints)


postResample(pred_pf_pse , log(test_permeability))
##      RMSE  Rsquared       MAE 
## 1.0687335 0.5355765 0.9161883

Test-set R-squared: 0.536 (RMSE 1.069, MAE 0.916).
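
caret's postResample() reports R-squared as the squared correlation between predicted and observed values, so the test-set estimate can be reproduced by hand (a quick sketch using only the objects created above):

# Squared correlation between predicted and observed log-permeability
cor(pred_pf_pse, log(test_permeability))^2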

(e)

Try building other models discussed in this chapter. Do any have better predictive performance?

# LARS with the same cross-validation setup
pf_lars <- train(train_fingerprints, log(train_permeability),
                 method = "lars",
                 tuneLength = 20,
                 trControl = ctrl, 
                 preProcess = c("center", "scale"),
                 metric = "Rsquared"
                 )

plot(pf_lars)

pf_lars
## Least Angle Regression 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 112, 113, 112, 113, 113, 112, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   MAE      
##   0.05      1.052486  0.5652863  0.8211627
##   0.10      1.152694  0.4858275  0.8725443
##   0.15      1.206575  0.4695524  0.9244744
##   0.20      1.227783  0.4663085  0.9502911
##   0.25      1.246697  0.4679413  0.9669804
##   0.30      1.295614  0.4529869  0.9941294
##   0.35      1.385992  0.4215663  1.0536409
##   0.40      1.509456  0.3830917  1.1189173
##   0.45      1.630297  0.3500587  1.1896033
##   0.50      1.769472  0.3177640  1.2629085
##   0.55      1.894719  0.2889009  1.3407120
##   0.60      2.030434  0.2549957  1.4216533
##   0.65      2.151231  0.2329804  1.5021205
##   0.70      2.290368  0.2137760  1.5961309
##   0.75      2.441258  0.2011728  1.6835129
##   0.80      2.577702  0.1922686  1.7609354
##   0.85      2.714031  0.1859985  1.8478823
##   0.90      2.859754  0.1874161  1.9433431
##   0.95      2.998400  0.1866377  2.0263506
##   1.00      3.140877  0.1889948  2.1214397
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was fraction = 0.05.
pred_pf_lars <- predict(pf_lars, test_fingerprints)


postResample(pred_pf_lars , log(test_permeability))
##      RMSE  Rsquared       MAE 
## 1.0047715 0.5520773 0.7888338

Test-set R-squared: 0.552 (RMSE 1.005, MAE 0.789).

# elastic net over a grid of lambda (ridge penalty) and fraction (lasso path)
pf_enet <- train(
  train_fingerprints, log(train_permeability),
  method = "enet",
  tuneGrid = expand.grid(
    .lambda = c(0, .001, .01, .1, 1),
    .fraction = seq(.05, 1, length = 20)),
  trControl = ctrl,
  preProcess = c("center", "scale"),
  metric = "Rsquared"
  )

plot(pf_enet)

pf_enet
## Elasticnet 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 113, 111, 113, 112, 113, 113, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE          Rsquared   MAE         
##   0.000   0.05          1.223173  0.5427603     0.9991002
##   0.000   0.10          1.074484  0.5965196     0.8664075
##   0.000   0.15          1.052029  0.5816848     0.8339628
##   0.000   0.20          1.055622  0.5637778     0.8292775
##   0.000   0.25          1.073311  0.5446703     0.8450238
##   0.000   0.30          1.102214  0.5252925     0.8717633
##   0.000   0.35          1.123415  0.5150757     0.8898876
##   0.000   0.40          1.142258  0.5069549     0.9043022
##   0.000   0.45          1.159744  0.4993819     0.9174898
##   0.000   0.50          1.176766  0.4915677     0.9341756
##   0.000   0.55          1.196852  0.4830362     0.9510716
##   0.000   0.60          1.219363  0.4735019     0.9683233
##   0.000   0.65          1.241701  0.4639803     0.9819124
##   0.000   0.70          1.263248  0.4542146     0.9954213
##   0.000   0.75          1.284478  0.4452134     1.0107538
##   0.000   0.80          1.304042  0.4382710     1.0228841
##   0.000   0.85          1.325595  0.4304063     1.0373951
##   0.000   0.90          1.346457  0.4232432     1.0530809
##   0.000   0.95          1.363138  0.4177316     1.0651572
##   0.000   1.00          1.376314  0.4141733     1.0740363
##   0.001   0.05        615.042129  0.1874819   420.2634372
##   0.001   0.10       1149.410076  0.1708340   742.5505609
##   0.001   0.15       1660.333543  0.1645549  1031.0102449
##   0.001   0.20       2178.971762  0.1598286  1303.9174898
##   0.001   0.25       2695.283455  0.1549130  1577.4118849
##   0.001   0.30       3212.548609  0.1522761  1852.7600166
##   0.001   0.35       3748.129707  0.1511459  2127.2017765
##   0.001   0.40       4290.912559  0.1506788  2397.7008110
##   0.001   0.45       4834.051636  0.1512220  2671.2494134
##   0.001   0.50       5376.179080  0.1515879  2947.2755515
##   0.001   0.55       5918.155091  0.1519440  3222.2788679
##   0.001   0.60       6460.798352  0.1522166  3496.8895133
##   0.001   0.65       7002.165556  0.1523819  3770.0309846
##   0.001   0.70       7547.135783  0.1526413  4042.9134326
##   0.001   0.75       8099.660789  0.1529501  4322.0231712
##   0.001   0.80       8650.458946  0.1532374  4608.9834096
##   0.001   0.85       9198.152888  0.1536682  4897.2659143
##   0.001   0.90       9745.058078  0.1540934  5189.0047201
##   0.001   0.95      10283.205471  0.1544762  5476.6543302
##   0.001   1.00      10794.090734  0.1547453  5751.8955065
##   0.010   0.05          1.135491  0.5602211     0.8825960
##   0.010   0.10          1.196787  0.5164871     0.9267847
##   0.010   0.15          1.291932  0.4829455     1.0009095
##   0.010   0.20          1.385194  0.4610574     1.0815067
##   0.010   0.25          1.474657  0.4476609     1.1494827
##   0.010   0.30          1.552880  0.4360359     1.2103527
##   0.010   0.35          1.617460  0.4298311     1.2638132
##   0.010   0.40          1.678582  0.4262848     1.3114442
##   0.010   0.45          1.736598  0.4264166     1.3528744
##   0.010   0.50          1.796902  0.4276136     1.3976338
##   0.010   0.55          1.863035  0.4287982     1.4436962
##   0.010   0.60          1.931303  0.4287717     1.4925991
##   0.010   0.65          2.007307  0.4237354     1.5492793
##   0.010   0.70          2.087002  0.4151966     1.6096194
##   0.010   0.75          2.169021  0.4055075     1.6688511
##   0.010   0.80          2.249190  0.3976653     1.7270521
##   0.010   0.85          2.330287  0.3881689     1.7855796
##   0.010   0.90          2.411713  0.3785007     1.8440442
##   0.010   0.95          2.487526  0.3714464     1.9015414
##   0.010   1.00          2.560660  0.3652223     1.9572575
##   0.100   0.05          1.175557  0.5318285     0.9615614
##   0.100   0.10          1.016730  0.6053389     0.7970081
##   0.100   0.15          1.034542  0.5822048     0.8019414
##   0.100   0.20          1.076921  0.5557788     0.8362634
##   0.100   0.25          1.103592  0.5422040     0.8558684
##   0.100   0.30          1.128161  0.5318605     0.8786744
##   0.100   0.35          1.149493  0.5229358     0.8982873
##   0.100   0.40          1.163447  0.5185716     0.9096305
##   0.100   0.45          1.176577  0.5145723     0.9172881
##   0.100   0.50          1.183773  0.5133348     0.9169192
##   0.100   0.55          1.188478  0.5135032     0.9170694
##   0.100   0.60          1.193305  0.5139860     0.9179140
##   0.100   0.65          1.196366  0.5158602     0.9168251
##   0.100   0.70          1.198068  0.5186656     0.9142862
##   0.100   0.75          1.200502  0.5203454     0.9123649
##   0.100   0.80          1.202621  0.5223746     0.9110841
##   0.100   0.85          1.206775  0.5230226     0.9123517
##   0.100   0.90          1.211283  0.5227830     0.9147724
##   0.100   0.95          1.215539  0.5227227     0.9163376
##   0.100   1.00          1.221077  0.5214635     0.9180705
##   1.000   0.05          1.232576  0.4792335     1.0194635
##   1.000   0.10          1.102049  0.5091061     0.8651811
##   1.000   0.15          1.052003  0.5579634     0.8059740
##   1.000   0.20          1.080827  0.5739020     0.8339824
##   1.000   0.25          1.167811  0.5686601     0.9038833
##   1.000   0.30          1.273153  0.5494010     1.0047976
##   1.000   0.35          1.352486  0.5358260     1.0799240
##   1.000   0.40          1.397587  0.5318989     1.1217186
##   1.000   0.45          1.434391  0.5288436     1.1534631
##   1.000   0.50          1.466255  0.5250303     1.1813834
##   1.000   0.55          1.487915  0.5248060     1.1993458
##   1.000   0.60          1.506477  0.5254671     1.2134520
##   1.000   0.65          1.523561  0.5256299     1.2254995
##   1.000   0.70          1.538634  0.5261531     1.2367618
##   1.000   0.75          1.552473  0.5272516     1.2485562
##   1.000   0.80          1.564770  0.5284219     1.2590385
##   1.000   0.85          1.574286  0.5302256     1.2676888
##   1.000   0.90          1.582240  0.5323442     1.2751760
##   1.000   0.95          1.589887  0.5340848     1.2822774
##   1.000   1.00          1.596572  0.5355754     1.2877347
## 
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were fraction = 0.1 and lambda = 0.1.

The optimal elastic net uses lambda (weight decay) = 0.1 and fraction = 0.10, with a resampled R-squared of 0.605:

0.100   0.10          1.016730  0.6053389     0.7970081
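
As a sketch, the resampled performance of the selected elastic net can also be pulled out with caret's getTrainPerf() helper:

# Chosen tuning parameters and their resampled performance
pf_enet$bestTune
getTrainPerf(pf_enet)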

pred_pf_enet <- predict(pf_enet, test_fingerprints)


postResample(pred_pf_enet , log(test_permeability))
##      RMSE  Rsquared       MAE 
## 1.0204958 0.5407818 0.7960049

Test-set R-squared: 0.541 (RMSE 1.020, MAE 0.796).

(f)

Would you recommend any of your models to replace the permeability laboratory experiment?

performance_metrics <- tibble(
  Model = c("Elastic Net", "LARS", "PLS"),
  RMSE = c(1.020, 1.005, 1.069),
  `R-squared` = c(0.541, 0.552, 0.536),
  MAE = c(0.796, 0.789, 0.916)
)

performance_metrics
## # A tibble: 3 × 4
##   Model        RMSE `R-squared`   MAE
##   <chr>       <dbl>       <dbl> <dbl>
## 1 Elastic Net  1.02       0.541 0.796
## 2 LARS         1.00       0.552 0.789
## 3 PLS          1.07       0.536 0.916
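
An alternative to typing the numbers in by hand is to assemble the same table directly from the three prediction vectors; a sketch (model_preds is a name introduced here):

# Compute test-set RMSE, R-squared, and MAE for each model in one pass
model_preds <- list(
  PLS = pred_pf_pse,
  LARS = pred_pf_lars,
  `Elastic Net` = pred_pf_enet
)
t(sapply(model_preds, postResample, obs = log(test_permeability)))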

The LARS model outperforms the other models on the test set, achieving the highest R-squared (0.552) and the lowest RMSE (1.005) and MAE (0.789), making it the most effective choice for predicting permeability. Even so, a test-set R-squared of roughly 0.55 means these models are better suited to screening candidate compounds than to fully replacing the laboratory permeability experiment.

The training response was log-transformed to address its heavily right-skewed distribution, which improved the models’ performance and interpretability. The test-set response was log-transformed as well, so that performance was evaluated on the same scale.
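
One practical consequence: the predictions are on the log scale, so they need to be back-transformed before being compared with raw permeability values; a brief sketch:

# Back-transform LARS predictions from the log scale to the original
# permeability scale
pred_permeability <- exp(pred_pf_lars)
summary(pred_permeability)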

6.3.

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

(a)

Start R and use these commands to load the data: > library(AppliedPredictiveModeling) > data(chemicalManufacturing) The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
# Plot Yield response
# Create a histogram of the Yield response variable
hist(ChemicalManufacturingProcess$Yield, 
     main = "Histogram of Yield Response", 
     xlab = "Yield", 
     col = "purple", 
     border = "black")

dim(ChemicalManufacturingProcess)
## [1] 176  58

(b)

A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).

# total number of missing values in the data set
sum(is.na(ChemicalManufacturingProcess))
## [1] 106

There are 106 missing values in the predictor set.
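
To see where those gaps sit, the counts can also be broken down by column; a small base-R sketch:

# Number of missing values per predictor, showing only columns with any
na_by_col <- colSums(is.na(ChemicalManufacturingProcess))
sort(na_by_col[na_by_col > 0], decreasing = TRUE)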

(c) + imputation

Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?


# impute missing values with k-nearest neighbors (knnImpute also centers and
# scales the data), then split 70/30
chem_pp <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
Chemical <- predict(chem_pp, ChemicalManufacturingProcess)
index <- createDataPartition(Chemical$Yield, p = .7, list = FALSE)

# train set
train_chem <- Chemical[index, ]

# test set
test_chem <- Chemical[-index, ]
sum(is.na(Chemical))
## [1] 0
dim(Chemical)
## [1] 176  58
Chemical <- Chemical[, -nearZeroVar(Chemical)]
dim(Chemical)
## [1] 176  57

(d)

Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

# bootstrap resampling, 25 repetitions
ctrl <- trainControl(method = "boot", number = 25)

# Tune a PLS model. Note that the full pre-processed Chemical data frame is
# supplied here; the data were already centered and scaled by knnImpute, so no
# further pre-processing is requested.
chem_plsTune <- train(
  Yield ~ ., Chemical,
  method = "pls",
  tuneLength = 20,
  trControl = ctrl,
  metric = "Rsquared"
)

# Print the model summary
print(chem_plsTune)
## Partial Least Squares 
## 
## 176 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 176, 176, 176, 176, 176, 176, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##    1     0.7420493  0.4531334  0.6030183
##    2     0.6676641  0.5548214  0.5399557
##    3     0.6588934  0.5762591  0.5395371
##    4     0.6592457  0.5804094  0.5423491
##    5     0.6659338  0.5779856  0.5447298
##    6     0.6802816  0.5684858  0.5538099
##    7     0.6911202  0.5630143  0.5618375
##    8     0.7065249  0.5520287  0.5754470
##    9     0.7147770  0.5469014  0.5828004
##   10     0.7231332  0.5413235  0.5880027
##   11     0.7394579  0.5279997  0.6004960
##   12     0.7549771  0.5165382  0.6092831
##   13     0.7726072  0.5060865  0.6161584
##   14     0.7973430  0.4918479  0.6254689
##   15     0.8253738  0.4788635  0.6363217
##   16     0.8462693  0.4695401  0.6432092
##   17     0.8668445  0.4600580  0.6490617
##   18     0.8903493  0.4493374  0.6584473
##   19     0.9181962  0.4382499  0.6679728
##   20     0.9558570  0.4249941  0.6784212
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 4.
# Plot the model tuning results
plot(chem_plsTune)

The best model uses 4 latent components, with a resampled R-squared of 0.580 (RMSE 0.659).

pred_C <- predict(chem_plsTune, test_chem[,-1])


postResample(pred_C , test_chem[,1])
##      RMSE  Rsquared       MAE 
## 0.5544073 0.7604198 0.4763473

The test-set R-squared (0.760) is higher than the resampled training estimate (0.580). Keep in mind, though, that the model above was fit on the full Chemical data frame rather than on train_chem alone, so the test rows were seen during fitting and this comparison is optimistic.

(e)

Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

chem_importance <- varImp(chem_plsTune, scale = FALSE)$importance

# Convert the importance to a data frame and add variable names
chem_impor_df <- data.frame(
  Variable = rownames(chem_importance),
  Importance = chem_importance[, 1]  # Extract importance for the response variable
)

# Select the top 10 most important variables
imp_10 <- chem_impor_df |>
  arrange(desc(Importance))|>
  head(10)

# Plot the top 10 important variables using ggplot2
ggplot(imp_10, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "purple") +
  coord_flip() +
  labs(
    title = "Top 10 Most Important Variables in PLS Model",
    x = "Variable",
    y = "Importance"
  )

The three most important predictors of the yield response are ManufacturingProcess32, ManufacturingProcess13, and ManufacturingProcess09.

The most important variables in the model are predominantly manufacturing process measurements, so the process predictors dominate the list.
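
To answer the biological-versus-process question directly, the top-10 list can be tallied by name prefix; a sketch relying on the data set's naming convention:

# Classify each top-10 predictor as Process or Biological from its name,
# then count how many fall into each group
imp_10 |>
  mutate(Type = ifelse(grepl("^Manufacturing", Variable), "Process", "Biological")) |>
  count(Type)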

(f)

Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

# Take the six most important variables
top_variables <- imp_10$Variable[1:6]

# Restore the test-set Yield on its original scale
test_chem$Yield <- ChemicalManufacturingProcess$Yield[-index]

# Pivot the test data to long format for faceted plotting
plot_data <- test_chem %>%
  pivot_longer(cols = all_of(top_variables), names_to = "Variable", values_to = "Value")


# Create a faceted plot using the correct data and mappings
ggplot(plot_data, aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.8, color = "black") +  
  geom_smooth(method = "loess", se = FALSE, color = "red")+
  geom_smooth(method = "lm", se = FALSE, color ="brown")+  
  facet_wrap(~ Variable, scales = "free") + 
  labs(
    title = "Univariate Relationships Between Top 3 Transformed Predictors and Yield",
    x = "Transformed Predictor Value",
    y = "Yield"
  ) 

The plot above shows the univariate relationship between each of the six most important predictors and yield:

While all of the predictors shown have a strong influence on yield, not all relationships are positive. Specifically, ManufacturingProcess13, ManufacturingProcess36, and ManufacturingProcess17 exhibit a negative influence on yield within our model. Understanding these negative relationships is essential for identifying potential areas for intervention and mitigation to improve overall production outcomes.

These plots suggest that adjustments and improvements to these manufacturing processes could potentially lead to increased yields.
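
A quick numeric complement to these plots is the correlation of each top predictor with yield on the test set, whose sign indicates the direction of the univariate relationship; a short sketch:

# Correlation of each top predictor with Yield in the test set
sapply(test_chem[, top_variables], cor, y = test_chem$Yield, use = "complete.obs")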