Load libraries

library(tidyverse)
library(fpp3)
library(caret)
library(RANN)
## Warning: package 'RANN' was built under R version 4.4.1
library(corrplot)

#Exercise 6.2.

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

  1. Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
data(permeability)

dim(fingerprints) # 165 compounds by 1107 predictors
## [1]  165 1107
dim(permeability) # 165 by 1
## [1] 165   1

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains permeability response.

  1. The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

There are 388 predictors left in the modeling.

#find near-zero variance predictors. Identifies variables with little or no variance in dataset. 
near_zero <- nearZeroVar(fingerprints)

# Remove near zero variance predictors
remove <- fingerprints[, -near_zero]

# of predictors left
predictor_left <- ncol(remove)
predictor_left
## [1] 388
  1. Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

The optional latent variable is 6 and the r-squared is .5310320.

# Split data into training and test sets splitting it up by 80% and 20%
set.seed(123)  
trainindex <- createDataPartition(permeability, p = 0.8, list = FALSE)

#create training set
train_perm <- permeability[trainindex,]  # train response
train_fig <- remove[trainindex,]        # train predictor

#create testing set
test_perm <- permeability[-trainindex,] # test response
test_fig <- remove[-trainindex,]      #test predictor

# Preprocess the data to normalized or standardized data
pre_process <- preProcess(train_fig, method = c("center", "scale"))
train_fig <- predict(pre_process, train_fig)
test_fig <- predict(pre_process, test_fig)

# Tune PLS model 
plsTune <- train(train_fig, train_perm, method = "pls", tuneLength = 20, trControl = trainControl(method = "cv"))

plot(plsTune)

plsTune
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.36940  0.3397706  10.315252
##    2     11.78929  0.4816147   8.575238
##    3     11.90097  0.4849485   9.105076
##    4     12.00362  0.4962113   9.407429
##    5     11.76339  0.5219280   9.024291
##    6     11.55813  0.5310320   8.646010
##    7     11.56404  0.5294165   8.761596
##    8     11.83939  0.5159611   9.222387
##    9     11.97478  0.5174448   9.199224
##   10     12.57694  0.4792439   9.678215
##   11     12.73620  0.4725734   9.714152
##   12     13.03145  0.4514959   9.968887
##   13     13.06366  0.4396748   9.834200
##   14     13.38089  0.4168034  10.057344
##   15     13.60993  0.4011096  10.200724
##   16     13.74283  0.3964039  10.201961
##   17     13.97149  0.3844652  10.412445
##   18     14.21149  0.3695810  10.612168
##   19     14.23455  0.3709200  10.593149
##   20     14.31085  0.3732814  10.675872
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
  1. Predict the response for the test set. What is the test set estimate of R2?

The test set estimate of R2 is 0.3244542.

# predict the response for test set
preds <- predict(plsTune, test_fig)
head(preds)
## [1]  3.747929 19.035835  5.102938  4.498555  3.572688  1.200594
# estimate of R2
results <- data.frame(obs = test_perm, pred = preds)
defaultSummary(results)
##       RMSE   Rsquared        MAE 
## 12.3486900  0.3244542  8.2881075
  1. Try building other models discussed in this chapter. Do any have better predictive performance?

The lars model has the highest rsquared and the lowest MAE and RSME.

# PCR model (Principal Component Regression)
pcr_model <- train(train_fig, train_perm, method = "pcr", tuneLength = 20, trControl = trainControl(method = "cv", number = 20))

plot(pcr_model)

pcr_model
## Principal Component Analysis 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 126, 126, 129, 125, 126, 126, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     15.12855  0.2388250  11.990303
##    2     15.16469  0.2298260  12.100808
##    3     14.03954  0.4338455  11.129120
##    4     13.81125  0.4296606  11.083226
##    5     11.99261  0.5429695   8.881204
##    6     12.14973  0.5252840   9.001258
##    7     12.25458  0.5095772   9.099907
##    8     11.94226  0.5400248   8.868113
##    9     11.85584  0.5583869   8.883140
##   10     11.84765  0.5549366   9.082185
##   11     11.69525  0.5697105   8.791421
##   12     11.86217  0.5613899   9.001192
##   13     11.83871  0.5628608   8.865340
##   14     11.82985  0.5645557   8.850505
##   15     11.97295  0.5548194   9.068947
##   16     11.98592  0.5622066   9.184922
##   17     11.97040  0.5485688   9.063915
##   18     11.94176  0.5460955   9.120031
##   19     11.75008  0.5453340   8.903375
##   20     11.83378  0.5339776   9.022563
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 11.
#The optional latent variable is 11 and the r-squared is  0.5724243.

# predict the response for test set
preds2 <- predict(pcr_model, test_fig)
head(preds2)
## [1] -0.6525311 14.5857230 -2.9854313 10.7756507  4.2846161  7.5407337
# estimate of R2
results2 <- data.frame(obs = test_perm, pred = preds2)
defaultSummary(results2)
##       RMSE   Rsquared        MAE 
## 12.2076841  0.2922108  8.2251123
#The test set estimate of R2 is 0.2922108
# LAR model (Least Angle Regression model)
lar_model <- train(train_fig,train_perm ,method = "lars",metric = "Rsquared",tuneLength = 20, trControl = trainControl(method = "cv", number = 20))

plot(lar_model)

lar_model
## Least Angle Regression 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (20 fold) 
## Summary of sample sizes: 127, 128, 127, 128, 126, 126, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   MAE      
##   0.05      11.27255  0.5424020   8.586401
##   0.10      11.16726  0.5657316   8.307219
##   0.15      11.39324  0.5576667   8.588468
##   0.20      11.99930  0.5251099   9.047357
##   0.25      12.29932  0.5068996   9.202691
##   0.30      12.57987  0.4942214   9.395214
##   0.35      12.82884  0.4842143   9.606889
##   0.40      13.29239  0.4634959   9.926427
##   0.45      13.84808  0.4454349  10.283586
##   0.50      14.52629  0.4251082  10.732905
##   0.55      15.10370  0.4026966  11.091553
##   0.60      15.80534  0.3830601  11.551521
##   0.65      16.66899  0.3590691  12.039681
##   0.70      17.49872  0.3450479  12.586979
##   0.75      18.16186  0.3398105  12.947754
##   0.80      18.94232  0.3317843  13.378403
##   0.85      19.55980  0.3283266  13.749623
##   0.90      20.29479  0.3326077  14.283189
##   0.95      21.13430  0.3325196  14.904031
##   1.00      21.98077  0.3296205  15.403904
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was fraction = 0.1.
#The optional latent variable is .1 and the rsquared is 0.5926813.

# predict the response for test set
lar2 <- predict(lar_model, test_fig)
head(lar2)
##           2           3           6          10          12          17 
##  5.38869507 16.98476482 -0.04816209  3.66560248  6.11701658  0.27608381
# estimate of R2
results3 <- data.frame(obs = test_perm, pred = lar2)
defaultSummary(results3)
##       RMSE   Rsquared        MAE 
## 11.0018629  0.3887295  7.3843906
#The test set estimate of R2 is 0.3887295.
  1. Would you recommend any of your models to replace the permeability laboratory experiment?

I would recommend using the lars model to replace the permeability laboratory experiment since it’s better in it’s predictive due to higher rsquared and lower RMSE and MAE.

Exercise 6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors),6.5 Computing 139 measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch:

Start R and use these commands to load the data:

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

dim(ChemicalManufacturingProcess) #176 manufacturing runs by 58 predictors
## [1] 176  58

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

  1. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
#separate out predictor and response
predictor <- ChemicalManufacturingProcess[,-1]
response <- ChemicalManufacturingProcess$Yield

#find missing value
sum(is.na(predictor))
## [1] 106
# imputation and pre-process
pre_process <- preProcess(predictor, method = c("center", "scale", "knnImpute"))
pred_che <- predict(pre_process, predictor)

# any missing data left
sum(is.na(pred_che))
## [1] 0
  1. Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

The optimal value of the performance is 5 with rsquared at 0.6556819.

#find near-zero variance predictors. Identifies variables with little or no variance in dataset. 
near_zero2 <- nearZeroVar(pred_che)

# Remove near zero variance predictors
remove2 <- pred_che[, -near_zero2]

# of predictors left
predictor_left2 <- ncol(remove2)
predictor_left2
## [1] 56
# Split data into training and test sets splitting it up by 80% and 20%
set.seed(123)  
trainindex2 <- createDataPartition(response, p = 0.8, list = FALSE)

#create training set
train_rep <- response[trainindex2]  # train response
train_pre <- remove2[trainindex2,]                # train predictor

#create testing set
test_rep <- response[-trainindex2] # test response
test_pre <- remove2[-trainindex2,]                 # Test predictor

# Preprocess the data to normalized or standardized data
pre_process2 <- preProcess(train_pre, method = c("center", "scale"))
train_pre <- predict(pre_process2, train_pre)
test_pre <- predict(pre_process2, test_pre)

# Tune PLS model 
plsTune2 <- train(train_pre, train_rep, method = "pls", tuneLength = 20, trControl = trainControl(method = "cv")) # cross-validation

plot(plsTune2)

plsTune2
## Partial Least Squares 
## 
## 144 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 131, 130, 130, 129, 131, 129, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     1.366841  0.4661172  1.1216565
##    2     1.161328  0.5939247  0.9411430
##    3     1.110515  0.6425731  0.9170970
##    4     1.107444  0.6576468  0.9172662
##    5     1.104609  0.6556819  0.9081022
##    6     1.110936  0.6462348  0.9127275
##    7     1.136026  0.6313198  0.9408114
##    8     1.171716  0.6164293  0.9702732
##    9     1.219901  0.5965118  1.0081905
##   10     1.263800  0.5834937  1.0357096
##   11     1.336965  0.5631760  1.0788515
##   12     1.429478  0.5407312  1.1327482
##   13     1.524420  0.5195852  1.1787326
##   14     1.582126  0.5067425  1.2092136
##   15     1.627323  0.5016396  1.2284634
##   16     1.675593  0.5017653  1.2422633
##   17     1.694754  0.5057723  1.2389459
##   18     1.736670  0.5027049  1.2522984
##   19     1.785086  0.5002747  1.2663508
##   20     1.848662  0.4963180  1.2918495
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 5.
  1. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

The test rsquared is .4538408 which is lower than the training rsquared of .6557 which indicates there might be overfitting in the training data and may need to be adjusted for better forecasting.

# predict the response for test set
prediction <- predict(plsTune2, test_pre)

# estimate of R2
res <- data.frame(obs = test_rep, pred = prediction)
defaultSummary(res)
##      RMSE  Rsquared       MAE 
## 1.4230432 0.4538408 1.2393205
  1. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

The most important predictors in the model are ManufacturingProcess32. The process predictors dominate the list than the biological since the ManufacturingProcess32 was most important predictor.

# Extract variable importance
importance <- varImp(plsTune2, scale = FALSE)
## Warning: package 'pls' was built under R version 4.4.1
## 
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
# Extract the importance as a data frame
importance_df <- as.data.frame(varImp(plsTune2, scale = FALSE)$importance)
importance_df$Predictor <- rownames(importance_df)  # Add predictor names as a column

# Sort by importance in descending order
sort_imp <- importance_df[order(importance_df$Overall, decreasing = TRUE), ]

# Print importance
print(sort_imp)
##                            Overall              Predictor
## ManufacturingProcess32 0.116973335 ManufacturingProcess32
## ManufacturingProcess17 0.102170282 ManufacturingProcess17
## ManufacturingProcess09 0.100497915 ManufacturingProcess09
## ManufacturingProcess13 0.100059387 ManufacturingProcess13
## ManufacturingProcess36 0.097984752 ManufacturingProcess36
## ManufacturingProcess06 0.082660964 ManufacturingProcess06
## ManufacturingProcess33 0.076345540 ManufacturingProcess33
## BiologicalMaterial06   0.074229074   BiologicalMaterial06
## BiologicalMaterial03   0.073790055   BiologicalMaterial03
## BiologicalMaterial08   0.071754007   BiologicalMaterial08
## BiologicalMaterial02   0.070893975   BiologicalMaterial02
## ManufacturingProcess11 0.070747932 ManufacturingProcess11
## BiologicalMaterial12   0.066182653   BiologicalMaterial12
## BiologicalMaterial11   0.066160876   BiologicalMaterial11
## BiologicalMaterial01   0.062830275   BiologicalMaterial01
## ManufacturingProcess28 0.061528542 ManufacturingProcess28
## BiologicalMaterial04   0.060774883   BiologicalMaterial04
## ManufacturingProcess12 0.057706356 ManufacturingProcess12
## ManufacturingProcess37 0.057426715 ManufacturingProcess37
## ManufacturingProcess04 0.055723240 ManufacturingProcess04
## BiologicalMaterial10   0.052673573   BiologicalMaterial10
## ManufacturingProcess02 0.051307150 ManufacturingProcess02
## ManufacturingProcess15 0.047536679 ManufacturingProcess15
## ManufacturingProcess34 0.044655263 ManufacturingProcess34
## ManufacturingProcess30 0.044623508 ManufacturingProcess30
## ManufacturingProcess19 0.041132346 ManufacturingProcess19
## ManufacturingProcess44 0.039685105 ManufacturingProcess44
## ManufacturingProcess10 0.037959336 ManufacturingProcess10
## BiologicalMaterial05   0.034849211   BiologicalMaterial05
## ManufacturingProcess43 0.034435154 ManufacturingProcess43
## ManufacturingProcess45 0.033768398 ManufacturingProcess45
## BiologicalMaterial09   0.031799513   BiologicalMaterial09
## ManufacturingProcess29 0.029839098 ManufacturingProcess29
## ManufacturingProcess21 0.029762493 ManufacturingProcess21
## ManufacturingProcess39 0.029012879 ManufacturingProcess39
## ManufacturingProcess01 0.027974595 ManufacturingProcess01
## ManufacturingProcess05 0.027626352 ManufacturingProcess05
## ManufacturingProcess24 0.025562425 ManufacturingProcess24
## ManufacturingProcess35 0.025222364 ManufacturingProcess35
## ManufacturingProcess42 0.022901533 ManufacturingProcess42
## ManufacturingProcess23 0.020232471 ManufacturingProcess23
## ManufacturingProcess31 0.020162875 ManufacturingProcess31
## ManufacturingProcess18 0.018575777 ManufacturingProcess18
## ManufacturingProcess20 0.018423762 ManufacturingProcess20
## ManufacturingProcess14 0.017317646 ManufacturingProcess14
## ManufacturingProcess38 0.015726649 ManufacturingProcess38
## ManufacturingProcess26 0.014466832 ManufacturingProcess26
## ManufacturingProcess41 0.013560462 ManufacturingProcess41
## ManufacturingProcess25 0.012121834 ManufacturingProcess25
## ManufacturingProcess27 0.011725872 ManufacturingProcess27
## ManufacturingProcess40 0.011691928 ManufacturingProcess40
## ManufacturingProcess08 0.010747792 ManufacturingProcess08
## ManufacturingProcess03 0.010719672 ManufacturingProcess03
## ManufacturingProcess22 0.009317751 ManufacturingProcess22
## ManufacturingProcess07 0.007629938 ManufacturingProcess07
## ManufacturingProcess16 0.005913221 ManufacturingProcess16
#print name of top importance
names <-head(sort_imp$Predictor,10)
print(names)
##  [1] "ManufacturingProcess32" "ManufacturingProcess17" "ManufacturingProcess09"
##  [4] "ManufacturingProcess13" "ManufacturingProcess36" "ManufacturingProcess06"
##  [7] "ManufacturingProcess33" "BiologicalMaterial06"   "BiologicalMaterial03"  
## [10] "BiologicalMaterial08"
  1. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

The information is helpful to let us know the relationship between the predictor and the response. In the matrix the ManufacturingProcess32 has the highest correlation with yield. It allows us to find out which predictor is the best to optimize the yield.

# select the top predictor from all the predictors
top <- select(predictor, all_of(names))
# extract yield 
top$Yield <- ChemicalManufacturingProcess$Yield

#correlation data
correlation_data <- cor(select_if(top, is.numeric), use = "complete.obs")

# correlation Plot
corrplot::corrplot(correlation_data, method = 'number', type = 'upper', number.cex = .70, tl.cex= .70)