# Libraries Required
library(naniar)
library(caret)
library(ggplot2)
library(lattice)
library(VIM)
library(dplyr)
library(psych)
library(RANN)
6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
(a) Start R and use these commands to load the data:
#install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
data(permeability)
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
(b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
#Check the dimensions of the fingerprint matrix
dim(fingerprints)
## [1] 165 1107
#Identify near-zero-variance (low-frequency) predictors
Low_Freq <- nearZeroVar(fingerprints)
#Remove the low-frequency predictors
X <- fingerprints[,-Low_Freq]
#How many predictors were removed?
length(Low_Freq)
## [1] 719
719 predictors were removed from the fingerprint matrix.
#Check how many predictors are left.
dim(X)[2]
## [1] 388
388 predictors remain for modeling.
(c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?
set.seed(123)
#Train/test split: 75/25
sample_size = round(nrow(permeability)*.75)
index <- sample(seq_len(nrow(permeability)), size = sample_size)
#Partition the predictors and the response
X_train <- X[index, ]
y_train <- permeability[index]
X_test <- X[-index, ]
y_test <- permeability[-index]
Next, a PLS model is tuned. PLS works by successively extracting latent factors from both the predictors and the response such that the covariance between the extracted factors is maximized.
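For intuition, with a single response the first PLS weight vector is proportional to the covariance vector X'y of the centered data. The sketch below is an illustration only (not part of the caret pipeline) and computes that first latent component by hand:
#Minimal sketch of the first PLS component (single-response, NIPALS-style)
Xc <- scale(X_train, center = TRUE, scale = FALSE)
yc <- y_train - mean(y_train)
w <- crossprod(Xc, yc) #direction maximizing cov(Xc %*% w, yc)
w <- w / sqrt(sum(w^2)) #normalize the weight vector
t1 <- Xc %*% w #first latent score vector
cor(t1, yc) #how well a single component tracks the response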
set.seed(222)
# Tuning PLS Model
PLS_model = train(X_train, y_train,
method = "pls",
metric = "Rsquared",
tuneLength = 15,
trControl = trainControl(method = "cv"),
preProc = c('center', 'scale'))
PLS_model
## Partial Least Squares
##
## 124 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 112, 112, 112, 111, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.42331 0.3258398 10.495871
## 2 12.21633 0.4483767 8.847583
## 3 12.24679 0.4426992 9.565663
## 4 12.46028 0.4404321 9.700378
## 5 12.55833 0.4290102 9.667452
## 6 12.34241 0.4510115 9.322539
## 7 12.41485 0.4636581 9.509446
## 8 12.36658 0.4688250 9.546664
## 9 12.67371 0.4521824 9.906890
## 10 12.88520 0.4436526 10.071000
## 11 13.07763 0.4329455 10.173023
## 12 13.10246 0.4344139 10.139496
## 13 13.41113 0.4167691 10.274757
## 14 13.77575 0.3942200 10.431778
## 15 14.11923 0.3837743 10.522231
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 8.
The optimal model, chosen by the largest cross-validated Rsquared, uses 8 components, with Rsquared = 0.4688 (RMSE = 12.37).
data.frame(rsquared = PLS_model[["results"]][["Rsquared"]][as.numeric(rownames(PLS_model$bestTune))],
rmse = PLS_model[["results"]][["RMSE"]][as.numeric(rownames(PLS_model$bestTune))])
## rsquared rmse
## 1 0.468825 12.36658
plot(PLS_model, main = "Cross-Validated R-squared of the PLS Model")
(d) Predict the response for the test set. What is the test set estimate of R2?
#Predict the test set response
PLS_Pred <- predict(PLS_model, newdata=X_test)
#test set estimate of Rsquared
postResample(pred=PLS_Pred, obs=y_test)
## RMSE Rsquared MAE
## 10.9962310 0.4718432 8.3496596
On the test set the model achieves RMSE = 10.996, somewhat lower than the resampled estimate of 12.37. It accounts for only about 47.18% of the variability in the data (Rsquared = 0.4718), with a mean absolute error of 8.3497.
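As a quick visual check (output not shown), the observed test values can be plotted against the PLS predictions, with a dashed line marking perfect agreement:
#Observed vs. predicted permeability on the test set
xyplot(y_test ~ PLS_Pred, type = c("p", "g"),
xlab = "Predicted", ylab = "Observed",
panel = function(x, y) {
panel.xyplot(x, y)
panel.abline(0, 1, lty = 2) #ideal 45-degree line
})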
(e) Try building other models discussed in this chapter. Do any have better predictive performance?
Ridge Regression
# Ridge Regression
set.seed(100)
RidgeGrid <- data.frame(.lambda = seq(0, .1, length = 15))
Ridge_model = train(x = X_train, y = y_train, method = "ridge",
trControl = trainControl("cv", number = 10), metric = "Rsquared",
tuneGrid = RidgeGrid, preProc = c("center", "scale"))
## Warning: model fit failed for Fold01: lambda=0.000000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
The warning for the first fold occurs because lambda = 0 corresponds to ordinary least squares, which is ill-posed here: the predictors (388) far outnumber the training samples (124).
data.frame(rsquared = Ridge_model[["results"]][["Rsquared"]][as.numeric(rownames(Ridge_model$bestTune))],
rmse = Ridge_model[["results"]][["RMSE"]][as.numeric(rownames(Ridge_model$bestTune))])
## rsquared rmse
## 1 0.454468 13.53441
The ridge model's resampled Rsquared (0.4545) falls short of the PLS estimate.
Lasso Regression
Note that caret's glmnet method tunes both alpha and lambda by default, so the model selected below (alpha = 0.55) is strictly an elastic net rather than a pure lasso (alpha = 1).
set.seed(100)
Lasso_model = train(x = X_train, y = y_train, method = "glmnet",
trControl = trainControl("cv", number = 10), metric = "Rsquared",
preProc = c("center", "scale"))
Lasso_model$bestTune
## alpha lambda
## 6 0.55 6.379887
data.frame(rsquared = Lasso_model[["results"]][["Rsquared"]][as.numeric(rownames(Lasso_model$bestTune))],
rmse = Lasso_model[["results"]][["RMSE"]][as.numeric(rownames(Lasso_model$bestTune))])
## rsquared rmse
## 1 0.5264397 12.27158
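Its resampled Rsquared (0.526) is the best so far; as a check (output not shown), the fit can also be evaluated on the held-out test set, mirroring the PLS evaluation above:
#Test-set performance of the glmnet fit
Lasso_Pred <- predict(Lasso_model, newdata = X_test)
postResample(pred = Lasso_Pred, obs = y_test)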
Elastic Net
set.seed(100)
enetGrid <- expand.grid(.lambda = c(0, 0.01, .1), .fraction = seq(.05, 1, length = 20))
enetTune <- train(x = X_train, y = y_train,
method = "enet",
tuneGrid = enetGrid,
#centering and scaling are handled by preProc, so no center argument is needed
metric = "Rsquared",
trControl = trainControl("cv", number = 10),
preProc = c("center", "scale"))
## Warning: model fit failed for Fold01: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
#enetTune$bestTune
data.frame(rsquared = enetTune[["results"]][["Rsquared"]][as.numeric(rownames(enetTune$bestTune))],
rmse = enetTune[["results"]][["RMSE"]][as.numeric(rownames(enetTune$bestTune))])
## rsquared rmse
## 1 0.3363164 16.28092
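The cross-validated results of all four models can also be gathered side by side with caret's resamples. Note this comparison is only strictly fair when the models share identical CV folds (e.g., via the index argument of trainControl); the seeds differ here, so treat it as a rough summary (output not shown):
#Collect the resampled Rsquared values of all four models
res <- resamples(list(PLS = PLS_model, Ridge = Ridge_model,
glmnet = Lasso_model, ENet = enetTune))
summary(res)$statistics$Rsquared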
(f) Would you recommend any of your models to replace the permeability laboratory experiment?
A low RMSE indicates that the predicted and observed values are close to each other, i.e., better accuracy.
Among the resampled results, the glmnet (lasso/elastic net) model shows the lowest RMSE, 12.27, and the highest Rsquared, 0.526.
#Compare the distribution of the actual permeability values with the
#elastic net test-set predictions
rbind(actual = describe(permeability)[,-c(1,3,7)],
prediction = describe(predict(enetTune, newdata = X_test))[,-c(1,3,7)])
## n sd median trimmed min max range skew kurtosis se
## actual 165 15.58 4.91 9.32 0.06 55.60 55.54 1.40 0.56 1.21
## prediction 41 5.06 7.81 9.71 7.81 26.96 19.15 1.89 2.98 0.79
The descriptive statistics above show that the predictions do not reproduce the actual values well: the prediction range (19.15) is far narrower than the observed range (55.54). Combined with a test-set Rsquared below 0.5, none of these models is recommended to replace the permeability laboratory experiment.
6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch.
(a) Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
#summary(ChemicalManufacturingProcess)
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
(b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
There are 106 missing values.
#Visualization of missing data
vis_miss(ChemicalManufacturingProcess)
This plot visualizes the amount of missing data, showing the location of missing values in black and reporting the percentage of missing values overall (in the legend) and in each variable.
Next, check the proportion of missing values in each variable that needs imputation to make the set complete.
na.counts = as.data.frame(((sapply(ChemicalManufacturingProcess,
function(x) sum(is.na(x))))/ nrow(ChemicalManufacturingProcess))*100)
names(na.counts) = "counts"
na.counts = cbind(variables = rownames(na.counts), data.frame(na.counts, row.names = NULL))
na.counts[na.counts$counts > 0,] %>% arrange(counts) %>% mutate(name = factor(variables, levels = variables)) %>%
ggplot(aes(x = name, y = counts)) + geom_segment( aes(xend = name, yend = 0)) +
geom_point(size = 4, color = "steelblue2") + coord_flip() + theme_bw() +
labs(title = "Proportion of Missing Data", x = "Variables", y = "% of Missing data") +
scale_y_continuous(labels = scales::percent_format(scale = 1))
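naniar (already loaded) can produce the same per-variable summary directly, a compact tabular alternative to the hand-rolled plot above (output not shown):
#Tabular per-variable missingness summary
miss_var_summary(ChemicalManufacturingProcess) %>% filter(n_miss > 0)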
set.seed(123)
#Impute missing values
Imputed_Data <- preProcess(ChemicalManufacturingProcess[, -c(1)],method = "knnImpute")
#NOTE: preProcess will center and scale the data by default as well
Pred_Data <- predict(Imputed_Data, ChemicalManufacturingProcess[, -c(1)])
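Since VIM is also loaded, an alternative is VIM's kNN imputation, which fills values in their original units rather than the centered-and-scaled scale that preProcess uses; a minimal sketch, not used downstream:
#Alternative kNN imputation with VIM; imp_var = FALSE suppresses the
#extra indicator columns VIM normally appends
VIM_Imputed <- kNN(ChemicalManufacturingProcess[, -1], k = 5, imp_var = FALSE)
sum(is.na(VIM_Imputed)) #should be 0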
(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
To build a smaller model, it is best to reduce the number of predictors by removing those with extremely high pairwise correlations, so that no absolute pairwise correlation exceeds 0.90. First, inspect the most highly correlated pairs (displayed here with a 0.85 threshold):
corr = cor(Pred_Data)
corr[corr == 1] = NA
corr[abs(corr) < 0.85] = NA
corr = na.omit(reshape::melt(corr))
head(corr[order(-abs(corr$value)),], 10)
## X1 X2 value
## 2090 ManufacturingProcess26 ManufacturingProcess25 0.9975339
## 2146 ManufacturingProcess25 ManufacturingProcess26 0.9975339
## 2148 ManufacturingProcess27 ManufacturingProcess26 0.9960721
## 2204 ManufacturingProcess26 ManufacturingProcess27 0.9960721
## 2091 ManufacturingProcess27 ManufacturingProcess25 0.9934932
## 2203 ManufacturingProcess25 ManufacturingProcess27 0.9934932
## 1685 ManufacturingProcess20 ManufacturingProcess18 0.9917474
## 1797 ManufacturingProcess18 ManufacturingProcess20 0.9917474
## 2095 ManufacturingProcess31 ManufacturingProcess25 0.9706780
## 2431 ManufacturingProcess25 ManufacturingProcess31 0.9706780
The list above shows the ten rows with the largest absolute correlations; each pair appears twice because the melted correlation matrix is symmetric. These manufacturing process predictors are almost perfectly correlated, with coefficients as high as 0.9975.
#Drop predictors with absolute pairwise correlations above 0.90
tooHigh = findCorrelation(cor(Pred_Data), 0.90)
Pred_Data = Pred_Data[, -tooHigh]
#Yeo-Johnson transform, center, and scale the remaining predictors
(Trans_Data = preProcess(Pred_Data, method = c("YeoJohnson", "center", "scale")))
## Created from 176 samples and 47 variables
##
## Pre-processing:
## - centered (47)
## - ignored (0)
## - scaled (47)
## - Yeo-Johnson transformation (41)
##
## Lambda estimates for Yeo-Johnson transformation:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.8404 0.7085 0.8795 0.9698 1.2484 2.7992
Pred_Data = predict(Trans_Data, Pred_Data)
set.seed(333)
Index = createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.8, list = FALSE)
Train.p = Pred_Data[Index, ]
Train.r = ChemicalManufacturingProcess$Yield[Index]
Test.p = Pred_Data[-Index, ]
Test.r = ChemicalManufacturingProcess$Yield[-Index]
# Elastic Net Regression
elastic.model = train(x = Train.p, y = Train.r, method = "glmnet",
trControl = trainControl("cv", number = 10),
tuneLength = 10, metric = "Rsquared"
)
elastic.model$bestTune
## alpha lambda
## 98 1 0.1842246
ggplot(elastic.model) + labs(title = "Cross-Validated R-squared of the glmnet Model") + theme(legend.position = "top")
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 10. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 40 rows containing missing values (geom_point).
(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
#Predict the test set response
Test_pred <- predict(elastic.model, as.data.frame(Test.p))
xyplot(Test.r ~ Test_pred, type = c("p", "g"),
main = "Predicted vs Observed",
xlab = "Predicted",
ylab = "Observed",
panel = function(x, y) {
panel.xyplot(x, y)
panel.abline(lm(y ~ x))
})
postResample(Test_pred, Test.r)
## RMSE Rsquared MAE
## 1.2353871 0.4826968 0.9720362
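To answer the comparison part of (d), the resampled estimate at the chosen tuning parameters can be pulled from the results table (output not shown):
#Resampled (10-fold CV) performance at bestTune, for comparison with
#the test-set numbers above
subset(elastic.model$results,
alpha == elastic.model$bestTune$alpha &
lambda == elastic.model$bestTune$lambda)[, c("RMSE", "Rsquared", "MAE")]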
rbind(actual = describe(ChemicalManufacturingProcess$Yield)[,-c(1,6,7)],
prediction = describe(Test_pred)[,-c(1,6,7)])
## n mean sd median min max range skew kurtosis se
## actual 176 40.18 1.85 39.97 35.25 46.34 11.09 0.31 -0.11 0.14
## prediction 32 40.28 1.42 40.21 38.27 43.81 5.54 0.57 -0.48 0.25
(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
For a glmnet fit, caret::varImp ranks the predictors by the absolute value of their regression coefficients at the selected value of lambda, scaled so that the largest equals 100; coefficients shrunk to zero by the penalty receive an importance of zero.
varImp(elastic.model)
## glmnet variable importance
##
## only 20 most important variables shown (out of 47)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess13 33.322
## ManufacturingProcess06 29.071
## ManufacturingProcess17 25.213
## ManufacturingProcess09 14.515
## BiologicalMaterial11 8.683
## ManufacturingProcess34 7.767
## ManufacturingProcess39 6.192
## ManufacturingProcess30 1.886
## BiologicalMaterial08 0.000
## ManufacturingProcess21 0.000
## ManufacturingProcess19 0.000
## ManufacturingProcess43 0.000
## ManufacturingProcess35 0.000
## ManufacturingProcess12 0.000
## ManufacturingProcess08 0.000
## ManufacturingProcess02 0.000
## ManufacturingProcess04 0.000
## BiologicalMaterial01 0.000
## ManufacturingProcess03 0.000
The 20 most important variables (out of 47) are ranked above. Several coefficients are shrunk to zero, the manufacturing process predictors clearly dominate the list, and ManufacturingProcess32 contributes the most.
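The dominance of the process predictors can be confirmed with a quick count over the top 20 (output not shown):
#Count process vs. biological predictors among the top 20
imp <- varImp(elastic.model)$importance
top20 <- rownames(imp)[order(-imp$Overall)][1:20]
table(ifelse(grepl("^Manufacturing", top20), "Process", "Biological"))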
(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
cor(ChemicalManufacturingProcess$Yield,
ChemicalManufacturingProcess$ManufacturingProcess32)
## [1] 0.6083321
cor(ChemicalManufacturingProcess$Yield,
ChemicalManufacturingProcess$ManufacturingProcess13)
## [1] -0.5036797
ManufacturingProcess32 has the strongest positive correlation with yield (0.61), while ManufacturingProcess13 is negatively correlated (-0.50). Because the process predictors can be adjusted, running future batches at higher settings of ManufacturingProcess32 and lower settings of ManufacturingProcess13 could help improve yield.
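These relationships can be visualized with caret's featurePlot (output not shown):
#Scatterplots of the two strongest predictors against yield
featurePlot(x = ChemicalManufacturingProcess[, c("ManufacturingProcess32",
"ManufacturingProcess13")],
y = ChemicalManufacturingProcess$Yield,
plot = "scatter", type = c("p", "smooth"))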