library(AppliedPredictiveModeling)
library(mice) # impute NAs
library(visdat) # missing value
library(e1071)
library(caret)
library(dplyr)
library(tidyr)
library(mlbench) # chp 7 and 8
library(kableExtra)
library(randomForest)
library(party)
library(gbm)
library(Cubist)
library(rpart)
library(RANN)
library(pls)
library(earth)
library(kernlab)
data("ChemicalManufacturingProcess")
Pharma_data <- ChemicalManufacturingProcess
str(Pharma_data)
## 'data.frame': 176 obs. of 58 variables:
## $ Yield : num 38 42.4 42 41.4 42.5 ...
## $ BiologicalMaterial01 : num 6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
## $ BiologicalMaterial02 : num 49.6 61 61 61 63.3 ...
## $ BiologicalMaterial03 : num 57 67.5 67.5 67.5 72.2 ...
## $ BiologicalMaterial04 : num 12.7 14.7 14.7 14.7 14 ...
## $ BiologicalMaterial05 : num 19.5 19.4 19.4 19.4 17.9 ...
## $ BiologicalMaterial06 : num 43.7 53.1 53.1 53.1 54.7 ...
## $ BiologicalMaterial07 : num 100 100 100 100 100 100 100 100 100 100 ...
## $ BiologicalMaterial08 : num 16.7 19 19 19 18.2 ...
## $ BiologicalMaterial09 : num 11.4 12.6 12.6 12.6 12.8 ...
## $ BiologicalMaterial10 : num 3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
## $ BiologicalMaterial11 : num 138 154 154 154 148 ...
## $ BiologicalMaterial12 : num 18.8 21.1 21.1 21.1 21.1 ...
## $ ManufacturingProcess01: num NA 0 0 0 10.7 12 11.5 12 12 12 ...
## $ ManufacturingProcess02: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess03: num NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
## $ ManufacturingProcess04: num NA 917 912 911 918 924 933 929 928 938 ...
## $ ManufacturingProcess05: num NA 1032 1004 1015 1028 ...
## $ ManufacturingProcess06: num NA 210 207 213 206 ...
## $ ManufacturingProcess07: num NA 177 178 177 178 178 177 178 177 177 ...
## $ ManufacturingProcess08: num NA 178 178 177 178 178 178 178 177 177 ...
## $ ManufacturingProcess09: num 43 46.6 45.1 44.9 45 ...
## $ ManufacturingProcess10: num NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
## $ ManufacturingProcess11: num NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
## $ ManufacturingProcess12: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess13: num 35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
## $ ManufacturingProcess14: num 4898 4869 4878 4897 4992 ...
## $ ManufacturingProcess15: num 6108 6095 6087 6102 6233 ...
## $ ManufacturingProcess16: num 4682 4617 4617 4635 4733 ...
## $ ManufacturingProcess17: num 35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
## $ ManufacturingProcess18: num 4865 4867 4877 4872 4886 ...
## $ ManufacturingProcess19: num 6049 6097 6078 6073 6102 ...
## $ ManufacturingProcess20: num 4665 4621 4621 4611 4659 ...
## $ ManufacturingProcess21: num 0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
## $ ManufacturingProcess22: num NA 3 4 5 8 9 1 2 3 4 ...
## $ ManufacturingProcess23: num NA 0 1 2 4 1 1 2 3 1 ...
## $ ManufacturingProcess24: num NA 3 4 5 18 1 1 2 3 4 ...
## $ ManufacturingProcess25: num 4873 4869 4897 4892 4930 ...
## $ ManufacturingProcess26: num 6074 6107 6116 6111 6151 ...
## $ ManufacturingProcess27: num 4685 4630 4637 4630 4684 ...
## $ ManufacturingProcess28: num 10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
## $ ManufacturingProcess29: num 21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
## $ ManufacturingProcess30: num 9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
## $ ManufacturingProcess31: num 69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
## $ ManufacturingProcess32: num 156 169 173 171 171 173 159 161 160 164 ...
## $ ManufacturingProcess33: num 66 66 66 68 70 70 65 65 65 66 ...
## $ ManufacturingProcess34: num 2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ ManufacturingProcess35: num 486 508 509 496 468 490 475 478 491 488 ...
## $ ManufacturingProcess36: num 0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
## $ ManufacturingProcess37: num 0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
## $ ManufacturingProcess38: num 3 2 2 2 2 2 2 2 3 3 ...
## $ ManufacturingProcess39: num 7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
## $ ManufacturingProcess40: num NA 0.1 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess41: num NA 0.15 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess42: num 11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
## $ ManufacturingProcess43: num 3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
## $ ManufacturingProcess44: num 1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
## $ ManufacturingProcess45: num 2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
visdat::vis_miss(Pharma_data, sort_miss = TRUE)
# Box-Cox transform, kNN-impute missing values, then center and scale all predictors (Yield excluded)
prepro_1 <- preProcess(subset(Pharma_data, select = -Yield), method = c("BoxCox", "knnImpute", "center", "scale"))
pharma_imput <- predict(prepro_1, Pharma_data[, -1])
visdat::vis_miss(pharma_imput)
# Re-attach the untransformed response
pharma_imput$Yield <- Pharma_data$Yield
# Remove highly correlated predictors. Note that Yield is included in the
# correlation matrix here; it survives the 0.90 cutoff, so the result is
# unaffected, but excluding the response before filtering is safer in general.
cor_matrix <- cor(pharma_imput)
high_cor <- findCorrelation(cor_matrix, cutoff = 0.90)
if (length(high_cor) > 0) pharma_imput <- pharma_imput[, -high_cor]
# Remove near-zero variance predictors (guard the empty-index case, which
# would otherwise drop every column)
nzv <- nearZeroVar(pharma_imput)
if (length(nzv) > 0) pharma_imput <- pharma_imput[, -nzv]
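As a quick sanity check, a minimal sketch reporting what the two filters removed (using the high_cor and nzv indices created above):
# Report how many predictors each filter dropped and how many columns remain
cat("Dropped by correlation filter:", length(high_cor), "\n")
cat("Dropped by near-zero variance filter:", length(nzv), "\n")
cat("Columns remaining (including Yield):", ncol(pharma_imput), "\n")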
# Split the data into a training and a test set
set.seed(123)
trainRows <- createDataPartition(pharma_imput$Yield, p = .80, list = FALSE)
training <- pharma_imput[trainRows,]
test <- pharma_imput[-trainRows,]
# Train a PLS model
set.seed(123)
ctrl <- trainControl(method = "cv", number = 5)
pslfit <- train(
x = training[, -ncol(training)],
y = training$Yield,
method = "pls",
tuneLength = 20,
trControl = ctrl
)
print(pslfit)
## Partial Least Squares
##
## 144 samples
## 47 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 116, 116, 115, 116, 113
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.349780 0.4804392 1.1046642
## 2 1.175328 0.5889834 0.9475752
## 3 1.168439 0.6150759 0.9557336
## 4 1.176341 0.6176024 0.9600506
## 5 1.184559 0.6139388 0.9617742
## 6 1.205142 0.6080452 0.9796574
## 7 1.217894 0.6049307 0.9939552
## 8 1.259650 0.5882543 1.0269892
## 9 1.291723 0.5789170 1.0489907
## 10 1.317733 0.5657914 1.0676808
## 11 1.354709 0.5520161 1.0860524
## 12 1.394908 0.5360526 1.1197126
## 13 1.422760 0.5266737 1.1345927
## 14 1.473333 0.5128033 1.1583224
## 15 1.502401 0.5067104 1.1704796
## 16 1.514326 0.5064079 1.1748180
## 17 1.523976 0.5039993 1.1756099
## 18 1.535185 0.5019438 1.1802772
## 19 1.559650 0.4959072 1.1873298
## 20 1.581071 0.4913535 1.1900139
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plot(pslfit)
set.seed(123)
# Predict on the test set; predict.train applies the final tuned model
# (ncomp = 3), so no ncomp argument is needed
pred_test <- predict(pslfit, newdata = test)
# Performance metrics for test set
test_perf <- postResample(pred_test, test$Yield)
# Performance metrics for training set
train_perf <- getTrainPerf(pslfit)
# Compare training and test performance
perf_df <- data.frame(c(train_perf[1:3], test_perf[1:3]))
names(perf_df) <- c("Training RMSE", "Training Rsquared", "Training MAE", "Test RMSE", "Test Rsquared", "Test MAE")
print(perf_df)
## Training RMSE Training Rsquared Training MAE Test RMSE Test Rsquared Test MAE
## 1 1.168439 0.6150759 0.9557336 1.342466 0.4876693 1.14565
# Plot training-set residuals of the final PLS model
plot(training$Yield - predict(pslfit, newdata = training), ylab = "Residual")
abline(h = 0)
# Find important predictors
plot(caret::varImp(pslfit), top = 20)
# Important predictors
imp_predictors <- c('ManufacturingProcess32', 'ManufacturingProcess13', 'ManufacturingProcess09', 'ManufacturingProcess17', 'ManufacturingProcess36', 'Yield')
# Plot relationships
pharma_imput %>%
select(all_of(imp_predictors)) %>%
gather(var, val, -Yield) %>%
ggplot(aes(x = val, y = Yield, color = var)) +
geom_point() +
facet_wrap(~var)
Simulating Data:
set.seed(123)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
set.seed(123)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Visualize the training data
featurePlot(trainingData$x, trainingData$y)
KNN
set.seed(123)
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn", preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.360145 0.5186277 2.691602
## 7 3.200822 0.5652109 2.559310
## 9 3.146979 0.5885721 2.517311
## 11 3.115995 0.6038019 2.480000
## 13 3.114977 0.6142479 2.488102
## 15 3.102224 0.6306039 2.479184
## 17 3.103956 0.6407442 2.483608
## 19 3.114121 0.6439695 2.485436
## 21 3.140855 0.6459751 2.510892
## 23 3.158940 0.6507325 2.529683
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = testData$x)
knn_results <- postResample(pred = knnPred, obs = testData$y)
print(knn_results)
## RMSE Rsquared MAE
## 3.2421805 0.6536034 2.5899776
MARS
set.seed(123)
mars_grid <- expand.grid(.degree = 1:2, .nprune = 2:38)
mars_model <- train(
x = trainingData$x,
y = trainingData$y,
method = "earth",
preProcess = c("center", "scale"),
tuneGrid = mars_grid,
trControl = trainControl(method = "cv")
)
mars_predict <- predict(mars_model, newdata = testData$x)
mars_results <- postResample(pred = mars_predict, obs = testData$y)
print(mars_results)
## RMSE Rsquared MAE
## 1.2193997 0.9392656 0.9587801
plot(varImp(mars_model))
SVM
set.seed(123)
svm_model <- train(x = trainingData$x, y = trainingData$y, method = "svmRadial", preProcess = c("center", "scale"), tuneLength = 20)
svm_predict <- predict(svm_model, newdata = testData$x)
svm_results <- postResample(pred = svm_predict, obs = testData$y)
print(svm_results)
## RMSE Rsquared MAE
## 2.1679276 0.8083005 1.6333459
Neural Net
set.seed(123)
nnet_grid <- expand.grid(.decay = c(0, 0.01, 0.1), .size = c(1:10), .bag = FALSE)
ctrl <- trainControl(method = "cv")
nnet_model <- train(
x = trainingData$x,
y = trainingData$y,
method = "avNNet",
tuneGrid = nnet_grid,
trControl = ctrl,
preProcess = c("center", "scale"),
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1
)
nnet_predict <- predict(nnet_model, newdata = testData$x)
nnet_results <- postResample(pred = nnet_predict, obs = testData$y)
print(nnet_results)
## RMSE Rsquared MAE
## 2.2330230 0.7981146 1.6782098
Model Performance Comparison
model_comparison <- data.frame(rbind(knn_results, mars_results, svm_results, nnet_results))
rownames(model_comparison) <- c("KNN", "MARS", "SVM", "Neural Network")
model_comparison <- model_comparison[order(model_comparison$RMSE), ]
print(model_comparison)
## RMSE Rsquared MAE
## MARS 1.219400 0.9392656 0.9587801
## SVM 2.167928 0.8083005 1.6333459
## Neural Network 2.233023 0.7981146 1.6782098
## KNN 3.242180 0.6536034 2.5899776
The MARS model clearly outperforms the other models on the simulated test set (RMSE 1.22, versus 2.17 for the next-best SVM), and its importance plot shows that it keys on the informative predictors.
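Since the informative signal in the Friedman benchmark lives in X1-X5 only, we can check directly which predictors the pruned model retained; a small sketch using caret accessors on the fitted mars_model:
# Predictors retained in the final pruned MARS model
predictors(mars_model)
# Tuning values selected by resampling
mars_model$bestTune
Returning to the chemical manufacturing data, we now fit the same four nonlinear models, starting with KNN.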
set.seed(123)
knnModel <- train(x = training[, -ncol(training)], y = training$Yield, method = "knn", tuneGrid = data.frame(.k = 1:20), trControl = trainControl(method = "cv"))
knn_pred <- predict(knnModel, newdata = test[, -ncol(test)])
knn_results <- postResample(pred = knn_pred, obs = test$Yield)
print(knn_results)
## RMSE Rsquared MAE
## 1.345935 0.460384 1.018750
MARS
set.seed(123)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsModel <- train(x = training[, -ncol(training)], y = training$Yield, method = "earth", tuneGrid = marsGrid, trControl = trainControl(method = "cv"))
mars_pred <- predict(marsModel, newdata = test[, -ncol(test)])
mars_results <- postResample(pred = mars_pred, obs = test$Yield)
print(mars_results)
## RMSE Rsquared MAE
## 1.4118759 0.4163619 1.1226807
SVM
set.seed(123)
svmModel <- train(x = training[, -ncol(training)], y = training$Yield, method = "svmRadial", tuneLength = 14, trControl = trainControl(method = "cv"))
svm_pred <- predict(svmModel, newdata = test[, -ncol(test)])
svm_results <- postResample(pred = svm_pred, obs = test$Yield)
print(svm_results)
## RMSE Rsquared MAE
## 1.1513032 0.6053502 0.9572238
Neural Net
set.seed(123)
nnetModel <- train(
x = training[, -ncol(training)],
y = training$Yield,
method = "nnet",
tuneGrid = expand.grid(size = 1:5, decay = c(0, 0.01, 0.1)),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(training[, -ncol(training)]) + 1) + 10 + 1,
maxit = 500
)
nnet_pred <- predict(nnetModel, newdata = test[, -ncol(test)])
nnet_results <- postResample(pred = nnet_pred, obs = test$Yield)
print(nnet_results)
## RMSE Rsquared MAE
## 1.75833927 0.08372262 1.48089754
Model Performance Comparison
model_comparison <- data.frame(
Model = c("KNN", "Neural Network", "SVM", "MARS"),
RMSE = c(knn_results[1], nnet_results[1], svm_results[1], mars_results[1]),
Rsquared = c(knn_results[2], nnet_results[2], svm_results[2], mars_results[2]),
MAE = c(knn_results[3], nnet_results[3], svm_results[3], mars_results[3])
)
model_comparison <- model_comparison[order(model_comparison$RMSE), ]
print(model_comparison)
## Model RMSE Rsquared MAE
## 3 SVM 1.151303 0.60535021 0.9572238
## 1 KNN 1.345935 0.46038400 1.0187500
## 4 MARS 1.411876 0.41636188 1.1226807
## 2 Neural Network 1.758339 0.08372262 1.4808975
vimp <- varImp(svmModel)
ggplot(vimp, top = 10) + labs(title = "Top 10 Important Variables - SVM")
ggplot(varImp(pslfit), top = 10) +
labs(title = "Top 10 Important Variables - PLS")
Optimal nonlinear model: the radial SVM performs best on the test set, and its importance ranking is dominated by manufacturing process variables.
Comparison with the linear model: the SVM and PLS models largely agree on the top features but differ in how they rank them.
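A minimal sketch making that comparison concrete, using the two varImp objects plotted above (top10 is a helper introduced here for illustration):
# Side-by-side top-10 predictor rankings from the SVM and PLS importances
svm_imp <- varImp(svmModel)$importance
pls_imp <- varImp(pslfit)$importance
top10 <- function(imp) head(rownames(imp)[order(-imp$Overall)], 10)
data.frame(SVM = top10(svm_imp), PLS = top10(pls_imp))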
# Predictors importance
vimp_importance <- varImp(svmModel)$importance
# Top 5 predictors
top5_vars <- head(rownames(vimp_importance)[order(-vimp_importance$Overall)], 5)
X <- ChemicalManufacturingProcess[, top5_vars]
Y <- ChemicalManufacturingProcess$Yield
featurePlot(X, Y)
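To put numbers on what the feature plot shows, a small sketch of the pairwise correlations with Yield (pairwise-complete, since the raw data contain missing values):
# Correlation of each top-5 predictor with Yield on the original scale
cor(X, Y, use = "pairwise.complete.obs")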
set.seed(123)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
str(simulated)
## 'data.frame': 200 obs. of 11 variables:
## $ V1 : num 0.288 0.788 0.409 0.883 0.94 ...
## $ V2 : num 0.239 0.962 0.601 0.515 0.403 ...
## $ V3 : num 0.986 0.137 0.905 0.576 0.395 ...
## $ V4 : num 0.237 0.686 0.226 0.318 0.174 ...
## $ V5 : num 0.471 0.366 0.121 0.047 0.263 ...
## $ V6 : num 0.274 0.594 0.16 0.853 0.848 ...
## $ V7 : num 0.859 0.887 0.489 0.718 0.487 ...
## $ V8 : num 0.2332 0.2307 0.0617 0.4971 0.2441 ...
## $ V9 : num 0.639 0.125 0.255 0.821 0.804 ...
## $ V10: num 0.155 0.846 0.214 0.67 0.618 ...
## $ y : num 10.6 17.2 13.1 13.3 10 ...
set.seed(123)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 3.06124177
## V2 7.85943835
## V3 0.38056337
## V4 10.81444548
## V5 1.94286224
## V6 0.23845388
## V7 0.14670567
## V8 0.24935492
## V9 -0.07158366
## V10 0.20212279
Did the random forest model significantly use the uninformative predictors (V6-V10)? No: their importance scores are all near zero, well below those of the informative predictors V1-V5.
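A quick contrast of the two groups, sketched from the rfImp1 table above:
# Average importance of informative vs. uninformative predictors
mean(rfImp1[paste0("V", 1:5), "Overall"])   # informative V1-V5
mean(rfImp1[paste0("V", 6:10), "Overall"])  # uninformative V6-V10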
set.seed(123)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9449864
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 2.70094025
## V2 7.82707084
## V3 0.48798079
## V4 10.48694160
## V5 1.79881087
## V6 0.35376124
## V7 0.03525398
## V8 0.13056769
## V9 0.01981366
## V10 0.12224580
## duplicate1 1.31970471
Fit another random forest model to these data. Did the importance score for V1 change? Yes: it dropped from about 3.06 to 2.70. When a predictor that is highly correlated with V1 is added, the importance that previously accrued to V1 alone is split between V1 and duplicate1, so each carries a share of it.
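To see the dilution continue, a minimal sketch that adds a second noisy copy of V1 in a copy of the data (sim2, duplicate2, and model2b are names introduced here; working on a copy keeps the objects used below unchanged):
set.seed(123)
sim2 <- simulated
sim2$duplicate2 <- sim2$V1 + rnorm(200) * .1
# Refit; V1's importance should drop further as it is shared three ways
model2b <- randomForest(y ~ ., data = sim2, importance = TRUE, ntree = 1000)
varImp(model2b, scale = FALSE)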
set.seed(123)
model3 <- cforest(y ~ ., data = simulated)
ctrue <- varImp(model3, conditional = TRUE)
ctrue
## Overall
## V1 1.265164346
## V2 6.242188438
## V3 0.025008218
## V4 9.556471133
## V5 0.990179164
## V6 0.128781057
## V7 0.002201511
## V8 0.030530401
## V9 -0.029302913
## V10 0.023414696
## duplicate1 0.245946010
cfalse <- varImp(model3, conditional = FALSE)
cfalse
## Overall
## V1 1.984345833
## V2 8.185299913
## V3 0.023494492
## V4 12.121302755
## V5 1.350836279
## V6 0.151229594
## V7 -0.006018013
## V8 0.092381122
## V9 0.017947852
## V10 0.004090445
## duplicate1 0.940619520
result <- cbind(rfImp2, ctrue, cfalse)
colnames(result) <- c("trad_rf", "cf_con", "cf_uncon")
result
## trad_rf cf_con cf_uncon
## V1 2.70094025 1.265164346 1.984345833
## V2 7.82707084 6.242188438 8.185299913
## V3 0.48798079 0.025008218 0.023494492
## V4 10.48694160 9.556471133 12.121302755
## V5 1.79881087 0.990179164 1.350836279
## V6 0.35376124 0.128781057 0.151229594
## V7 0.03525398 0.002201511 -0.006018013
## V8 0.13056769 0.030530401 0.092381122
## V9 0.01981366 -0.029302913 0.017947852
## V10 0.12224580 0.023414696 0.004090445
## duplicate1 1.31970471 0.245946010 0.940619520
V2 and V4 receive the highest importance scores in all three scenarios, and V6-V10 stay near zero throughout. The conditional cforest importance also shrinks the scores of the correlated pair (V1 and duplicate1) relative to the unconditional version, as expected.
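A compact way to see this, sketched with base graphics on the result table built above:
# Dot plot comparing the three importance profiles per predictor
dotchart(as.matrix(result), xlab = "Importance score")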
GBM
set.seed(123)
gbmGrid <- expand.grid(
interaction.depth = seq(1, 7, by = 2),
n.trees = seq(100, 1000, by = 50),
shrinkage = 0.1,
n.minobsinnode = 5
)
gbmModel <- train(
y ~ .,
data = simulated,
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE
)
gbm_imp <- varImp(gbmModel)
gbm_imp
## gbm variable importance
##
## Overall
## V4 100.0000
## V2 69.2635
## V1 28.0542
## V5 25.0033
## V3 19.7353
## duplicate1 7.8523
## V7 2.2947
## V10 0.6253
## V8 0.5732
## V9 0.3744
## V6 0.0000
Cubist
set.seed(123)
cubistModel <- train(y ~ ., data = simulated, method = 'cubist')
cubist_imp <- varImp(cubistModel)
cubist_imp
## cubist variable importance
##
## Overall
## V2 100.000
## V4 99.225
## V3 75.194
## V1 62.791
## V5 57.364
## duplicate1 31.783
## V8 18.605
## V7 13.178
## V6 5.426
## V10 2.326
## V9 0.000
V2 and V4 again receive the highest importance scores in both the boosted tree and Cubist models, and V6-V10 remain near the bottom of both rankings, consistent with the random forest results.
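A minimal sketch merging the two scaled importance tables for a direct look (comparison is a name introduced here):
# Combine GBM and Cubist importances into one table, sorted by GBM score
gbm_df <- varImp(gbmModel)$importance
cubist_df <- varImp(cubistModel)$importance
comparison <- merge(gbm_df, cubist_df, by = "row.names")
names(comparison) <- c("Predictor", "GBM", "Cubist")
comparison[order(-comparison$GBM), ]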