library(AppliedPredictiveModeling)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
data(permeability)
fprints <- fingerprints[, -nearZeroVar(fingerprints)]
dim(fprints)
## [1] 165 388
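Before filtering, the permeability data contain 1,107 binary fingerprint predictors for the 165 compounds, so the near-zero-variance filter removes most of them; a quick check (output not shown):
dim(fingerprints)                  # 165 x 1107 before the filter
length(nearZeroVar(fingerprints))  # number of near-zero-variance fingerprints dropped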
PLS model building: split the data into 75/25 training and test sets, then tune the number of PLS components with 10-fold cross-validation.
set.seed(366)
split_data <- createDataPartition(permeability, p = .75, list = FALSE)
x_train <- fprints[split_data, ]
x_test <- fprints[-split_data, ]
y_train <- permeability[split_data, ]
y_test <- permeability[-split_data, ]
plsFit <- train(x = x_train, y = y_train,
method = "pls",
tuneGrid = expand.grid(ncomp = 1:10),
trControl = trainControl(method = "cv", number = 10)
)
plsFit$results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 13.69502 0.2728105 10.461630 1.613545 0.1872376 1.631011
## 2 2 11.60005 0.4785462 8.273946 2.824921 0.2269299 1.927435
## 3 3 11.42837 0.5127289 8.544649 1.923615 0.1721747 1.527194
## 4 4 11.32388 0.5340053 8.734486 2.282156 0.2072709 1.850056
## 5 5 11.70014 0.5175264 8.731123 2.317717 0.1742210 1.766777
## 6 6 11.77121 0.5047029 8.922775 2.486218 0.1736720 1.771602
## 7 7 11.91085 0.4971937 9.231209 2.789628 0.1920959 2.043239
## 8 8 12.09044 0.4828316 9.241331 2.938882 0.2135404 2.056671
## 9 9 12.11666 0.4842464 9.190571 3.353800 0.2337441 2.392811
## 10 10 12.38206 0.4818630 9.403694 3.667824 0.2399279 2.680955
plot(x=plsFit$results$ncomp, y=plsFit$results$Rsquared,
xlab = "ncomp",
ylab = "Rsquared"
)
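The component count caret selects by RMSE can be read directly off the fitted object; a quick check (output not shown):
plsFit$bestTune                                            # ncomp with the lowest resampled RMSE
plsFit$results$ncomp[which.max(plsFit$results$Rsquared)]   # ncomp with the highest resampled R-squared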
trainingData <- cbind(as.data.frame(x_train), as.data.frame(y_train))
set.seed(366)
plsFit1 <- plsr(y_train ~ ., data = trainingData)
plsTest1 <- predict(plsFit1, as.data.frame(x_test), ncomp = 7)
head(plsTest1)
## [1] 9.3000051 -2.5675600 -5.6664192 -0.3679551 3.7408979 14.0725069
postResample(pred = plsTest1, obs = y_test)
## RMSE Rsquared MAE
## 11.734741 0.491182 7.573310
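For comparison, the caret-tuned PLS model can score the test set directly; predict.train uses the ncomp value selected during tuning (output not shown):
plsPredCaret <- predict(plsFit, newdata = x_test)
postResample(pred = plsPredCaret, obs = y_test)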
fractionTune = seq(.3,.8,.1)
lambdaTune = seq(.05, .3, .1)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)
set.seed(366)
fpENet <- train(x = x_train, y = y_train,
method = "enet",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneGrid = expand.grid(lambda = lambdaTune,
fraction = fractionTune)
)
fpENet
## Elasticnet
##
## 125 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 113, 113, 112, 111, 113, 112, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.05 0.3 11.46992 0.5174306 8.325353
## 0.05 0.4 11.42031 0.5291059 8.342046
## 0.05 0.5 11.46413 0.5376186 8.405431
## 0.05 0.6 11.61456 0.5392510 8.531699
## 0.05 0.7 11.83651 0.5352553 8.685677
## 0.05 0.8 12.11730 0.5250646 8.868102
## 0.15 0.3 11.40476 0.5346272 8.202513
## 0.15 0.4 11.40103 0.5372319 8.260046
## 0.15 0.5 11.35377 0.5479232 8.256574
## 0.15 0.6 11.40346 0.5537248 8.341004
## 0.15 0.7 11.53468 0.5533554 8.476566
## 0.15 0.8 11.68379 0.5511552 8.644600
## 0.25 0.3 11.47817 0.5428264 8.197650
## 0.25 0.4 11.52868 0.5433812 8.306259
## 0.25 0.5 11.52209 0.5513574 8.359469
## 0.25 0.6 11.55478 0.5589836 8.419891
## 0.25 0.7 11.67172 0.5607577 8.562790
## 0.25 0.8 11.80448 0.5603285 8.723607
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.5 and lambda = 0.15.
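caret's plot method gives a quick view of the RMSE profile over the fraction/lambda grid, and bestTune confirms the selection (output not shown):
plot(fpENet)      # RMSE across the tuning grid
fpENet$bestTune   # fraction = 0.5, lambda = 0.15, matching the summary above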
set.seed(366)
ridgeGrid <- data.frame(.lambda = seq(.01, .1, .02))
ridgeRegFit <- train(x = x_train, y = y_train,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl,
preProcess = c("center", "scale")
)
ridgeRegFit
## Ridge Regression
##
## 125 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 113, 113, 112, 111, 113, 112, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.01 230.91239 0.3879363 153.505661
## 0.03 13.11317 0.4820800 9.433966
## 0.05 12.70618 0.5022619 9.265668
## 0.07 12.35203 0.5173963 9.091836
## 0.09 12.18974 0.5264338 9.021394
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.09.
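The selected lambda (0.09) is the largest value in the grid, so the penalty may be under-explored; a possible follow-up with a wider grid, as a sketch (output not shown):
ridgeGrid2 <- data.frame(.lambda = seq(.05, .25, .05))
ridgeRegFit2 <- train(x = x_train, y = y_train,
                      method = "ridge",
                      tuneGrid = ridgeGrid2,
                      trControl = ctrl,
                      preProcess = c("center", "scale"))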
set.seed(366)
library(xgboost)
tune_grid <- expand.grid(nrounds = 200,
max_depth = 5,
eta = c(.005, .01, 0.025, 0.05),
gamma = 0,
colsample_bytree = 0.75,
min_child_weight = 1,
subsample = .5)
xgbFit <- train(y_train ~ ., data = trainingData, method = "xgbTree",
                trControl = ctrl,
                tuneGrid = tune_grid)  # an explicit tuneGrid makes tuneLength unnecessary
xgbFit
## eXtreme Gradient Boosting
##
## 125 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 113, 113, 112, 111, 113, 112, ...
## Resampling results across tuning parameters:
##
## eta RMSE Rsquared MAE
## 0.005 12.60147 0.5720172 7.800377
## 0.010 10.82119 0.5715736 7.186332
## 0.025 10.59984 0.5574392 7.196675
## 0.050 10.85645 0.5410035 7.333159
##
## Tuning parameter 'nrounds' was held constant at a value of 200
## Tuning parameter 'max_depth' was held constant at a value of 5
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.75
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Tuning parameter 'subsample' was held constant at a value of 0.5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 200, max_depth = 5,
## eta = 0.025, gamma = 0, colsample_bytree = 0.75, min_child_weight = 1
## and subsample = 0.5.
# plsFit was tuned with plain 10-fold CV while the other models used repeated CV,
# so its resamples are not directly comparable and it is left out here.
allResamples <- resamples(list("Elastic Net" = fpENet,
                               "Ridge Regression" = ridgeRegFit,
                               "XGBTree" = xgbFit))
parallelplot(allResamples, metric = "Rsquared")
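A tabular view of the same resampling distributions is available as well (output not shown):
summary(allResamples)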
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176 58
# missing observations
table(is.na(ChemicalManufacturingProcess))
##
## FALSE TRUE
## 10102 106
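The 106 missing cells are a small share of the full table; a quick check (output not shown):
mean(is.na(ChemicalManufacturingProcess))  # about 1% of all cells are missing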
# remove near zero variables
rev_col <- nearZeroVar(ChemicalManufacturingProcess)
CMP <- ChemicalManufacturingProcess[, -rev_col]
dim(CMP)
## [1] 176 57
missingCol = sapply(CMP, function(x) sum(is.na(x)))
missingCol = sort(missingCol,decreasing=TRUE)
topMissingCol = missingCol[1:3]
missingRow = apply(CMP, 1, function(x) sum(is.na(x)))
missingRow = sort(missingRow,decreasing=TRUE)
topMissingRow = missingRow[1:3]
data.frame(Frequency=topMissingCol)
## Frequency
## ManufacturingProcess03 15
## ManufacturingProcess11 10
## ManufacturingProcess10 9
data.frame(Frequency=topMissingRow)
## Frequency
## 1 16
## 172 11
## 173 11
# impute missing values and transform variables.
CMP_Pre <- preProcess(CMP,method=c("BoxCox","center","scale","knnImpute"))
CMP_df <- predict(CMP_Pre, CMP)
table(is.na(CMP_df))
##
## FALSE
## 10032
x <- subset(CMP_df,select= -Yield)
y <- subset(CMP_df,select="Yield")
set.seed(366)
splitIndex <- createDataPartition(y$Yield,
p = 0.7,
list = FALSE)
x_train <- x[splitIndex,]
y_train <- y[splitIndex,]
x_test <- x[-splitIndex,]
y_test <- y[-splitIndex,]
set.seed(366)
ctrl <- trainControl(method = "cv", number = 10)
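The linear model below uses x_train_re and x_test_re, which are not defined in this excerpt; presumably the predictor set was reduced before fitting. A minimal sketch of such a step, assuming highly correlated predictors are dropped with findCorrelation at a 0.90 cutoff:
# Hypothetical reconstruction: the "_re" objects are not created anywhere above.
highCorr <- findCorrelation(cor(x_train), cutoff = 0.90)
x_train_re <- if (length(highCorr) > 0) x_train[, -highCorr] else x_train
x_test_re  <- if (length(highCorr) > 0) x_test[, -highCorr] else x_test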
lmFit <- train(x = x_train_re, y = y_train,
method = "lm",
trControl = ctrl)
lmFit1 <- lm(y_train ~ ., cbind(x_train_re,y_train))
lmPred1 <- predict(lmFit1, x_test_re)
head(lmPred1)
## 4 8 10 12 13 15
## 0.3937543 1.8673617 1.4571583 1.8143363 1.4455131 0.3799791
lmValues1 <- data.frame(obs = y_test, pred = lmPred1)
defaultSummary(lmValues1)
## RMSE Rsquared MAE
## 0.7403734 0.5115070 0.6025287
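For context, the cross-validated estimate from lmFit can be set against these test-set numbers (output not shown):
lmFit$results[, c("RMSE", "Rsquared", "MAE")]  # resampled performance of the linear model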
lmImp <- varImp(lmFit, scale = FALSE)
plot(lmImp, top=20, scales = list(y = list(cex = 0.8)))
viporder <- order(abs(lmImp$importance$Overall), decreasing=TRUE)
topVIP = rownames(lmImp$importance)[viporder[c(1:3)]]
featurePlot(x_train_re[, topVIP],
y_train,
plot = "scatter",
between = list(x = 1, y = 1),
type = c("g", "p", "smooth"),
layout = c(3,1),
labels = rep("", 2))
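The sign and strength of the linear association for these top predictors can also be checked numerically (output not shown):
cor(x_train_re[, topVIP], y_train)  # correlation of the three top predictors with the response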
Reference:
https://github.com/topepo/APM_Exercises/blob/master/Ch_06.pdf