(a) Load the libraries and data. fingerprints has 1,107 predictors and 165 observations, and permeability has 1 response column and 165 observations.

library(AppliedPredictiveModeling)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
data(permeability)
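
A quick sanity check of the dimensions quoted above (output omitted):

dim(fingerprints)    # 165 x 1107
dim(permeability)    # 165 x 1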

(b) Remove predictors with near-zero variance; 388 predictors remain.

fprints <- fingerprints[, -nearZeroVar(fingerprints)]
dim(fprints)
## [1] 165 388
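
As a cross-check, the number of dropped columns should equal 1107 - 388 = 719:

length(nearZeroVar(fingerprints))    # 719 near-zero-variance predictors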

(c) Split the data into training and test sets (75/25), then tune a PLS model on the training set with 10-fold cross-validation. The optimal number of latent variables is 4, with a corresponding resampled R-squared estimate of 0.534.

PLS model building:

set.seed(366)
split_data <- createDataPartition(permeability, p = .75, list = FALSE)
x_train <- fprints[split_data, ]
x_test <- fprints[-split_data, ]
y_train <- permeability[split_data, ]
y_test <- permeability[-split_data, ]

plsFit <- train(x = x_train, y = y_train,
                method = "pls",
                tuneGrid = expand.grid(ncomp = 1:10),
                trControl = trainControl(method = "cv", number = 10)
          )

plsFit$results
##    ncomp     RMSE  Rsquared       MAE   RMSESD RsquaredSD    MAESD
## 1      1 13.69502 0.2728105 10.461630 1.613545  0.1872376 1.631011
## 2      2 11.60005 0.4785462  8.273946 2.824921  0.2269299 1.927435
## 3      3 11.42837 0.5127289  8.544649 1.923615  0.1721747 1.527194
## 4      4 11.32388 0.5340053  8.734486 2.282156  0.2072709 1.850056
## 5      5 11.70014 0.5175264  8.731123 2.317717  0.1742210 1.766777
## 6      6 11.77121 0.5047029  8.922775 2.486218  0.1736720 1.771602
## 7      7 11.91085 0.4971937  9.231209 2.789628  0.1920959 2.043239
## 8      8 12.09044 0.4828316  9.241331 2.938882  0.2135404 2.056671
## 9      9 12.11666 0.4842464  9.190571 3.353800  0.2337441 2.392811
## 10    10 12.38206 0.4818630  9.403694 3.667824  0.2399279 2.680955
plot(x=plsFit$results$ncomp, y=plsFit$results$Rsquared,
     xlab = "ncomp",
     ylab = "Rsquared"
     )
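
Equivalently, caret's plot method for train objects draws the same tuning profile in one line:

plot(plsFit, metric = "Rsquared")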

(d) Evaluate model performance on the held-out test set: the R-squared is 0.491 and the RMSE is 11.734. (The plsr prediction below uses ncomp = 7.)

trainingData <- cbind(as.data.frame(x_train), as.data.frame(y_train))
set.seed(366)
plsFit1 <- plsr(y_train ~ ., data = trainingData)
plsTest1 <- predict(plsFit1, as.data.frame(x_test), ncomp = 7)
head(plsTest1)
## [1]  9.3000051 -2.5675600 -5.6664192 -0.3679551  3.7408979 14.0725069
postResample(pred = plsTest1, obs = y_test)
##      RMSE  Rsquared       MAE 
## 11.734741  0.491182  7.573310
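
Alternatively (a sketch, not run here), predictions can come straight from the tuned train object, which applies the optimal ncomp = 4 automatically instead of the 7 components passed to plsr above:

plsPred <- predict(plsFit, newdata = x_test)    # uses the tuned ncomp = 4
postResample(pred = plsPred, obs = y_test)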

(e) Ridge regression, elastic net, and xgbTree models are fit to the training set. The elastic net achieves an R-squared of .548 and an RMSE of 11.353; ridge regression an R-squared of .526 and an RMSE of 12.19; and xgbTree an R-squared of .557 and an RMSE of 10.600. xgbTree performs slightly better than the other two models, with a higher R-squared and a lower RMSE.

Elastic Net model:

fractionTune = seq(.3,.8,.1)
lambdaTune = seq(.05, .3, .1)
ctrl <- trainControl(method = "repeatedcv", repeats = 5)

set.seed(366)
fpENet <- train(x = x_train, y = y_train, 
                  method = "enet", 
                  trControl = ctrl, 
                  preProcess = c("center", "scale"), 
                  tuneGrid = expand.grid(lambda = lambdaTune,
                                         fraction = fractionTune)
                )

fpENet
## Elasticnet 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 113, 113, 112, 111, 113, 112, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE      Rsquared   MAE     
##   0.05    0.3       11.46992  0.5174306  8.325353
##   0.05    0.4       11.42031  0.5291059  8.342046
##   0.05    0.5       11.46413  0.5376186  8.405431
##   0.05    0.6       11.61456  0.5392510  8.531699
##   0.05    0.7       11.83651  0.5352553  8.685677
##   0.05    0.8       12.11730  0.5250646  8.868102
##   0.15    0.3       11.40476  0.5346272  8.202513
##   0.15    0.4       11.40103  0.5372319  8.260046
##   0.15    0.5       11.35377  0.5479232  8.256574
##   0.15    0.6       11.40346  0.5537248  8.341004
##   0.15    0.7       11.53468  0.5533554  8.476566
##   0.15    0.8       11.68379  0.5511552  8.644600
##   0.25    0.3       11.47817  0.5428264  8.197650
##   0.25    0.4       11.52868  0.5433812  8.306259
##   0.25    0.5       11.52209  0.5513574  8.359469
##   0.25    0.6       11.55478  0.5589836  8.419891
##   0.25    0.7       11.67172  0.5607577  8.562790
##   0.25    0.8       11.80448  0.5603285  8.723607
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.5 and lambda = 0.15.
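
To see how sparse the selected elastic net is, its coefficients can be extracted from the underlying fit (a sketch; s = 0.5 matches the chosen fraction):

library(elasticnet)
enetCoef <- predict(fpENet$finalModel, s = 0.5, mode = "fraction",
                    type = "coefficients")$coefficients
sum(enetCoef != 0)    # number of predictors retained by the final model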

Ridge Regression model:

set.seed(366)
ridgeGrid <- data.frame(.lambda = seq(.01, .1, .02))
ridgeRegFit <- train(x = x_train, y = y_train,
                method = "ridge",
                tuneGrid = ridgeGrid,
                trControl = ctrl,
                preProcess = c("center", "scale")
                )
ridgeRegFit
## Ridge Regression 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 113, 113, 112, 111, 113, 112, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared   MAE       
##   0.01    230.91239  0.3879363  153.505661
##   0.03     13.11317  0.4820800    9.433966
##   0.05     12.70618  0.5022619    9.265668
##   0.07     12.35203  0.5173963    9.091836
##   0.09     12.18974  0.5264338    9.021394
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.09.

XGBTree model:

set.seed(366)
library(xgboost)
tune_grid <- expand.grid(nrounds = 200,
                        max_depth = 5,
                        eta = c(.005, .01, 0.025, 0.05),
                        gamma = 0,
                        colsample_bytree = 0.75,
                        min_child_weight = 1,
                        subsample = .5)

xgbFit <- train(y_train ~ ., data = trainingData, method = "xgbTree",
                trControl = ctrl,
                tuneGrid = tune_grid)  # tuneLength is ignored when tuneGrid is given

xgbFit
## eXtreme Gradient Boosting 
## 
## 125 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 113, 113, 112, 111, 113, 112, ... 
## Resampling results across tuning parameters:
## 
##   eta    RMSE      Rsquared   MAE     
##   0.005  12.60147  0.5720172  7.800377
##   0.010  10.82119  0.5715736  7.186332
##   0.025  10.59984  0.5574392  7.196675
##   0.050  10.85645  0.5410035  7.333159
## 
## Tuning parameter 'nrounds' was held constant at a value of 200
## Tuning parameter 'max_depth' was held constant at a value of 5
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning parameter 'colsample_bytree' was held constant at a value of 0.75
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Tuning parameter 'subsample' was held constant at a value of 0.5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 200, max_depth = 5,
##  eta = 0.025, gamma = 0, colsample_bytree = 0.75, min_child_weight = 1
##  and subsample = 0.5.
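
For completeness, the tuned models could also be scored on the hold-out set the same way as PLS (a sketch; results not shown):

postResample(pred = predict(fpENet, x_test), obs = y_test)
postResample(pred = predict(xgbFit, as.data.frame(x_test)), obs = y_test)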

(f) I would recommend the elastic net model: its performance is close to xgbTree's, and as a classical linear model it keeps the predictors interpretable through their coefficients. As the resample plot below shows, the differences in R-squared among the models are not significant.

allResamples <- resamples(list(#"PLS" = plsFit, # excluded: tuned with plain 10-fold CV, not the repeated CV used below
                               "Elastic Net" = fpENet,
                               "Ridge Regression" = ridgeRegFit,
                               "XGBTree" = xgbFit)
                          )

parallelplot(allResamples, metric = "Rsquared")
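
A numeric summary of the same resampling distributions is available as well:

summary(allResamples, metric = "Rsquared")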

(a) Load the data and examine missing values. 106 of the 10,208 data points are missing. Remove 1 near-zero-variance variable.

data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176  58
# missing observations
table(is.na(ChemicalManufacturingProcess))
## 
## FALSE  TRUE 
## 10102   106
# remove near zero variables
rev_col <- nearZeroVar(ChemicalManufacturingProcess)
CMP <- ChemicalManufacturingProcess[, -rev_col]
dim(CMP)
## [1] 176  57

(b) Find the columns and rows with the most missing values. The top 3 columns have 9 to 15 missing values each, and of the 176 samples, row 1 has the most with 16, so missingness is not severe. Impute the missing values with k-nearest neighbors and transform the variables.

missingCol = sapply(CMP, function(x) sum(is.na(x)))
missingCol = sort(missingCol,decreasing=TRUE)
topMissingCol = missingCol[1:3]

missingRow = apply(CMP, 1, function(x) sum(is.na(x)))
missingRow = sort(missingRow,decreasing=TRUE)
topMissingRow = missingRow[1:3]

data.frame(Frequency=topMissingCol)
##                        Frequency
## ManufacturingProcess03        15
## ManufacturingProcess11        10
## ManufacturingProcess10         9
data.frame(Frequency=topMissingRow)
##     Frequency
## 1          16
## 172        11
## 173        11
# impute missing values and transform variables.
CMP_Pre <- preProcess(CMP,method=c("BoxCox","center","scale","knnImpute"))
CMP_df <- predict(CMP_Pre, CMP)
table(is.na(CMP_df))
## 
## FALSE 
## 10032

(c) Split the data into training and test sets (70/30).

x <- subset(CMP_df,select= -Yield)
y <- subset(CMP_df,select="Yield")

set.seed(366)
splitIndex <- createDataPartition(y$Yield, 
                                    p = 0.7, 
                                    list = FALSE)
x_train <- x[splitIndex,]
y_train <- y[splitIndex,]

x_test <- x[-splitIndex,]
y_test <- y[-splitIndex,]

Remove 8 highly correlated variables (pairwise correlation above .9); 48 of the 56 predictors remain. The correlation plots before and after removal are compared.

library(corrplot)
## corrplot 0.84 loaded
## 
## Attaching package: 'corrplot'
## The following object is masked from 'package:pls':
## 
##     corrplot
correlations <- cor(cbind(x_train,y_train), use="pairwise.complete.obs")
corrplot::corrplot(correlations, type="lower", tl.cex = 0.5, mar=c(0,0.2,0,0))

corthresh <- .9
tooH <- findCorrelation(cor(x_train), corthresh)
corrPred <- names(x_train)[tooH]
x_train_re <- x_train[, -tooH]
x_test_re <- x_test[, -tooH]
dim(x_train_re)
## [1] 124  48
correlations_re <- cor(cbind(x_train_re,y_train), use="pairwise.complete.obs")
corrplot::corrplot(correlations_re, type="lower", tl.cex = 0.5, mar=c(0,0.2,0,0))
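
The names of the 8 dropped predictors were saved above and can be inspected directly:

length(corrPred)    # 8 predictors flagged by findCorrelation
corrPred            # their names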

Tune an lm model; the resampled performance metrics are an R-squared of .431 and an RMSE of 1.104. (Here and in (d), RMSE is on the transformed scale, since Yield was centered and scaled during preprocessing.)

set.seed(366)
ctrl <- trainControl(method = "cv", number = 10)

lmFit <- train(x = x_train_re, y = y_train,
                 method = "lm",
                 trControl = ctrl)
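
The resampled metrics quoted above can be read from the train object (output omitted):

lmFit$results    # resampled RMSE, Rsquared, and MAE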

(d) Predict the lm response for the held-out test set. The R-squared is 0.512 and the RMSE is 0.740, so the model does better on the test set than the resampled training metrics suggested.

lmFit1 <- lm(y_train ~ ., cbind(x_train_re,y_train))
lmPred1 <- predict(lmFit1, x_test_re)
head(lmPred1)
##         4         8        10        12        13        15 
## 0.3937543 1.8673617 1.4571583 1.8143363 1.4455131 0.3799791
lmValues1 <- data.frame(obs = y_test, pred = lmPred1)
defaultSummary(lmValues1)
##      RMSE  Rsquared       MAE 
## 0.7403734 0.5115070 0.6025287

(e) The top 20 most important predictors are plotted below; manufacturing process variables dominate the list, taking 18 of the 20 spots.

lmImp <- varImp(lmFit, scale = FALSE)
plot(lmImp, top=20, scales = list(y = list(cex = 0.8)))

(f) The relationships between the top 3 most important predictors and the response are examined below. If these manufacturing process predictors can be controlled, they can have a larger impact on yield than the remaining variables. The one concern is that ManufacturingProcess43 contains an outlier that may be driving its apparently strong relationship with the response; this outlier should be investigated, since it may distort the relationship with yield.

viporder <- order(abs(lmImp$importance$Overall), decreasing = TRUE)
topVIP <- rownames(lmImp$importance)[viporder[1:3]]

featurePlot(x_train_re[, topVIP],
            y_train,
            plot = "scatter",
            between = list(x = 1, y = 1),
            type = c("g", "p", "smooth"),
            layout = c(3,1),
            labels = rep("", 2))
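
As a follow-up on the suspected outlier, the offending training row can be flagged (a sketch; the 3-standard-deviation cutoff is an arbitrary choice):

mp43 <- x_train_re[, "ManufacturingProcess43"]
# predictors were centered and scaled earlier, so units here are SDs
which(abs(mp43 - mean(mp43)) > 3 * sd(mp43))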
