Data624 Assignment 8

suppressMessages(suppressWarnings(library(fpp2)))
suppressMessages(suppressWarnings(library(readxl)))
suppressMessages(suppressWarnings(library(seasonal)))
suppressMessages(suppressWarnings(library(rdatamarket)))
suppressMessages(suppressWarnings(library(tseries)))
suppressMessages(suppressWarnings(library(AppliedPredictiveModeling)))
suppressMessages(suppressWarnings(library(fma)))
suppressMessages(suppressWarnings(library(corrplot)))
suppressMessages(suppressWarnings(library(caret)))
suppressMessages(suppressWarnings(library(pls)))
suppressMessages(suppressWarnings(library(glmnet)))
suppressMessages(suppressWarnings(library(missForest)))
suppressMessages(suppressWarnings(library(mlbench)))
suppressMessages(suppressWarnings(library(nnet)))
suppressMessages(suppressWarnings(library(earth)))
suppressMessages(suppressWarnings(library(kernlab)))
suppressMessages(suppressWarnings(library(MASS)))
suppressMessages(suppressWarnings(library(forecast)))
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(Metrics)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(doParallel)))
suppressMessages(suppressWarnings(library(cwhmisc)))

7.2 Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:

\[y = 10sin(\pi x_1 x_2) + 20(x_3 -0.5)^2+10x_4 +5x_5 + N(0,\sigma^2)\]

where the ex values are random variables uniformly distributed between [0,1] (there are also 5 0other non-informative variables also created in simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
#featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

KNN

knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn", preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.565620  0.4887976  2.886629
##    7  3.422420  0.5300524  2.752964
##    9  3.368072  0.5536927  2.715310
##   11  3.323010  0.5779056  2.669375
##   13  3.275835  0.6030846  2.628663
##   15  3.261864  0.6163510  2.621192
##   17  3.261973  0.6267032  2.616956
##   19  3.286299  0.6281075  2.640585
##   21  3.280950  0.6390386  2.643807
##   23  3.292397  0.6440392  2.656080
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.

Make the prediction

knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## perforamnce values
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.1750657 0.6785946 2.5443169

The RMSE is 3.17

MARS

set.seed(123)  
mars_grid = expand.grid(degree =1:2, nprune=seq(2,14,by=2))
marsTune = train(x= trainingData$x, y=trainingData$y, method='earth', tuneGrid=mars_grid, trControl = trainControl(method = "cv"))
marsTune
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.456357  0.2147829  3.763412
##   1        4      2.600345  0.7422959  2.090504
##   1        6      2.203236  0.8204569  1.775677
##   1        8      1.729425  0.8852780  1.370689
##   1       10      1.683149  0.8887776  1.328496
##   1       12      1.634706  0.8956391  1.283077
##   1       14      1.645803  0.8934351  1.296689
##   2        2      4.456357  0.2147829  3.763412
##   2        4      2.600345  0.7422959  2.090504
##   2        6      2.237171  0.8118249  1.753283
##   2        8      1.652901  0.8930780  1.264622
##   2       10      1.447568  0.9203813  1.142897
##   2       12      1.293865  0.9377882  1.013377
##   2       14      1.251764  0.9421335  1.003794
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.

Variable importance

varImp(marsTune)
## earth variable importance
## 
##     Overall
## X1   100.00
## X4    85.13
## X2    69.22
## X5    49.28
## X3    39.95
## X8     0.00
## X10    0.00
## X7     0.00
## X6     0.00
## X9     0.00

Make the prediction using MARS.

mars_pred = predict (marsTune, testData$x)
postResample(pred = mars_pred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.1722635 0.9448890 0.9324923

The RMSE is 1.17, better than KNN

SVM

set.seed(124)
svmTune = train(x= trainingData$x, y=trainingData$y, method='svmRadial', tuneLength = 14, trControl = trainControl(method = "cv"))
svmTune$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 16 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0646229473798898 
## 
## Number of Support Vectors : 151 
## 
## Objective Function Value : -70.893 
## Training error : 0.00852

Variable importance

varImp(svmTune)
## loess r-squared variable importance
## 
##      Overall
## X4  100.0000
## X1   95.5047
## X2   89.6186
## X5   45.2170
## X3   29.9330
## X9    6.3299
## X10   5.5182
## X8    3.2527
## X6    0.8884
## X7    0.0000
svmPred <- predict(svmTune, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.0776847 0.8250301 1.5783076

The RMSE is 2.07

Which models appear to give the best performance? Does MARS select theinformative predictors (those named X1-X5)?

With the above results, MARS is the most accurate of the 3 models. It has the lowest RMSE of 1.17.

7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

data(ChemicalManufacturingProcess)
df = ChemicalManufacturingProcess
#summary(df)
df_imp1 = missForest(df)
##   missForest iteration 1 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?
## done!
##   missForest iteration 2 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?
## done!
##   missForest iteration 3 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?
## done!
##   missForest iteration 4 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?

## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression?
## done!
df_imp = df_imp1$ximp
data = df_imp[,2:58]
target = df_imp[,1]
training = createDataPartition( target, p=0.75 )
predictor_training = data[training$Resample1,]
target_training = target[training$Resample]
predictor_testing = data[-training$Resample1,]
target_testing = target[-training$Resample1]

KNN

knnModel1 <- train(x = predictor_training, y = target_training, method = "knn", preProc = c("center", "scale"), tuneLength = 10)
knnModel1
## k-Nearest Neighbors 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.453045  0.3846283  1.150241
##    7  1.428844  0.4034515  1.147851
##    9  1.425711  0.4108613  1.147274
##   11  1.403578  0.4325944  1.130706
##   13  1.409387  0.4289528  1.137463
##   15  1.400754  0.4404230  1.136332
##   17  1.403127  0.4411012  1.145331
##   19  1.414829  0.4349747  1.156591
##   21  1.421582  0.4343973  1.164287
##   23  1.423165  0.4387631  1.167686
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.

The best RMSE is when n = 15.

knnPred1 <- predict(knnModel1, newdata = predictor_testing)
postResample(pred = knnPred1, obs = target_testing)
##      RMSE  Rsquared       MAE 
## 1.4284096 0.4920472 1.0870170

The RMSE of 1.42.

MARS

mars1 = expand.grid(degree =1:2, nprune=seq(2,14,by=2))
mars2 = train(x = predictor_training, y = target_training, method='earth', tuneGrid=mars1, trControl = trainControl(method = "cv"))
mars2
## Multivariate Adaptive Regression Spline 
## 
## 132 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 118, 118, 118, 120, 119, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      1.364681  0.4585905  1.0829231
##   1        4      1.158041  0.6160310  0.9236491
##   1        6      1.179410  0.6167058  0.9184239
##   1        8      1.156381  0.6228721  0.9230851
##   1       10      1.116534  0.6632506  0.8977107
##   1       12      1.180115  0.6426455  0.9332636
##   1       14      1.158161  0.6490796  0.9310004
##   2        2      1.378750  0.4476384  1.0889202
##   2        4      1.245372  0.5489994  1.0113678
##   2        6      1.250174  0.5475575  1.0159313
##   2        8      1.259254  0.5487133  0.9912936
##   2       10      1.300642  0.5315020  1.0289910
##   2       12      1.413606  0.5430414  1.0841638
##   2       14      1.453732  0.5376685  1.1181972
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.

Variable importance

marspred = predict (mars2, predictor_testing)
postResample(pred = marspred, obs = target_testing)
##      RMSE  Rsquared       MAE 
## 1.2696435 0.5787261 1.0359571

The RMSE is 1.27, better than KNN model.

SVM

svm1 = train(x = predictor_training, y = target_training, method='svmRadial', tuneLength = 14, trControl = trainControl(method = "cv"))
svm1
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 132 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 119, 120, 119, 119, 118, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE      
##      0.25  1.394172  0.5192581  1.1378896
##      0.50  1.303427  0.5364552  1.0639061
##      1.00  1.255315  0.5511798  1.0128133
##      2.00  1.240910  0.5612984  0.9909936
##      4.00  1.249447  0.5606367  0.9851349
##      8.00  1.262441  0.5513856  0.9972423
##     16.00  1.253929  0.5565101  0.9895903
##     32.00  1.253929  0.5565101  0.9895903
##     64.00  1.253929  0.5565101  0.9895903
##    128.00  1.253929  0.5565101  0.9895903
##    256.00  1.253929  0.5565101  0.9895903
##    512.00  1.253929  0.5565101  0.9895903
##   1024.00  1.253929  0.5565101  0.9895903
##   2048.00  1.253929  0.5565101  0.9895903
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01267093
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01267093 and C = 2.
svmPred <- predict(svm1, newdata = predictor_testing)
postResample(pred = svmPred, obs = target_testing)
##      RMSE  Rsquared       MAE 
## 1.0739452 0.7611615 0.8975338

The RMSE is 1.07 which is better then the previous models.

A. Which nonlinear regression model gives the optimal resampling and test set performance?

The SVM model has the best accuracy with RMSE of 1.07

B. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

Lets see the predictor importance of SVM model as this has the best RSME scores.

SVM top predictors

plot(varImp(svm1))

v_imp <- varImp(svm1)
v_imp
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial06     91.24
## BiologicalMaterial03     82.81
## ManufacturingProcess13   81.60
## ManufacturingProcess36   74.52
## BiologicalMaterial02     72.85
## ManufacturingProcess33   66.93
## BiologicalMaterial12     64.92
## ManufacturingProcess17   63.51
## BiologicalMaterial04     60.09
## ManufacturingProcess31   56.14
## ManufacturingProcess09   53.86
## BiologicalMaterial01     53.63
## ManufacturingProcess29   44.00
## ManufacturingProcess02   43.34
## BiologicalMaterial11     38.87
## ManufacturingProcess20   37.71
## BiologicalMaterial08     37.09
## ManufacturingProcess06   32.90
## ManufacturingProcess27   30.15

ManufacturingProcess32 and BiologicalMaterial06 are the top predictors. Both ManufacturingProcess and BiologicalMaterial contribute to the list if we consider the SVM algorithm.

C. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

plot(data$ManufacturingProcess32, target)
abline(lm(target~data$ManufacturingProcess32),col="red",lwd=1.5)

plot(data$BiologicalMaterial06, target)
abline(lm(target~data$BiologicalMaterial06),col="red",lwd=1.5)

Both predictors have positive corelation. Lets check the exact corelation

cor(data$ManufacturingProcess32, target)
## [1] 0.6083321
cor(data$BiologicalMaterial06, target)
## [1] 0.4781634

ManufacturingProcess32 and BiologicalMaterial06 are the top predictors. Both ManufacturingProcess and BiologicalMaterial contribute to the list if we consider the SVM algorithm.