Joshua Hummell

Homework 8

7.2

set.seed(42)
#install.packages("earth")


library('urca')
library("fpp3")
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.3
## ✔ dplyr       1.1.1     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.0     ✔ feasts      0.3.1
## ✔ lubridate   1.9.2     ✔ fable       0.3.3
## ✔ ggplot2     3.4.1     ✔ fabletools  0.3.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library('readr')
library('tsibble')
library('dplyr')
library('GGally')
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library("corrplot")
## corrplot 0.92 loaded
library("reshape2")
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library("mlbench")
library("kernlab")
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library("Seurat")
## Attaching SeuratObject
library("caret")
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
## 
##     MAE, RMSE
library("earth")
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
## 
## Attaching package: 'plotmo'
## The following object is masked from 'package:urca':
## 
##     plotres
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
caret::featurePlot(trainingData$x, trainingData$y)

## or other methods.

## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

## KNN 
knnModel <- caret::train(x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## perforamnce values
knnres <- postResample(pred = knnPred, obs = testData$y)
#################################################################


##MARS
mars <- expand.grid(.degree = 1:2, .nprune = 2:ncol(trainingData$x))
mars.fit <- earth(trainingData$x, trainingData$y)

mars.fit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
mars <- expand.grid(.degree = 1:2, .nprune = 2:6)
mars.fit <- earth(trainingData$x, trainingData$y)

mars.pred <- predict(mars.fit, newdata = testData$x)
mars.results <- postResample(pred = mars.pred, obs = testData$y)
#################################################################


## MARS Tuned
marst <- expand.grid(.degree = 1:2, .nprune = 2:6)
marst.fit <- train(trainingData$x, trainingData$y, method = "earth",
                   tuneGrid = marst,
                   trControl = trainControl(method = "cv"))


marst.pred <- predict(marst.fit, newdata = testData$x)
marst.results <- postResample(pred = marst.pred, obs = testData$y)
#################################################################

## SVM Tuned
svm <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneLength = 15,
                   trControl = trainControl(method = "cv"))


svm.pred <- predict(svm, newdata = testData$x)
svm.results <- postResample(svm.pred, testData$y)


#################################################################

## NNEt AVGTuned


nnet <- avNNet(trainingData$x, trainingData$y,
                  size = ncol(trainingData$x),
                  decay = .035,
                  ## Specify how many models to average
                  repeats = 50,
                  linout = TRUE,
                  ## Reduce the amount of printed output
                  trace = FALSE,
                  ## Expand the number of iterations to find parameter estimates
                  maxit = 500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nnet.pred <- predict(nnet, newdata = testData$x)
nnet.results <- postResample(pred = nnet.pred, obs = testData$y)




rbind(knnres, mars.results, marst.results, svm.results, nnet.results)
##                   RMSE  Rsquared      MAE
## knnres        3.204059 0.6819919 2.568346
## mars.results  1.813647 0.8677298 1.391184
## marst.results 2.367035 0.7726677 1.871360
## svm.results   2.059968 0.8280925 1.563584
## nnet.results  1.558694 0.9019289 1.201263

Q -Which models appear to give the best performance?

  • It appears that a NN is the best with an \(r^2\) of ~90% and the lowest RMSE.

Q - Does MARS select the informative predictors (those named X1–X5)?

summary(mars.fit)
## Call: earth(x=trainingData$x, y=trainingData$y)
## 
##                coefficients
## (Intercept)       18.451984
## h(0.621722-X1)   -11.074396
## h(0.601063-X2)   -10.744225
## h(X3-0.281766)    20.607853
## h(0.447442-X3)    17.880232
## h(X3-0.447442)   -23.282007
## h(X3-0.636458)    15.150350
## h(0.734892-X4)   -10.027487
## h(X4-0.734892)     9.092045
## h(0.850094-X5)    -4.723407
## h(X5-0.850094)    10.832932
## h(X6-0.361791)    -1.956821
## 
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
  • Yes, it does!

7.5

7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

library("AppliedPredictiveModeling")
library("RANN")

data("ChemicalManufacturingProcess")

impute <- preProcess(ChemicalManufacturingProcess, "knnImpute")

chem_data <- predict(impute, ChemicalManufacturingProcess)

# We want to filter out all the ones near 0 again
chem_data_f <- chem_data %>% select(!nearZeroVar(.))

# Split into train and test
train_split <- createDataPartition(chem_data_f$Yield , p=.8, list=F)

trainingData <- chem_data_f[train_split,] 
testData <- chem_data_f[-train_split,]
  1. Which nonlinear regression model gives the optimal resampling and test set performance?
set.seed(44)

## KNN 
knnModel <- caret::train(x = trainingData[,-1],
  y = trainingData$Yield,
  method = "knn",
   metric = "Rsquared",
  preProc = c("center", "scale"),
  tuneLength = 10)


knnPred <- predict(knnModel, newdata = testData[,-1])
## The function 'postResample' can be used to get the test set
## perforamnce values
knnres <- postResample(pred = knnPred, obs = testData$Yield)
#################################################################


##MARS
mars <- expand.grid()
mars.fit <- earth(trainingData[,-1], trainingData$Yield)
mars.fit
## Selected 13 of 22 terms, and 7 of 56 predictors
## Termination condition: RSq changed by less than 0.001 at 22 terms
## Importance: ManufacturingProcess32, ManufacturingProcess13, ...
## Number of terms at each degree of interaction: 1 12 (additive model)
## GCV 0.3325095    RSS 32.69907    GRSq 0.6916942    RSq 0.7864972
mars <- expand.grid(.degree = 1:2, .nprune = 2:56)
mars.fit <- earth(trainingData[,-1], trainingData$Yield)


mars.pred <- predict(mars.fit, newdata = testData[,-1])
mars.results <- postResample(pred = mars.pred, obs = testData$Yield)
#################################################################


## MARS Tuned
marst <- expand.grid(.degree = 1:2, .nprune = 2:56)
marst.fit <- train(trainingData[,-1], trainingData$Yield, method = "earth",
                   metric = "Rsquared",
                   tuneGrid = marst,
                   na.action(na.pass),
                   trControl = trainControl(method = "cv"))


marst.pred <- predict(marst.fit, newdata = testData[,-1])
marst.results <- postResample(pred = marst.pred, obs = testData$Yield)
#################################################################

## SVM Tuned
svm <- train(trainingData[,-1], trainingData$Yield,
                   method = "svmRadial",
                   metric = "Rsquared",
                   preProcess = c("center", "scale"),
                   trControl = trainControl(method = "cv"))


svm.pred <- predict(svm, newdata = testData[,-1])
svm.results <- postResample(svm.pred, testData$Yield)


#################################################################


rbind(knnres, mars.results, marst.results, svm.results)
##                    RMSE  Rsquared       MAE
## knnres        0.5948008 0.4981031 0.5008508
## mars.results  0.6721636 0.4639389 0.5749884
## marst.results 0.8622959 0.2817729 0.5711889
## svm.results   0.5437172 0.5823322 0.4138903
  • The SVM has the best \(r^2\) at ~58%, so overall not great results. I tried to run the NN again, but my computer nearly died from the amount of columns.
  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
varImp(svm)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   94.73
## BiologicalMaterial06     85.40
## ManufacturingProcess17   78.64
## ManufacturingProcess09   77.96
## BiologicalMaterial03     77.86
## BiologicalMaterial12     74.91
## ManufacturingProcess36   74.14
## ManufacturingProcess31   63.23
## BiologicalMaterial02     63.16
## BiologicalMaterial11     60.49
## ManufacturingProcess06   59.31
## ManufacturingProcess33   51.79
## ManufacturingProcess11   49.99
## ManufacturingProcess29   49.64
## BiologicalMaterial08     48.28
## BiologicalMaterial04     46.83
## ManufacturingProcess30   43.64
## BiologicalMaterial01     41.92
## BiologicalMaterial09     33.97
plot(varImp(svm), top = 10)

  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
corr_vals <- chem_data %>% 
  dplyr::select('Yield', 'ManufacturingProcess32','ManufacturingProcess36',
         'BiologicalMaterial06','ManufacturingProcess13',
         'BiologicalMaterial03','ManufacturingProcess17',
         'BiologicalMaterial02','BiologicalMaterial12',
         'ManufacturingProcess09','ManufacturingProcess31')

corr_plot_vals <- cor(corr_vals)

corrplot.mixed(corr_plot_vals, tl.pos = 'lt', lower = NULL)

We can see that most are fairly strong, with the exception of MP31, which has little correlation with any other variable. There is also a fair bit of correlation between BM06 and BM03, as well as BM06 and BM02, and BM06 and BM12. for MP, there is a strong negative correlation between MP13 and MP09, 32 and 36, 9 and 17.