Joshua Hummell
set.seed(42)
#install.packages("earth")
library('urca')
library("fpp3")
## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.3
## ✔ dplyr 1.1.1 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.0 ✔ feasts 0.3.1
## ✔ lubridate 1.9.2 ✔ fable 0.3.3
## ✔ ggplot2 3.4.1 ✔ fabletools 0.3.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library('readr')
library('tsibble')
library('dplyr')
library('GGally')
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library("corrplot")
## corrplot 0.92 loaded
library("reshape2")
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library("mlbench")
library("kernlab")
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library("Seurat")
## Attaching SeuratObject
library("caret")
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
##
## MAE, RMSE
library("earth")
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
##
## Attaching package: 'plotmo'
## The following object is masked from 'package:urca':
##
## plotres
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
caret::featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
## KNN
knnModel <- caret::train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## perforamnce values
knnres <- postResample(pred = knnPred, obs = testData$y)
#################################################################
##MARS
mars <- expand.grid(.degree = 1:2, .nprune = 2:ncol(trainingData$x))
mars.fit <- earth(trainingData$x, trainingData$y)
mars.fit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
mars <- expand.grid(.degree = 1:2, .nprune = 2:6)
mars.fit <- earth(trainingData$x, trainingData$y)
mars.pred <- predict(mars.fit, newdata = testData$x)
mars.results <- postResample(pred = mars.pred, obs = testData$y)
#################################################################
## MARS Tuned
marst <- expand.grid(.degree = 1:2, .nprune = 2:6)
marst.fit <- train(trainingData$x, trainingData$y, method = "earth",
tuneGrid = marst,
trControl = trainControl(method = "cv"))
marst.pred <- predict(marst.fit, newdata = testData$x)
marst.results <- postResample(pred = marst.pred, obs = testData$y)
#################################################################
## SVM Tuned
svm <- train(trainingData$x, trainingData$y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 15,
trControl = trainControl(method = "cv"))
svm.pred <- predict(svm, newdata = testData$x)
svm.results <- postResample(svm.pred, testData$y)
#################################################################
## NNEt AVGTuned
nnet <- avNNet(trainingData$x, trainingData$y,
size = ncol(trainingData$x),
decay = .035,
## Specify how many models to average
repeats = 50,
linout = TRUE,
## Reduce the amount of printed output
trace = FALSE,
## Expand the number of iterations to find parameter estimates
maxit = 500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nnet.pred <- predict(nnet, newdata = testData$x)
nnet.results <- postResample(pred = nnet.pred, obs = testData$y)
rbind(knnres, mars.results, marst.results, svm.results, nnet.results)
## RMSE Rsquared MAE
## knnres 3.204059 0.6819919 2.568346
## mars.results 1.813647 0.8677298 1.391184
## marst.results 2.367035 0.7726677 1.871360
## svm.results 2.059968 0.8280925 1.563584
## nnet.results 1.558694 0.9019289 1.201263
Q -Which models appear to give the best performance?
Q - Does MARS select the informative predictors (those named X1–X5)?
summary(mars.fit)
## Call: earth(x=trainingData$x, y=trainingData$y)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
library("AppliedPredictiveModeling")
library("RANN")
data("ChemicalManufacturingProcess")
impute <- preProcess(ChemicalManufacturingProcess, "knnImpute")
chem_data <- predict(impute, ChemicalManufacturingProcess)
# We want to filter out all the ones near 0 again
chem_data_f <- chem_data %>% select(!nearZeroVar(.))
# Split into train and test
train_split <- createDataPartition(chem_data_f$Yield , p=.8, list=F)
trainingData <- chem_data_f[train_split,]
testData <- chem_data_f[-train_split,]
set.seed(44)
## KNN
knnModel <- caret::train(x = trainingData[,-1],
y = trainingData$Yield,
method = "knn",
metric = "Rsquared",
preProc = c("center", "scale"),
tuneLength = 10)
knnPred <- predict(knnModel, newdata = testData[,-1])
## The function 'postResample' can be used to get the test set
## perforamnce values
knnres <- postResample(pred = knnPred, obs = testData$Yield)
#################################################################
##MARS
mars <- expand.grid()
mars.fit <- earth(trainingData[,-1], trainingData$Yield)
mars.fit
## Selected 13 of 22 terms, and 7 of 56 predictors
## Termination condition: RSq changed by less than 0.001 at 22 terms
## Importance: ManufacturingProcess32, ManufacturingProcess13, ...
## Number of terms at each degree of interaction: 1 12 (additive model)
## GCV 0.3325095 RSS 32.69907 GRSq 0.6916942 RSq 0.7864972
mars <- expand.grid(.degree = 1:2, .nprune = 2:56)
mars.fit <- earth(trainingData[,-1], trainingData$Yield)
mars.pred <- predict(mars.fit, newdata = testData[,-1])
mars.results <- postResample(pred = mars.pred, obs = testData$Yield)
#################################################################
## MARS Tuned
marst <- expand.grid(.degree = 1:2, .nprune = 2:56)
marst.fit <- train(trainingData[,-1], trainingData$Yield, method = "earth",
metric = "Rsquared",
tuneGrid = marst,
na.action(na.pass),
trControl = trainControl(method = "cv"))
marst.pred <- predict(marst.fit, newdata = testData[,-1])
marst.results <- postResample(pred = marst.pred, obs = testData$Yield)
#################################################################
## SVM Tuned
svm <- train(trainingData[,-1], trainingData$Yield,
method = "svmRadial",
metric = "Rsquared",
preProcess = c("center", "scale"),
trControl = trainControl(method = "cv"))
svm.pred <- predict(svm, newdata = testData[,-1])
svm.results <- postResample(svm.pred, testData$Yield)
#################################################################
rbind(knnres, mars.results, marst.results, svm.results)
## RMSE Rsquared MAE
## knnres 0.5948008 0.4981031 0.5008508
## mars.results 0.6721636 0.4639389 0.5749884
## marst.results 0.8622959 0.2817729 0.5711889
## svm.results 0.5437172 0.5823322 0.4138903
varImp(svm)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 94.73
## BiologicalMaterial06 85.40
## ManufacturingProcess17 78.64
## ManufacturingProcess09 77.96
## BiologicalMaterial03 77.86
## BiologicalMaterial12 74.91
## ManufacturingProcess36 74.14
## ManufacturingProcess31 63.23
## BiologicalMaterial02 63.16
## BiologicalMaterial11 60.49
## ManufacturingProcess06 59.31
## ManufacturingProcess33 51.79
## ManufacturingProcess11 49.99
## ManufacturingProcess29 49.64
## BiologicalMaterial08 48.28
## BiologicalMaterial04 46.83
## ManufacturingProcess30 43.64
## BiologicalMaterial01 41.92
## BiologicalMaterial09 33.97
plot(varImp(svm), top = 10)
corr_vals <- chem_data %>%
dplyr::select('Yield', 'ManufacturingProcess32','ManufacturingProcess36',
'BiologicalMaterial06','ManufacturingProcess13',
'BiologicalMaterial03','ManufacturingProcess17',
'BiologicalMaterial02','BiologicalMaterial12',
'ManufacturingProcess09','ManufacturingProcess31')
corr_plot_vals <- cor(corr_vals)
corrplot.mixed(corr_plot_vals, tl.pos = 'lt', lower = NULL)
We can see that most are fairly strong, with the exception of MP31,
which has little correlation with any other variable. There is also a
fair bit of correlation between BM06 and BM03, as well as BM06 and BM02,
and BM06 and BM12. for MP, there is a strong negative correlation
between MP13 and MP09, 32 and 36, 9 and 17.