library(tidyverse)
library(fpp3)
library(nnet)
library(kernlab)
library(AppliedPredictiveModeling)
library(RANN)
library(dplyr)
library(mlbench)
library(caret)
library(earth)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## perforamnce values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
# Training Support Vector Machine model.
sv_machine <- ksvm(x = as.matrix(trainingData$x), y = trainingData$y,
kernel = "rbfdot", kpar = "automatic",
C = 1, epsilon = 0.1)
svm_pred1 <- predict(sv_machine, newdata = testData$x)
postResample(pred = svm_pred1, obs = testData$y)
## RMSE Rsquared MAE
## 2.2574332 0.8002276 1.7267134
# Tuned SVM Model with Centering and Scaling
svmtun <- train(trainingData$x, trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = trainControl(method = "cv"))
svmtun
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.492694 0.8045274 1.985791
## 0.50 2.221932 0.8241857 1.772240
## 1.00 2.028981 0.8484381 1.606503
## 2.00 1.920108 0.8595796 1.527846
## 4.00 1.856634 0.8648675 1.496154
## 8.00 1.835321 0.8649239 1.504354
## 16.00 1.834390 0.8648690 1.504262
## 32.00 1.834390 0.8648690 1.504262
## 64.00 1.834390 0.8648690 1.504262
## 128.00 1.834390 0.8648690 1.504262
## 256.00 1.834390 0.8648690 1.504262
## 512.00 1.834390 0.8648690 1.504262
## 1024.00 1.834390 0.8648690 1.504262
## 2048.00 1.834390 0.8648690 1.504262
##
## Tuning parameter 'sigma' was held constant at a value of 0.06591656
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06591656 and C = 16.
svmtun$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0659165645018617
##
## Number of Support Vectors : 152
##
## Objective Function Value : -68.9631
## Training error : 0.008533
svm_pred2 <- predict(svmtun, newdata = testData$x)
postResample(pred = svm_pred2, obs = testData$y)
## RMSE Rsquared MAE
## 2.0808502 0.8245383 1.5808595
# Regular MARS Model
marsmod <- earth(trainingData$x, trainingData$y)
marsmod
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
mars_pred <- predict(marsmod, newdata = testData$x)
postResample(pred = mars_pred, obs = testData$y)
## RMSE Rsquared MAE
## 1.8136467 0.8677298 1.3911836
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:16)
set.seed(334)
# Tuning MARS based on Cross-Validation
marstun <- train(trainingData$x, trainingData$y,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
marstun
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.215043 0.2958829 3.4715006
## 1 3 3.596326 0.4925289 2.9105998
## 1 4 2.685802 0.7205596 2.1332921
## 1 5 2.457939 0.7645088 1.9835490
## 1 6 2.408785 0.7936804 1.9029015
## 1 7 1.946327 0.8555392 1.5609017
## 1 8 1.780400 0.8811200 1.4047687
## 1 9 1.688583 0.8901135 1.3192783
## 1 10 1.682062 0.8909456 1.3250403
## 1 11 1.646291 0.8973754 1.2914038
## 1 12 1.656040 0.8952787 1.2772892
## 1 13 1.653213 0.8953591 1.2818334
## 1 14 1.680246 0.8916559 1.3003684
## 1 15 1.680656 0.8915064 1.2995570
## 1 16 1.680656 0.8915064 1.2995570
## 2 2 4.215043 0.2958829 3.4715006
## 2 3 3.594757 0.4933080 2.9114298
## 2 4 2.683231 0.7210424 2.1310233
## 2 5 2.449310 0.7637838 1.9578090
## 2 6 2.333471 0.7894762 1.8489458
## 2 7 1.897656 0.8619660 1.5248216
## 2 8 1.719145 0.8846421 1.3595734
## 2 9 1.473660 0.9158201 1.1974286
## 2 10 1.380194 0.9257455 1.0992083
## 2 11 1.272798 0.9368827 1.0053695
## 2 12 1.227547 0.9419180 0.9692761
## 2 13 1.230587 0.9413298 0.9725188
## 2 14 1.202656 0.9445373 0.9394496
## 2 15 1.190089 0.9458320 0.9283424
## 2 16 1.189953 0.9456648 0.9346434
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 16 and degree = 2.
mars_pred1 <- predict(marstun, newdata = testData$x)
postResample(pred = mars_pred1, obs = testData$y)
## RMSE Rsquared MAE
## 1.1492504 0.9471145 0.9158382
# 1 Layer Neural Network with 5 hidden Layer
neuralfit <- nnet(trainingData$x, trainingData$y,
size = 5,
decay = 0.01,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
neural_pred <- predict(neuralfit, newdata = testData$x)
postResample(pred = neural_pred, obs = testData$y)
## RMSE Rsquared MAE
## 1.9410594 0.8505786 1.4725251
# Averaged Neural Network Model
neuralavg <- avNNet(trainingData$x, trainingData$y,
size = 5,
decay = 0.01,
repeats = 5,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
## Warning: executing %dopar% sequentially: no parallel backend registered
neural_pred2 <- predict(neuralavg, newdata = testData$x)
postResample(pred = neural_pred2, obs = testData$y)
## RMSE Rsquared MAE
## 1.9024516 0.8557419 1.4076236
data("ChemicalManufacturingProcess")
impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
imputed <- predict(impute, ChemicalManufacturingProcess)
X <- dplyr::select(imputed, -Yield)
y <- imputed$Yield
set.seed(22)
index <- createDataPartition(y, p = .8, list = FALSE)
train_X <- X[index, ] %>% as.matrix()
test_X <- X[-index, ] %>% as.matrix()
train_y <- y[index]
test_y <- y[-index]
knnmodel_2 <- train(x = train_X, y = train_y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
knnpred2 <- predict(knnmodel_2, newdata = test_X)
postResample(pred = knnpred2, obs = test_y)
## RMSE Rsquared MAE
## 0.6514929 0.4499752 0.5493612
marsGrid2 <- expand.grid(.degree = 1:2, .nprune = 2:58)
marstuned2 <- train(x = train_X, y = train_y,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
marspred2 <- predict(marstuned2, newdata = test_X)
postResample(pred = marspred2, obs = test_y)
## RMSE Rsquared MAE
## 0.6384544 0.4975974 0.4921832
svmrtuned2 <- train(x = train_X, y = train_y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = trainControl(method = "cv"))
svmrpred2 <- predict(svmrtuned2, newdata = test_X)
postResample(pred = svmrpred2, obs = test_y)
## RMSE Rsquared MAE
## 0.6449523 0.4399861 0.5114197
nnetavg2 <- avNNet(x = train_X, y = train_y,
size = 5,
decay = 0.01,
repeats = 5,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(train_X) + 1) + 5 + 1)
nnetpred2 <- predict(nnetavg2, newdata = test_X)
postResample(pred = nnetpred2, obs = test_y)
## RMSE Rsquared MAE
## 0.8184245 0.4615477 0.6572464
varImp(marstuned2)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 47.83
## ManufacturingProcess13 0.00
marstuned3 <- earth(x = train_X, y = train_y)
imp1 <- varImp(marstuned3)
varImp(marstuned3)
## Overall
## ManufacturingProcess32 100.000000
## ManufacturingProcess09 65.120510
## ManufacturingProcess13 35.988820
## ManufacturingProcess39 23.602022
## ManufacturingProcess22 23.211861
## ManufacturingProcess28 21.953670
## BiologicalMaterial12 21.541077
## BiologicalMaterial03 19.654206
## ManufacturingProcess01 15.680530
## ManufacturingProcess33 8.971403
The most important predictor in the SVM model is “ManufacturingProcess32”, we can clearly see the significant amount of variables compared with the rest of predictors.
top_pred <- c("ManufacturingProcess32", "ManufacturingProcess09", "ManufacturingProcess13",
"ManufacturingProcess39", "ManufacturingProcess22", "ManufacturingProcess28",
"BiologicalMaterial12", "BiologicalMaterial03", "ManufacturingProcess01",
"ManufacturingProcess33")
for(i in top_pred) {
plot_data <- data.frame(X = X[[i]], Y = y)
p <- ggplot(plot_data, aes_string(x = "X", y = "Y")) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue", se = FALSE) +
labs(title = paste("Relationship between", i, "and Yield"),
x = i, y = "Yield") +
theme_minimal()
print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Based on the information provided by the plots above of the first 20 important predictors and yield, some of the process variables seem to have either a positive or negative relationship, in other cases it seems that there is do not a defined relationship such as in the case with process “36”, “31”, “29” and “30”. In the case of the biological variables, they all seem to have a positive relationship with yield. To answer the question above, we can say that these plots reveal intuition about the biological predictors in respect to their relationship with yield.