library(mlbench)
library(caret)
library(kernlab)
library(earth)
library(nnet)
library(AppliedPredictiveModeling)
library(dplyr)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## mlbench.friedman1 creates a list with a vector 'y' and a matrix
## of predictors 'x'.
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x, y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
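The resampling profile can also be visualized to confirm that RMSE bottoms out near k = 17; caret's plot method for train objects shows this directly (a quick optional check):
plot(knnModel)  # RMSE versus number of neighbors across the bootstrap resamples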
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
## Single neural network: 5 hidden units, weight decay of 0.01, linear output
nnetfit_model <- nnet(trainingData$x, trainingData$y,
                      size = 5,
                      decay = 0.01,
                      linout = TRUE,
                      trace = FALSE,
                      maxit = 500,
                      ## size*(p + 1) hidden-layer weights + size + 1 output weights
                      MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
## Model averaging: 5 networks with different random starting weights
avnnetfit_model <- avNNet(trainingData$x, trainingData$y,
                          size = 5,
                          decay = 0.01,
                          repeats = 5,
                          linout = TRUE,
                          trace = FALSE,
                          maxit = 500,
                          MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
nnetfit_pred = predict(nnetfit_model, testData$x)
avnnetfit_pred = predict(avnnetfit_model, testData$x)
nnet_grid <- expand.grid(.decay = c(0, 0.01, 0.1), .size = 1:10, .bag = FALSE)
## Remove highly correlated predictors before tuning; for the simulated
## Friedman data none are expected, so guard against an empty index
tooHigh <- findCorrelation(cor(trainingData$x), cutoff = .75)
trainXnnet <- if (length(tooHigh) > 0) trainingData$x[, -tooHigh] else trainingData$x
set.seed(100)
nnetTune <- train(trainXnnet, trainingData$y,
                  method = "avNNet",
                  tuneGrid = nnet_grid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(trainXnnet) + 1) + 10 + 1,
                  maxit = 500)
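Before predicting, the winning combination of decay, size, and bag can be inspected; bestTune is a standard component of a caret train object (optional check):
nnetTune$bestTune  # tuning-grid row selected by cross-validation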
nnet_predictions <- predict(nnetTune, newdata = testData$x)
marsfit <- earth(trainingData$x, trainingData$y)
marsfit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
summary(marsfit)
## Call: earth(x=trainingData$x, y=trainingData$y)
##
## coefficients
## (Intercept) 18.451984
## h(0.621722-X1) -11.074396
## h(0.601063-X2) -10.744225
## h(X3-0.281766) 20.607853
## h(0.447442-X3) 17.880232
## h(X3-0.447442) -23.282007
## h(X3-0.636458) 15.150350
## h(0.734892-X4) -10.027487
## h(X4-0.734892) 9.092045
## h(0.850094-X5) -4.723407
## h(X5-0.850094) 10.832932
## h(X6-0.361791) -1.956821
##
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556 RSS 397.9654 GRSq 0.8968524 RSq 0.9183982
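earth also provides a plot method that shows the GCV/GRSq model-selection curve and residual diagnostics for this default fit (optional visual check):
plot(marsfit)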
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   # Explicitly declare the candidate models to test
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marpred = predict(marsTuned, testData$x)
varImp(marsTuned)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.24
## X2 48.73
## X5 15.52
## X3 0.00
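The same importance scores can be plotted; caret's plot method for varImp objects accepts a top argument (optional, for a quick visual of the ranking above):
plot(varImp(marsTuned), top = 5)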
svmrtuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
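With method = "svmRadial", caret typically estimates the kernel width sigma once (via kernlab's sigest) and lets tuneLength = 14 vary only the cost C; the selected values can be checked before predicting (optional):
svmrtuned$bestTune  # chosen sigma and C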
svmpred = predict(svmrtuned, testData$x)
The tuned MARS model gives the best test-set performance, and its variable importance correctly identifies the informative predictors (X1 through X5).
paste0(c('KNN RMSE: ','KNN R^2: ','KNN MAE: '),postResample(pred = knnPred, obs = testData$y))
## [1] "KNN RMSE: 3.20405949374908" "KNN R^2: 0.681991908733613"
## [3] "KNN MAE: 2.56834607044141"
paste0(c('NNET RMSE: ','NNET R^2: ','NNET MAE: '),postResample(pred = nnetfit_pred, obs = testData$y))
## [1] "NNET RMSE: 2.52448774141056" "NNET R^2: 0.762184886857288"
## [3] "NNET MAE: 1.87227276448127"
paste0(c('AVENNET RMSE: ','AVENNET R^2: ','AVENNET MAE: '),postResample(pred = avnnetfit_pred, obs = testData$y))
## [1] "AVENNET RMSE: 2.04047704241771" "AVENNET R^2: 0.833194690303"
## [3] "AVENNET MAE: 1.53290935877455"
paste0(c('NNET Tuned RMSE: ','NNET Tuned R^2: ','NNET Tuned MAE: '),postResample(pred = nnet_predictions, obs = testData$y))
## [1] "NNET Tuned RMSE: 2.64322123366103" "NNET Tuned R^2: 0.719458551000228"
## [3] "NNET Tuned MAE: 2.02318139598534"
paste0(c('MARS Tuned RMSE: ','MARS Tuned R^2: ','MARS Tuned MAE: '),postResample(pred = marpred, obs = testData$y))
## [1] "MARS Tuned RMSE: 1.15899475879587" "MARS Tuned R^2: 0.946041768997385"
## [3] "MARS Tuned MAE: 0.925023038062675"
paste0(c('SVM Tuned RMSE: ','SVM Tuned R^2: ','SVM Tuned MAE: '),postResample(pred = svmpred, obs = testData$y))
## [1] "SVM Tuned RMSE: 2.07414727885981" "SVM Tuned R^2: 0.825584809510461"
## [3] "SVM Tuned MAE: 1.57551847254893"
data(ChemicalManufacturingProcess)
set.seed(100)
## Impute missing values with k-nearest neighbors
## (knnImpute in caret also centers and scales the data)
preprocess_data_model <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
new_dataset <- predict(preprocess_data_model, ChemicalManufacturingProcess)
## Drop near-zero-variance predictors
nzv_cols <- nearZeroVar(new_dataset, names = TRUE)
clean_dataset <- new_dataset %>% select(-any_of(nzv_cols))
## 80/20 train/test split stratified on Yield
chem_training_filter <- createDataPartition(clean_dataset$Yield, p = .8, list = FALSE)
training_df_chem <- clean_dataset[chem_training_filter, ]
test_df_chem <- clean_dataset[-chem_training_filter, ]
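A couple of quick sanity checks (optional) confirm that the imputation left no missing values and that the 80/20 split behaved as expected:
sum(is.na(clean_dataset))  # expected to be 0 after knnImpute
dim(training_df_chem)
dim(test_df_chem)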
chem_knn_model <- train(Yield ~ ., data = training_df_chem, method = "knn",
                        preProc = c("center", "scale"),
                        trControl = trainControl(method = "cv", number = 10))
chem_knn_pred = predict(chem_knn_model, test_df_chem)
chemknnresults = postResample(pred = chem_knn_pred, obs = test_df_chem$Yield)
chemmarsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
chemmarsTuned <- train(Yield ~ ., data = training_df_chem,
                       method = "earth",
                       # Explicitly declare the candidate models to test
                       tuneGrid = chemmarsGrid,
                       trControl = trainControl(method = "cv"))
chemmarpred = predict(chemmarsTuned, test_df_chem)
chemmarresults = postResample(pred = chemmarpred, obs = test_df_chem$Yield)
chemsvmrtuned <- train(Yield ~ ., data = training_df_chem,
                       method = "svmRadial",
                       preProc = c("center", "scale"),
                       tuneLength = 14,
                       trControl = trainControl(method = "cv"))
chemsvmpred = predict(chemsvmrtuned, test_df_chem)
chemsvmresults = postResample(pred = chemsvmpred, obs = test_df_chem$Yield)
chemavnnetfit_model <- avNNet(Yield ~ ., data = training_df_chem,
                              size = 5,
                              decay = 0.01,
                              repeats = 5,
                              linout = TRUE,
                              trace = FALSE,
                              maxit = 500,
                              MaxNWts = 5 * (ncol(training_df_chem) + 1) + 5 + 1)
chemavnnetfit_pred = predict(chemavnnetfit_model, test_df_chem)
chemavnnetresults = postResample(pred = chemavnnetfit_pred, obs = test_df_chem$Yield)
The tuned SVM model gives the best test-set RMSE of the four models, so it is used for the variable-importance analysis that follows.
paste0(c('KNN RMSE: ','KNN R^2: ','KNN MAE: '),chemknnresults)
## [1] "KNN RMSE: 0.651042547924618" "KNN R^2: 0.486951818645308"
## [3] "KNN MAE: 0.507404802868369"
paste0(c('AVENNET RMSE: ','AVENNET R^2: ','AVENNET MAE: '),chemavnnetresults)
## [1] "AVENNET RMSE: 0.657411384473877" "AVENNET R^2: 0.635461641363329"
## [3] "AVENNET MAE: 0.446987464000779"
paste0(c('MARS Tuned RMSE: ','MARS Tuned R^2: ','MARS Tuned MAE: '),chemmarresults)
## [1] "MARS Tuned RMSE: 0.586319691349099" "MARS Tuned R^2: 0.576421502805941"
## [3] "MARS Tuned MAE: 0.452870034271226"
paste0(c('SVM Tuned RMSE: ','SVM Tuned R^2: ','SVM Tuned MAE: '),chemsvmresults)
## [1] "SVM Tuned RMSE: 0.578640487605658" "SVM Tuned R^2: 0.591018997487922"
## [3] "SVM Tuned MAE: 0.461827743118046"
## center and scale are passed through to the underlying pls fit
linear_pls_model <- train(Yield ~ ., data = training_df_chem,
                          method = "pls",
                          center = TRUE,
                          scale = TRUE,
                          trControl = trainControl(method = "cv", number = 10))
linear_imp_vars = varImp(linear_pls_model)
svm_imp_vars = varImp(chemsvmrtuned)
print(paste("Manufacturing:",length(svm_imp_vars$importance[grepl('Manufacturing', rownames(svm_imp_vars$importance), fixed = TRUE),])))
## [1] "Manufacturing: 45"
print(paste("Biology:",length(svm_imp_vars$importance[grepl('Biological', rownames(svm_imp_vars$importance), fixed = TRUE),])))
## [1] "Biology: 11"
svm_imp_vars$importance %>% slice_max(Overall, n = 10)
## Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess13 97.83640
## BiologicalMaterial06 82.21744
## ManufacturingProcess17 77.26777
## BiologicalMaterial03 76.21094
## ManufacturingProcess36 70.96498
## BiologicalMaterial02 68.78876
## ManufacturingProcess09 67.86384
## BiologicalMaterial12 63.36203
## ManufacturingProcess06 55.15443
linear_imp_vars$importance %>% slice_max(Overall, n = 10)
## Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess13 87.96802
## ManufacturingProcess09 83.43661
## ManufacturingProcess36 81.95628
## ManufacturingProcess17 78.28842
## BiologicalMaterial02 71.46014
## BiologicalMaterial06 69.40365
## ManufacturingProcess33 65.04595
## BiologicalMaterial03 64.91328
## BiologicalMaterial04 63.58236
Below are the top two predictors that overlap between the two models (ManufacturingProcess32 and ManufacturingProcess13), along with the top predictors that do not overlap.
Of the four plots, ManufacturingProcess32, BiologicalMaterial06, and ManufacturingProcess09 show positive relationships with Yield.
ManufacturingProcess13 shows a negative relationship with Yield. That negatively related predictor carries more importance in the SVM model than in the linear (PLS) model, which points to a key difference in how the two models rank the predictors: the nonlinear SVM picks up structure in the data that the linear model does not capture.
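Before plotting, the signs of those relationships can be verified directly; a minimal sketch computing the training-set correlations of the four predictors discussed above with Yield:
key_vars <- c("ManufacturingProcess32", "ManufacturingProcess13",
              "BiologicalMaterial06", "ManufacturingProcess09")
sapply(training_df_chem[, key_vars], cor, y = training_df_chem$Yield)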
svm_filt_vars <- row.names(svm_imp_vars$importance)
linear_filt_vars <- row.names(linear_imp_vars$importance)
training_df_chem %>% select(all_of(svm_filt_vars), Yield) %>% ggplot(aes(ManufacturingProcess32, Yield)) + geom_point()
training_df_chem %>% select(all_of(svm_filt_vars), Yield) %>% ggplot(aes(ManufacturingProcess13, Yield)) + geom_point()
training_df_chem %>% select(all_of(svm_filt_vars), Yield) %>% ggplot(aes(BiologicalMaterial06, Yield)) + geom_point()
training_df_chem %>% select(all_of(linear_filt_vars), Yield) %>% ggplot(aes(ManufacturingProcess09, Yield)) + geom_point()
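The same four relationships can also be drawn as a single faceted figure; a minimal sketch reusing the key_vars vector defined above and assuming the tidyr package is available for pivot_longer:
library(tidyr)
training_df_chem %>%
  select(all_of(key_vars), Yield) %>%
  pivot_longer(-Yield, names_to = "predictor", values_to = "value") %>%
  ggplot(aes(value, Yield)) +
  geom_point() +
  facet_wrap(~ predictor, scales = "free_x")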