library(mlbench)
library(caret)
library(kernlab)
library(earth)
library(nnet)
library(AppliedPredictiveModeling)
library(dplyr)

Q.7.2

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)

## or other methods.
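## For example, a base-graphics scatterplot matrix of the predictors against
## the response (a quick sketch; nothing beyond base R is needed):
pairs(cbind(trainingData$x, y = trainingData$y))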

## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn",
                  preProc = c("center", "scale"), tuneLength = 10)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
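## For example: postResample(pred = knnPred, obs = testData$y)
## (all of the test-set results are collected and printed at the end of this question).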
## A single-hidden-layer network with H hidden units and P predictors has
## H*(P + 1) + H + 1 weights, which is the bound used for MaxNWts below.
nnetfit_model = nnet(trainingData$x, trainingData$y, size = 5,
                     decay = .01,
                     linout = TRUE,
                     trace = FALSE,
                     maxit = 500,
                     MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)

avnnetfit_model = avNNet(trainingData$x,trainingData$y,size=5,
                     decay=.01,
                     repeats=5,
                     linout=TRUE,
                     trace=FALSE,
                     maxit = 500,
                     MaxNWts=5* (ncol(trainingData$x) + 1) + 5 + 1)

nnetfit_pred = predict(nnetfit_model, testData$x)

avnnetfit_pred = predict(avnnetfit_model, testData$x)
## Tuning grid for avNNet: weight decay, hidden-layer size, and whether each
## network in the ensemble is fit to a bootstrap sample (bag = FALSE here).
nnet_grid = expand.grid(.decay = c(0, .01, .1), .size = 1:10, .bag = FALSE)
## Drop highly correlated predictors before counting weights. The Friedman1
## predictors are essentially uncorrelated, so findCorrelation() may return no
## indices; guard against x[, -integer(0)] dropping every column.
tooHigh <- findCorrelation(cor(trainingData$x), cutoff = .75)
trainXnnet <- if (length(tooHigh) > 0) trainingData$x[, -tooHigh] else trainingData$x


set.seed(100)
nnetTune <- train(trainingData$x, trainingData$y,
                  method = "avNNet",
                  tuneGrid = nnet_grid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(trainXnnet) + 1) + 10 + 1,
                  maxit = 500)
                  
                  
nnet_predictions <- predict(nnetTune, newdata = testData$x)
marsfit = earth(trainingData$x,trainingData$y)
marsfit
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
summary(marsfit)
## Call: earth(x=trainingData$x, y=trainingData$y)
## 
##                coefficients
## (Intercept)       18.451984
## h(0.621722-X1)   -11.074396
## h(0.601063-X2)   -10.744225
## h(X3-0.281766)    20.607853
## h(0.447442-X3)    17.880232
## h(X3-0.447442)   -23.282007
## h(X3-0.636458)    15.150350
## h(0.734892-X4)   -10.027487
## h(X4-0.734892)     9.092045
## h(0.850094-X5)    -4.723407
## h(X5-0.850094)    10.832932
## h(X6-0.361791)    -1.956821
## 
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
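## The h() terms in the summary above are hinge functions, h(u) = max(0, u);
## e.g. h(0.621722 - X1) is nonzero only when X1 < 0.621722. As a quick
## illustration (the name 'h' is only for this example; earth builds its own):
h <- function(u) pmax(0, u)
h(0.5 - c(0.2, 0.7))   # 0.3 0.0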
## Candidate MARS models: additive and two-way interaction fits (degree) over
## a range of pruned model sizes (nprune).
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   # Explicitly declare the candidate models to test
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

marpred = predict(marsTuned, testData$x)

varImp(marsTuned)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.24
## X2   48.73
## X5   15.52
## X3    0.00
svmrtuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
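## Note: with method = "svmRadial", train() tunes the cost C over tuneLength
## values and fixes the kernel width sigma from the training data via
## kernlab's sigest(); the chosen pair can be inspected with svmrtuned$bestTune.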

svmpred = predict(svmrtuned, testData$x)

The tuned MARS model gives the best test-set performance (lowest RMSE and MAE, highest R^2), and its variable importance ranking picks out the informative predictors X1 through X5.

paste0(c('KNN RMSE: ','KNN R^2: ','KNN MAE: '),postResample(pred = knnPred, obs = testData$y))
## [1] "KNN RMSE: 3.20405949374908" "KNN R^2: 0.681991908733613"
## [3] "KNN MAE: 2.56834607044141"
paste0(c('NNET RMSE: ','NNET R^2: ','NNET MAE: '),postResample(pred = nnetfit_pred, obs = testData$y))
## [1] "NNET RMSE: 2.52448774141056" "NNET R^2: 0.762184886857288"
## [3] "NNET MAE: 1.87227276448127"
paste0(c('AVENNET RMSE: ','AVENNET R^2: ','AVENNET MAE: '),postResample(pred = avnnetfit_pred, obs = testData$y))
## [1] "AVENNET RMSE: 2.04047704241771" "AVENNET R^2: 0.833194690303"   
## [3] "AVENNET MAE: 1.53290935877455"
paste0(c('NNET Tuned RMSE: ','NNET Tuned R^2: ','NNET Tuned MAE: '),postResample(pred = nnet_predictions, obs = testData$y))
## [1] "NNET Tuned RMSE: 2.64322123366103" "NNET Tuned R^2: 0.719458551000228"
## [3] "NNET Tuned MAE: 2.02318139598534"
paste0(c('MARS Tuned RMSE: ','MARS Tuned R^2: ','MARS Tuned MAE: '),postResample(pred = marpred, obs = testData$y))
## [1] "MARS Tuned RMSE: 1.15899475879587" "MARS Tuned R^2: 0.946041768997385"
## [3] "MARS Tuned MAE: 0.925023038062675"
paste0(c('SVM Tuned RMSE: ','SVM Tuned R^2: ','SVM Tuned MAE: '),postResample(pred = svmpred, obs = testData$y))
## [1] "SVM Tuned RMSE: 2.07414727885981" "SVM Tuned R^2: 0.825584809510461"
## [3] "SVM Tuned MAE: 1.57551847254893"
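
A compact way to compare the models is to stack the postResample() metrics into one table (a quick sketch; it reuses the prediction objects created above):

test_results <- rbind(
  KNN       = postResample(knnPred, testData$y),
  NNet      = postResample(nnetfit_pred, testData$y),
  avNNet    = postResample(avnnetfit_pred, testData$y),
  NNetTuned = postResample(nnet_predictions, testData$y),
  MARS      = postResample(marpred, testData$y),
  SVM       = postResample(svmpred, testData$y))
test_results[order(test_results[, "RMSE"]), ]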

Q.7.5

data(ChemicalManufacturingProcess)

set.seed(100)

preprocess_data_model = preProcess(ChemicalManufacturingProcess, "knnImpute")



new_dataset = predict(preprocess_data_model,ChemicalManufacturingProcess)
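## Note: preProcess() with method = "knnImpute" also centers and scales every
## numeric column (including Yield), so the test-set RMSE/MAE values reported
## below are on the standardized scale of Yield rather than its original units.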

clean_dataset = new_dataset %>% select(-any_of(nearZeroVar(., names = TRUE)))

chem_training_filter = createDataPartition(clean_dataset$Yield,p=.8,list=FALSE)

training_df_chem = clean_dataset[chem_training_filter,]
test_df_chem = clean_dataset[-chem_training_filter,]
chem_knn_model = train(Yield ~ ., data = training_df_chem, method = 'knn',
                       preProcess = c("center", "scale"),
                       trControl = trainControl('cv', number = 10))


chem_knn_pred = predict(chem_knn_model, test_df_chem)
chemknnresults = postResample(pred = chem_knn_pred, obs = test_df_chem$Yield)
chemmarsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
chemmarsTuned <- train(Yield~.,data=training_df_chem,
                   method = "earth",
                   # Explicitly declare the candidate models to test
                   tuneGrid = chemmarsGrid,
                   trControl = trainControl(method = "cv"))

chemmarpred = predict(chemmarsTuned, test_df_chem)

chemmarresults = postResample(pred = chemmarpred, obs = test_df_chem$Yield)
chemsvmrtuned <- train(Yield~.,data=training_df_chem,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))

chemsvmpred = predict(chemsvmrtuned, test_df_chem)
chemsvmresults = postResample(pred = chemsvmpred, obs = test_df_chem$Yield)
## MaxNWts again bounds the number of weights; ncol(training_df_chem) counts
## Yield as well as the predictors, so the bound is slightly generous but safe.
chemavnnetfit_model = avNNet(Yield ~ ., data = training_df_chem, size = 5,
                     decay = .01,
                     repeats = 5,
                     linout = TRUE,
                     trace = FALSE,
                     maxit = 500,
                     MaxNWts = 5 * (ncol(training_df_chem) + 1) + 5 + 1)

chemavnnetfit_pred = predict(chemavnnetfit_model, test_df_chem)
chemavnnetresults = postResample(pred = chemavnnetfit_pred, obs = test_df_chem$Yield)

The radial SVM model gives the lowest test-set RMSE, with the tuned MARS model close behind; the averaged neural network has the highest R^2 but a larger RMSE.

paste0(c('KNN RMSE: ','KNN R^2: ','KNN MAE: '),chemknnresults)
## [1] "KNN RMSE: 0.651042547924618" "KNN R^2: 0.486951818645308" 
## [3] "KNN MAE: 0.507404802868369"
paste0(c('AVENNET RMSE: ','AVENNET R^2: ','AVENNET MAE: '),chemavnnetresults)
## [1] "AVENNET RMSE: 0.657411384473877" "AVENNET R^2: 0.635461641363329" 
## [3] "AVENNET MAE: 0.446987464000779"
paste0(c('MARS Tuned RMSE: ','MARS Tuned R^2: ','MARS Tuned MAE: '),chemmarresults)
## [1] "MARS Tuned RMSE: 0.586319691349099" "MARS Tuned R^2: 0.576421502805941" 
## [3] "MARS Tuned MAE: 0.452870034271226"
paste0(c('SVM Tuned RMSE: ','SVM Tuned R^2: ','SVM Tuned MAE: '),chemsvmresults)
## [1] "SVM Tuned RMSE: 0.578640487605658" "SVM Tuned R^2: 0.591018997487922" 
## [3] "SVM Tuned MAE: 0.461827743118046"
linear_pls_model = train(
  Yield ~ ., data = training_df_chem, method = "pls",
  preProcess = c("center", "scale"),
  trControl = trainControl("cv", number = 10))



linear_imp_vars = varImp(linear_pls_model)
svm_imp_vars = varImp(chemsvmrtuned)
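## Note: kernlab's SVM has no model-specific importance measure, so varImp()
## here returns caret's model-free (filter-based) importance scores, while the
## PLS importance is model-based (weighted absolute PLS coefficients).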

print(paste("Manufacturing:",length(svm_imp_vars$importance[grepl('Manufacturing', rownames(svm_imp_vars$importance), fixed = TRUE),])))
## [1] "Manufacturing: 45"
print(paste("Biology:",length(svm_imp_vars$importance[grepl('Biological', rownames(svm_imp_vars$importance), fixed = TRUE),])))
## [1] "Biology: 11"
svm_imp_vars$importance %>% slice_max(Overall, n = 10)
##                          Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess13  97.83640
## BiologicalMaterial06    82.21744
## ManufacturingProcess17  77.26777
## BiologicalMaterial03    76.21094
## ManufacturingProcess36  70.96498
## BiologicalMaterial02    68.78876
## ManufacturingProcess09  67.86384
## BiologicalMaterial12    63.36203
## ManufacturingProcess06  55.15443
linear_imp_vars$importance %>% slice_max(Overall, n = 10)
##                          Overall
## ManufacturingProcess32 100.00000
## ManufacturingProcess13  87.96802
## ManufacturingProcess09  83.43661
## ManufacturingProcess36  81.95628
## ManufacturingProcess17  78.28842
## BiologicalMaterial02    71.46014
## BiologicalMaterial06    69.40365
## ManufacturingProcess33  65.04595
## BiologicalMaterial03    64.91328
## BiologicalMaterial04    63.58236
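
A quick check of how much the two top-ten lists overlap (a sketch that reuses the importance tables computed above):

svm_top10    <- rownames(svm_imp_vars$importance)[order(-svm_imp_vars$importance$Overall)][1:10]
linear_top10 <- rownames(linear_imp_vars$importance)[order(-linear_imp_vars$importance$Overall)][1:10]
intersect(svm_top10, linear_top10)   # predictors ranked in the top ten by both models
setdiff(svm_top10, linear_top10)     # in the SVM top ten only
setdiff(linear_top10, svm_top10)     # in the PLS top ten only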

Below are scatterplots of Yield against the two predictors ranked highest by both models (ManufacturingProcess32 and ManufacturingProcess13) and against the next-highest predictor in each model's own ranking (BiologicalMaterial06 for the SVM model, ManufacturingProcess09 for the PLS model).

Of the four charts, ManufacturingProcess32, BiologicalMaterial06, and ManufacturingProcess09 show positive relationships with Yield.

ManufacturingProcess13 shows a negative relationship with Yield, and the SVM model ranks it higher than the linear PLS model does. Predictors with negative (and potentially nonlinear) relationships can receive more weight in the nonlinear model's importance ranking because nonlinear models are built to capture structure that a linear projection method may underweight.

svm_filt_vars = row.names(svm_imp_vars$importance)
linear_filt_vars = row.names(linear_imp_vars$importance)


training_df_chem %>% select(all_of(svm_filt_vars), Yield) %>% ggplot(aes(ManufacturingProcess32, Yield)) + geom_point()

training_df_chem %>% select(all_of(svm_filt_vars), Yield) %>% ggplot(aes(ManufacturingProcess13, Yield)) + geom_point()

training_df_chem %>% select(all_of(svm_filt_vars), Yield) %>% ggplot(aes(BiologicalMaterial06, Yield)) + geom_point()

training_df_chem %>% select(all_of(linear_filt_vars), Yield) %>% ggplot(aes(ManufacturingProcess09, Yield)) + geom_point()
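
As an alternative to the four separate scatterplots, the same relationships can be shown in one faceted plot (a sketch; it assumes the tidyr package is available, which is not loaded above):

library(tidyr)
training_df_chem %>%
  select(Yield, ManufacturingProcess32, ManufacturingProcess13,
         BiologicalMaterial06, ManufacturingProcess09) %>%
  pivot_longer(-Yield, names_to = "predictor", values_to = "value") %>%
  ggplot(aes(value, Yield)) +
  geom_point() +
  facet_wrap(~ predictor, scales = "free_x")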