exercise4

library(mlbench)

# Fix the RNG state so the simulated data sets below are reproducible.
set.seed(200)

# Training set: 200 simulated observations (noise sd = 1), with the
# predictors wrapped in a data frame.
trainingData <- mlbench.friedman1(n = 200, sd = 1)
trainingData[["x"]] <- data.frame(trainingData[["x"]])

# Test set: 5000 further observations generated the same way, continuing the
# same random stream (no reseed in between).
testData <- mlbench.friedman1(n = 5000, sd = 1)
testData[["x"]] <- data.frame(testData[["x"]])

dim(trainingData$x)

## [1] 200  10

length(trainingData$y)

## [1] 200

dim(testData$x)

## [1] 5000   10

length(testData$y)

## [1] 5000

head(trainingData$x)

##          X1        X2         X3         X4         X5         X6        X7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
##          X8         X9       X10
## 1 0.4214081 0.59111440 0.5886216
## 2 0.8446239 0.92819306 0.7584008
## 3 0.3495908 0.01759542 0.4441185
## 4 0.7970260 0.68986918 0.4450716
## 5 0.9038919 0.39696995 0.5500808
## 6 0.6489177 0.53116033 0.9066182

head(trainingData$y)

## [1] 18.46398 16.09836 17.76165 13.78730 18.42984 20.85817

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(earth)

## Loading required package: Formula

## Loading required package: plotmo

## Loading required package: plotrix

library(kernlab)

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

# Resampling scheme passed to every train() call in this document:
# 25 bootstrap resamples.
ctrl <- trainControl(
  method = "boot",
  number = 25
)

ctrl

## $method
## [1] "boot"
## 
## $number
## [1] 25
## 
## $repeats
## [1] NA
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL
## 
## $horizon
## [1] 1
## 
## $fixedWindow
## [1] TRUE
## 
## $skip
## [1] 0
## 
## $verboseIter
## [1] FALSE
## 
## $returnData
## [1] TRUE
## 
## $returnResamp
## [1] "final"
## 
## $savePredictions
## [1] FALSE
## 
## $classProbs
## [1] FALSE
## 
## $summaryFunction
## function (data, lev = NULL, model = NULL) 
## {
##     if (is.character(data$obs)) 
##         data$obs <- factor(data$obs, levels = lev)
##     postResample(data[, "pred"], data[, "obs"])
## }
## <bytecode: 0x1695d00a8>
## <environment: namespace:caret>
## 
## $selectionFunction
## [1] "best"
## 
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
## 
## $preProcOptions$ICAcomp
## [1] 3
## 
## $preProcOptions$k
## [1] 5
## 
## $preProcOptions$freqCut
## [1] 19
## 
## $preProcOptions$uniqueCut
## [1] 10
## 
## $preProcOptions$cutoff
## [1] 0.9
## 
## 
## $sampling
## NULL
## 
## $index
## NULL
## 
## $indexOut
## NULL
## 
## $indexFinal
## NULL
## 
## $timingSamps
## [1] 0
## 
## $predictionBounds
## [1] FALSE FALSE
## 
## $seeds
## [1] NA
## 
## $adaptive
## $adaptive$min
## [1] 5
## 
## $adaptive$alpha
## [1] 0.05
## 
## $adaptive$method
## [1] "gls"
## 
## $adaptive$complete
## [1] TRUE
## 
## 
## $trim
## [1] FALSE
## 
## $allowParallel
## [1] TRUE

# Reseed immediately before fitting so the resampling inside train() is
# reproducible.
set.seed(200)

# k-nearest neighbours: predictors are centered and scaled, 10 candidate
# values of k are evaluated, and RMSE is the selection metric.
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  metric = "RMSE",
  tuneLength = 10,
  preProcess = c("center", "scale"),
  trControl = ctrl
)

knnModel

## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.654912  0.4779838  2.958475
##    7  3.529432  0.5118581  2.861742
##    9  3.446330  0.5425096  2.780756
##   11  3.378049  0.5723793  2.719410
##   13  3.332339  0.5953773  2.692863
##   15  3.309235  0.6111389  2.663046
##   17  3.317408  0.6201421  2.678898
##   19  3.311667  0.6333800  2.682098
##   21  3.316340  0.6407537  2.688887
##   23  3.326040  0.6491480  2.705915
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.

knnModel$bestTune

##    k
## 6 15

# Score the tuned kNN model on the test set and summarise the predictions
# against the observed responses.
knnPred <- predict(knnModel, newdata = testData[["x"]])
knnPerf <- postResample(pred = knnPred, obs = testData[["y"]])

knnPerf

##      RMSE  Rsquared       MAE 
## 3.1750657 0.6785946 2.5443169

# Reseed immediately before fitting so the resampling inside train() is
# reproducible.
set.seed(200)

# MARS (earth): tune over interaction degree 1-2 and 2-20 retained terms,
# with RMSE as the selection metric. No pre-processing is requested.
marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  metric = "RMSE",
  tuneGrid = expand.grid(degree = 1:2, nprune = 2:20),
  trControl = ctrl
)

marsModel

## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.447045  0.2249607  3.650128
##   1        3      3.744821  0.4546610  3.019175
##   1        4      2.828643  0.6892908  2.244131
##   1        5      2.524326  0.7516356  2.027435
##   1        6      2.406670  0.7747079  1.906733
##   1        7      2.027113  0.8375721  1.594956
##   1        8      1.874633  0.8618476  1.474219
##   1        9      1.800794  0.8728377  1.411703
##   1       10      1.810047  0.8721377  1.412023
##   1       11      1.821314  0.8714221  1.427124
##   1       12      1.831608  0.8700790  1.430044
##   1       13      1.839717  0.8686550  1.440537
##   1       14      1.849381  0.8672327  1.450876
##   1       15      1.856211  0.8663787  1.452430
##   1       16      1.857086  0.8661612  1.454255
##   1       17      1.853742  0.8667095  1.452920
##   1       18      1.853742  0.8667095  1.452920
##   1       19      1.853742  0.8667095  1.452920
##   1       20      1.853742  0.8667095  1.452920
##   2        2      4.447045  0.2249607  3.650128
##   2        3      3.742704  0.4519073  3.020158
##   2        4      2.821346  0.6910533  2.235202
##   2        5      2.518642  0.7535135  2.018685
##   2        6      2.376391  0.7812055  1.885731
##   2        7      2.013411  0.8396557  1.587164
##   2        8      1.888879  0.8599653  1.484953
##   2        9      1.799932  0.8721481  1.393774
##   2       10      1.660130  0.8907628  1.310950
##   2       11      1.550005  0.9059968  1.231012
##   2       12      1.505830  0.9110970  1.190294
##   2       13      1.511926  0.9103431  1.186123
##   2       14      1.527242  0.9088055  1.197070
##   2       15      1.542393  0.9075705  1.207119
##   2       16      1.550245  0.9063093  1.207792
##   2       17      1.570877  0.9039487  1.222541
##   2       18      1.584427  0.9025470  1.233851
##   2       19      1.592059  0.9015600  1.237827
##   2       20      1.592059  0.9015600  1.237827
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 2.

marsModel$bestTune

##    nprune degree
## 30     12      2

# Score the tuned MARS model on the test set and summarise the predictions
# against the observed responses.
marsPred <- predict(marsModel, newdata = testData[["x"]])
marsPerf <- postResample(pred = marsPred, obs = testData[["y"]])

marsPerf

##      RMSE  Rsquared       MAE 
## 1.2803060 0.9335241 1.0168673

# Variable importance for the tuned MARS model; scale = FALSE asks caret not
# to rescale the scores.
marsImp <- varImp(
  marsModel,
  scale = FALSE
)

marsImp

## earth variable importance
## 
##    Overall
## X1  100.00
## X4   85.05
## X2   69.03
## X5   48.88
## X3   39.40

marsImp$importance

##      Overall
## X1 100.00000
## X4  85.05066
## X2  69.02600
## X5  48.87728
## X3  39.40404

# Flatten the importance table into a plain two-column data frame
# (predictor name, importance score).
marsImp_df <- with(
  marsImp,
  data.frame(
    Predictor = rownames(importance),
    Importance = importance$Overall
  )
)

# Order rows from most to least important, then print.
marsImp_df <- marsImp_df[order(-marsImp_df[["Importance"]]), ]
marsImp_df

##   Predictor Importance
## 1        X1  100.00000
## 2        X4   85.05066
## 3        X2   69.02600
## 4        X5   48.87728
## 5        X3   39.40404

Yes, MARS selects the informative predictors. The variable importance table contains only X1, X2, X3, X4, and X5, which are the true informative variables in the Friedman simulation. The noise variables X6 to X10 were not identified as important.

# Reseed immediately before fitting so the resampling inside train() is
# reproducible.
set.seed(200)

# Radial-kernel SVM: predictors are centered and scaled, a tuning grid of
# length 10 is evaluated, and RMSE is the selection metric.
svmModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  metric = "RMSE",
  tuneLength = 10,
  preProcess = c("center", "scale"),
  trControl = ctrl
)

svmModel

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.635010  0.7685188  2.074977
##     0.50  2.423373  0.7839086  1.902162
##     1.00  2.284137  0.8001534  1.791779
##     2.00  2.196624  0.8126474  1.713560
##     4.00  2.143035  0.8209820  1.668024
##     8.00  2.119159  0.8246308  1.649388
##    16.00  2.117440  0.8248675  1.648570
##    32.00  2.117440  0.8248675  1.648570
##    64.00  2.117440  0.8248675  1.648570
##   128.00  2.117440  0.8248675  1.648570
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 16.

svmModel$bestTune

##        sigma  C
## 7 0.06299324 16

# Score the tuned SVM on the test set and summarise the predictions against
# the observed responses.
svmPred <- predict(svmModel, newdata = testData[["x"]])
svmPerf <- postResample(pred = svmPred, obs = testData[["y"]])

svmPerf

##      RMSE  Rsquared       MAE 
## 2.0736997 0.8256573 1.5751967

Three nonlinear regression models were fitted for this exercise: kNN, MARS, and an SVM with a radial kernel.

# Stack the three test-set performance vectors into one matrix, one row per
# model (row names carry the model labels).
results <- rbind(kNN = knnPerf, MARS = marsPerf, SVM_Radial = svmPerf)

# Move the model labels into a regular column, then sort by ascending test
# RMSE so the best model is listed first.
results <- data.frame(
  Model = rownames(results),
  results,
  row.names = NULL
)
results <- results[order(results[["RMSE"]]), ]

results

##        Model     RMSE  Rsquared      MAE
## 2       MARS 1.280306 0.9335241 1.016867
## 3 SVM_Radial 2.073700 0.8256573 1.575197
## 1        kNN 3.175066 0.6785946 2.544317

Among the three models, MARS gave the best performance on the test set: it had the lowest RMSE (1.280306) and the highest R^2 (0.9335241). The SVM with a radial kernel (RMSE 2.073700) performed better than kNN (RMSE 3.175066), but worse than MARS.

exercise4

Wei You

2026-03-28