Problem 7.2

Setup

# Packages used below (loaded once here)
library(mlbench)                     # mlbench.friedman1
library(caret)                       # train, featurePlot, postResample
library(earth)                       # MARS
library(nnet)                        # single-hidden-layer neural networks
library(ggplot2)
library(AppliedPredictiveModeling)   # ChemicalManufacturingProcess

set.seed(200)
training_data <- mlbench.friedman1(200, sd = 1)

training_data$x <- data.frame(training_data$x)

featurePlot(training_data$x, training_data$y)

test_data <- mlbench.friedman1(5000,sd=1)

test_data$x <- data.frame(test_data$x)
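For reference, mlbench.friedman1 simulates the Friedman (1991) benchmark,
y = 10 sin(pi x1 x2) + 20 (x3 - 0.5)^2 + 10 x4 + 5 x5 + e, with e ~ N(0, sd^2),
so only X1 through X5 carry signal and X6 through X10 are pure noise.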

Model 1 - KNN

knnModel <- train(x = training_data$x, 
                  y = training_data$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = test_data$x)

postResample(pred = knnPred, obs = test_data$y)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

Model 2 - Multivariate Adaptive Regression Splines

marsModel <- earth(training_data$x, training_data$y)

marsModel
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
summary(marsModel)
## Call: earth(x=training_data$x, y=training_data$y)
## 
##                coefficients
## (Intercept)       18.451984
## h(0.621722-X1)   -11.074396
## h(0.601063-X2)   -10.744225
## h(X3-0.281766)    20.607853
## h(0.447442-X3)    17.880232
## h(X3-0.447442)   -23.282007
## h(X3-0.636458)    15.150350
## h(0.734892-X4)   -10.027487
## h(X4-0.734892)     9.092045
## h(0.850094-X5)    -4.723407
## h(X5-0.850094)    10.832932
## h(X6-0.361791)    -1.956821
## 
## Selected 12 of 18 terms, and 6 of 10 predictors
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## GCV 2.540556    RSS 397.9654    GRSq 0.8968524    RSq 0.9183982
marsPred <- predict(marsModel, test_data$x)

postResample(marsPred, test_data$y)
##      RMSE  Rsquared       MAE 
## 1.8136467 0.8677298 1.3911836

Model 3 - Support Vector Machines

svmModel <- train(training_data$x, training_data$y,
      method = "svmRadial",
      preProc = c("center", "scale"),
      tuneLength = 14,
      trControl = trainControl(method="cv"))

svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.505383  0.8031869  1.999381
##      0.50  2.290725  0.8103140  1.829703
##      1.00  2.105086  0.8302040  1.677851
##      2.00  2.014620  0.8418576  1.598814
##      4.00  1.965196  0.8491165  1.567327
##      8.00  1.927625  0.8538991  1.542238
##     16.00  1.924260  0.8545305  1.539289
##     32.00  1.924260  0.8545305  1.539289
##     64.00  1.924260  0.8545305  1.539289
##    128.00  1.924260  0.8545305  1.539289
##    256.00  1.924260  0.8545305  1.539289
##    512.00  1.924260  0.8545305  1.539289
##   1024.00  1.924260  0.8545305  1.539289
##   2048.00  1.924260  0.8545305  1.539289
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06802164
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06802164 and C = 16.
svmModel$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 16 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0680216365076835 
## 
## Number of Support Vectors : 152 
## 
## Objective Function Value : -66.0924 
## Training error : 0.008551
svmPred <- predict(svmModel$finalModel, test_data$x)

postResample(svmPred, test_data$y)
##     RMSE Rsquared      MAE 
## 7.156075 0.654429 6.169075
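This test RMSE is far worse than the cross-validated 1.92 because predict() was called on the bare kernlab object, which never applies caret's center/scale preprocessing. Predicting from the train object does apply it; a sketch of the corrected call (not rerun here; the svmPredScaled name is just illustrative):

svmPredScaled <- predict(svmModel, newdata = test_data$x)
postResample(svmPredScaled, test_data$y)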

Model 4 - Neural Networks

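# nnet fits a single-hidden-layer network; linout = TRUE gives a linear
# output unit for regression. MaxNWts must cover every weight:
# 5 hidden units * (10 inputs + 1 bias) + (5 hidden + 1 bias) output weights = 61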
nnetModel <- nnet(x = training_data$x,
     y = training_data$y,
     size = 5,
     decay = 0.01,
     linout = TRUE,
     trace = FALSE,
     maxit = 500,
     MaxNWts = 5 * (ncol(training_data$x) +1) +5 +1)

nnetPred <- predict(nnetModel, test_data$x)

postResample(nnetPred, test_data$y)
##      RMSE  Rsquared       MAE 
## 2.8753511 0.6879212 2.2585071

The MARS model gives the best test-set performance: the lowest RMSE of the four models at 1.81 and the highest R-squared at 0.868. It also selects exactly the informative predictors, using X1 through X5 (plus a small X6 term) and dropping X7 through X10. The SVM's poor test numbers reflect the unpreprocessed predict call noted above rather than the tuned model itself.
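Collecting the four test-set results in one table makes the comparison explicit (a sketch that assumes the prediction vectors above are still in memory):

rbind(KNN  = postResample(knnPred, test_data$y),
      MARS = postResample(marsPred, test_data$y),
      SVM  = postResample(svmPred, test_data$y),
      NNet = postResample(nnetPred, test_data$y))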

Problem 7.5

Setup

data("ChemicalManufacturingProcess")

columns <- colnames(ChemicalManufacturingProcess)

# Impute missing values in each column with that column's median
for(col in columns) {
  median_value <- median(ChemicalManufacturingProcess[[col]], na.rm = TRUE)
  ChemicalManufacturingProcess[col][is.na(ChemicalManufacturingProcess[col])] <- median_value
}
## [1] "Yield"
## [1] "BiologicalMaterial01"
## [1] "BiologicalMaterial02"
## [1] "BiologicalMaterial03"
## [1] "BiologicalMaterial04"
## [1] "BiologicalMaterial05"
## [1] "BiologicalMaterial06"
## [1] "BiologicalMaterial07"
## [1] "BiologicalMaterial08"
## [1] "BiologicalMaterial09"
## [1] "BiologicalMaterial10"
## [1] "BiologicalMaterial11"
## [1] "BiologicalMaterial12"
## [1] "ManufacturingProcess01"
## [1] "ManufacturingProcess02"
## [1] "ManufacturingProcess03"
## [1] "ManufacturingProcess04"
## [1] "ManufacturingProcess05"
## [1] "ManufacturingProcess06"
## [1] "ManufacturingProcess07"
## [1] "ManufacturingProcess08"
## [1] "ManufacturingProcess09"
## [1] "ManufacturingProcess10"
## [1] "ManufacturingProcess11"
## [1] "ManufacturingProcess12"
## [1] "ManufacturingProcess13"
## [1] "ManufacturingProcess14"
## [1] "ManufacturingProcess15"
## [1] "ManufacturingProcess16"
## [1] "ManufacturingProcess17"
## [1] "ManufacturingProcess18"
## [1] "ManufacturingProcess19"
## [1] "ManufacturingProcess20"
## [1] "ManufacturingProcess21"
## [1] "ManufacturingProcess22"
## [1] "ManufacturingProcess23"
## [1] "ManufacturingProcess24"
## [1] "ManufacturingProcess25"
## [1] "ManufacturingProcess26"
## [1] "ManufacturingProcess27"
## [1] "ManufacturingProcess28"
## [1] "ManufacturingProcess29"
## [1] "ManufacturingProcess30"
## [1] "ManufacturingProcess31"
## [1] "ManufacturingProcess32"
## [1] "ManufacturingProcess33"
## [1] "ManufacturingProcess34"
## [1] "ManufacturingProcess35"
## [1] "ManufacturingProcess36"
## [1] "ManufacturingProcess37"
## [1] "ManufacturingProcess38"
## [1] "ManufacturingProcess39"
## [1] "ManufacturingProcess40"
## [1] "ManufacturingProcess41"
## [1] "ManufacturingProcess42"
## [1] "ManufacturingProcess43"
## [1] "ManufacturingProcess44"
## [1] "ManufacturingProcess45"
set.seed(1234)

sample_set <- sample(nrow(ChemicalManufacturingProcess),round(nrow(ChemicalManufacturingProcess)*.75), replace=FALSE)

train_set <- ChemicalManufacturingProcess[sample_set, ]
test_set <- ChemicalManufacturingProcess[-sample_set, ]

train_set$x <- data.frame(train_set[2:length(train_set)])
train_set$y <- train_set$Yield

test_set$x <- data.frame(test_set[2:length(test_set)])
test_set$y <- test_set$Yield
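
caret's createDataPartition is a common alternative for the split, since it stratifies on the response (a sketch, not run here):

idx <- createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.75, list = FALSE)
train_set <- ChemicalManufacturingProcess[idx, ]
test_set  <- ChemicalManufacturingProcess[-idx, ]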

PART A

Model 1 - KNN

knnModel <- train(x = train_set$x, 
                  y = train_set$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## (this warning repeats once per bootstrap resample)
knnModel
## k-Nearest Neighbors 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.603946  0.3448619  1.252542
##    7  1.589598  0.3513882  1.256307
##    9  1.579211  0.3576659  1.245723
##   11  1.580346  0.3592996  1.244035
##   13  1.578822  0.3642831  1.240903
##   15  1.588478  0.3616403  1.244373
##   17  1.594748  0.3602025  1.248224
##   19  1.599287  0.3623210  1.251679
##   21  1.608488  0.3578763  1.261870
##   23  1.613881  0.3570849  1.263968
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
knnPred <- predict(knnModel, newdata = test_set$x)

postResample(pred = knnPred, obs = test_set$y)
##      RMSE  Rsquared       MAE 
## 1.2529999 0.5507756 1.0233417

Model 2 - Multivariate Adaptive Regression Splines

marsModel <- earth(train_set$x, train_set$y)

marsModel
## Selected 10 of 22 terms, and 7 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 22 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 9 (additive model)
## GCV 1.159598    RSS 112.1736    GRSq 0.669192    RSq 0.7538554
summary(marsModel)
## Call: earth(x=train_set$x, y=train_set$y)
## 
##                                 coefficients
## (Intercept)                        40.989164
## h(72.06-BiologicalMaterial03)      -0.120984
## h(935-ManufacturingProcess04)      -0.056689
## h(46.78-ManufacturingProcess09)    -0.438879
## h(33.2-ManufacturingProcess13)      2.582101
## h(ManufacturingProcess30-9.2)       1.049130
## h(ManufacturingProcess32-151)      -0.796270
## h(ManufacturingProcess32-152)       1.012482
## h(7-ManufacturingProcess39)        -0.238456
## h(ManufacturingProcess39-7)        -2.526595
## 
## Selected 10 of 22 terms, and 7 of 57 predictors
## Termination condition: RSq changed by less than 0.001 at 22 terms
## Importance: ManufacturingProcess32, ManufacturingProcess09, ...
## Number of terms at each degree of interaction: 1 9 (additive model)
## GCV 1.159598    RSS 112.1736    GRSq 0.669192    RSq 0.7538554
marsPred <- predict(marsModel, test_set$x)

postResample(marsPred, test_set$y)
##      RMSE  Rsquared       MAE 
## 1.0869876 0.6678080 0.9027886

Model 3 - Support Vector Machines

svmModel <- train(train_set$x, train_set$y,
      method = "svmRadial",
      preProc = c("center", "scale"),
      tuneLength = 14,
      trControl = trainControl(method="cv"))

svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 120, 116, 119, 119, 118, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE      
##      0.25  1.400729  0.5098299  1.1280766
##      0.50  1.289760  0.5714499  1.0167799
##      1.00  1.197715  0.6145619  0.9203828
##      2.00  1.139773  0.6363078  0.8799873
##      4.00  1.092132  0.6612965  0.8726417
##      8.00  1.069655  0.6747031  0.8679349
##     16.00  1.067343  0.6759214  0.8662318
##     32.00  1.067343  0.6759214  0.8662318
##     64.00  1.067343  0.6759214  0.8662318
##    128.00  1.067343  0.6759214  0.8662318
##    256.00  1.067343  0.6759214  0.8662318
##    512.00  1.067343  0.6759214  0.8662318
##   1024.00  1.067343  0.6759214  0.8662318
##   2048.00  1.067343  0.6759214  0.8662318
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01482714
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01482714 and C = 16.
svmModel$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 16 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0148271350472524 
## 
## Number of Support Vectors : 111 
## 
## Objective Function Value : -75.8277 
## Training error : 0.00893
svmPred <- predict(svmModel$finalModel, test_set$x)

postResample(svmPred, test_set$y)
##     RMSE Rsquared      MAE 
## 1.781326       NA 1.445993

As in 7.2, the prediction here comes from svmModel$finalModel, which skips caret's centering and scaling. The NA R-squared suggests the resulting predictions are essentially constant, leaving the correlation behind R-squared undefined; predict(svmModel, test_set$x) would apply the stored preprocessing.

Model 4 - Neural Networks

nnetModel <- nnet(x = train_set$x,
     y = train_set$y,
     size = 5,
     decay = 0.01,
     linout = TRUE,
     trace = FALSE,
     maxit = 500,
     MaxNWts = 5 * (ncol(train_set$x) +1) +5 +1)

nnetPred <- predict(nnetModel, test_set$x)

postResample(nnetPred, test_set$y)
##     RMSE Rsquared      MAE 
## 2.506230 0.144617 2.029931

For this data set, the MARS model gives the best test-set performance (RMSE 1.09, R-squared 0.67).

PART B

In the MARS model, variable importance comes from evimp(). (Indexing the columns of train_set$x with marsModel$selected.terms would be a bug: selected.terms holds basis-function indices, not predictor columns.)

important_terms <- evimp(marsModel)
(important_term_names <- rownames(important_terms))

Per the model summary above, the seven selected predictors are BiologicalMaterial03, ManufacturingProcess04, ManufacturingProcess09, ManufacturingProcess13, ManufacturingProcess30, ManufacturingProcess32, and ManufacturingProcess39, with ManufacturingProcess32 and ManufacturingProcess09 ranked most important. Process variables dominate the list: six of the seven selected predictors are manufacturing process variables.
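earth can also plot these importance measures directly (a quick sketch):

plot(evimp(marsModel))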

PART C

important_predictors <- ChemicalManufacturingProcess[important_term_names]

chem_man_process_data <- cbind(important_predictors, Yield = ChemicalManufacturingProcess$Yield)
for(i in seq_along(important_term_names)) {
  plt <- ggplot(chem_man_process_data,
                aes(x = .data[[important_term_names[i]]], y = Yield)) +
    geom_point() +
    geom_smooth(method = lm, se = FALSE) +
    labs(x = important_term_names[i])
  print(plt)
}
## `geom_smooth()` using formula = 'y ~ x'
## (message repeats for each plot: one scatterplot of Yield against each
## selected predictor, with a linear trend line)