7.2

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)

trainingData$x <- data.frame(trainingData$x)
#Look at the data using feature
featurePlot(trainingData$x, trainingData$y)

#Estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
ctrl <- trainControl(
  method = "boot",
  number = 25
)

#Linear regression
set.seed(200)
lmModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "lm",
  trControl = ctrl
)

#KNN

knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl
)

#MARS

marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneLength = 10,
  trControl = ctrl
)
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.5.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.5.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.5.2
#Random forest

rfModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "rf",
  tuneLength = 5,
  trControl = ctrl
)

#Support vector machine

svmModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl
)

#Neural network

nnetModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trace = FALSE,
  linout = TRUE,
  trControl = ctrl
)

model_list <- list(
  Linear_Regression = lmModel,
  KNN = knnModel,
  MARS = marsModel,
  Random_Forest = rfModel,
  SVM_Radial = svmModel,
  Neural_Network = nnetModel
)

test_results <- lapply(model_list, function(model) {
  pred <- predict(model, newdata = testData$x)
  postResample(pred = pred, obs = testData$y)
})

test_results <- do.call(rbind, test_results)
test_results <- data.frame(
  Model = rownames(test_results),
  test_results,
  row.names = NULL
)

test_results
##               Model     RMSE  Rsquared      MAE
## 1 Linear_Regression 2.697068 0.7084666 2.060054
## 2               KNN 3.228683 0.6871735 2.593973
## 3              MARS 1.776575 0.8727000 1.358367
## 4     Random_Forest 2.413570 0.7930923 1.907306
## 5        SVM_Radial 2.076083 0.8252816 1.577022
## 6    Neural_Network 2.645210 0.7188479 2.025391
#Mars results on importance of informative predictors
marsModel$finalModel
## Selected 10 of 18 terms, and 6 of 10 predictors (nprune=10)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 9 (additive model)
## GCV 2.731203    RSS 447.3848    GRSq 0.889112    RSq 0.9082649
marsImp <- varImp(marsModel)
marsImp
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   82.78
## X2   64.18
## X5   40.21
## X3   28.14
## X6    0.00
plot(marsImp, top = 10)

The results show that the MARS model achieved the best predictive performance, with the lowest RMSE (1.78) and highest R^2 (0.873).The SVM and Random Forest models also did well but not as strongly as MARS. Linear regression and neural networks produced moderately, while KNN performed the worst, probably due to the presence of irrelevant predictors and the high dimensionality of the data as shown in the textbook example

The MARS informative predictors importance results indicate that X1–X5 were identified as the most important predictors, while X6 and lower had little or no visible importance. This agrees with the true underlying function used to generate this data which shown only depends on X1–X5.

7.5

#A
#Testing different models from chapter and using same seed as hw for repetition 
set.seed(62462)

data(ChemicalManufacturingProcess)

chem <- ChemicalManufacturingProcess

chem <- ChemicalManufacturingProcess

ProcessPredictors <- chem[, names(chem) != "Yield"]
yield <- chem$Yield


Training_split <- createDataPartition(yield, p = 0.80, list = FALSE)

trainsplit_x <- ProcessPredictors[Training_split, ]
testsplit_x  <- ProcessPredictors[-Training_split, ]

trainsplit_y <- yield[Training_split]
testsplit_y  <- yield[-Training_split]

medprocess <- preProcess(
  trainsplit_x,
  method = c("medianImpute", "center", "scale")
)

training_x_f <- predict(medprocess, trainsplit_x)
test_x_f    <- predict(medprocess, testsplit_x)

ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5
)


mars_fit <- train(
  x = training_x_f,
  y = trainsplit_y,
  method = "earth",
  trControl = ctrl,
  tuneLength = 10,
  metric = "RMSE"
)


svm_fit <- train(
  x = training_x_f,
  y = trainsplit_y,
  method = "svmRadial",
  trControl = ctrl,
  tuneLength = 10,
  metric = "RMSE"
)
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
rf_fit <- train(
  x = training_x_f,
  y = trainsplit_y,
  method = "rf",
  trControl = ctrl,
  tuneLength = 10,
  metric = "RMSE"
)


gbm_fit <- train(
  x = training_x_f,
  y = trainsplit_y,
  method = "gbm",
  trControl = ctrl,
  tuneLength = 10,
  metric = "RMSE",
  verbose = FALSE
)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
nonlinear_resamples <- resamples(list(
  MARS = mars_fit,
  SVM = svm_fit,
  RandomForest = rf_fit,
  GBM = gbm_fit
))

summary(nonlinear_resamples)
## 
## Call:
## summary.resamples(object = nonlinear_resamples)
## 
## Models: MARS, SVM, RandomForest, GBM 
## Number of resamples: 50 
## 
## MAE 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
## MARS         0.4841509 0.8062760 0.9294589 0.9465545 1.0441384 1.803642    0
## SVM          0.5215231 0.7027138 0.7939892 0.8113800 0.9065167 1.178549    0
## RandomForest 0.5057704 0.7250368 0.8399093 0.8386686 0.9361618 1.240473    0
## GBM          0.5267122 0.7160739 0.8153555 0.8374412 0.9377986 1.179194    0
## 
## RMSE 
##                   Min.   1st Qu.    Median     Mean  3rd Qu.     Max. NA's
## MARS         0.6084239 1.0047145 1.1503193 1.193842 1.273466 3.088726    0
## SVM          0.6475951 0.8547185 0.9979235 1.017687 1.125255 1.564169    0
## RandomForest 0.6620880 0.9175951 1.0492676 1.085483 1.260341 1.638098    0
## GBM          0.6332132 0.9260535 1.0152287 1.062458 1.233116 1.579060    0
## 
## Rsquared 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## MARS         0.1065626 0.5511950 0.6400761 0.6252242 0.7429767 0.8401023    0
## SVM          0.3740499 0.6540322 0.7264424 0.7128082 0.7952394 0.8963321    0
## RandomForest 0.3799379 0.6118480 0.6794435 0.6791131 0.7639873 0.8841133    0
## GBM          0.3294154 0.5712696 0.7224334 0.6786335 0.8003342 0.9192390    0
dotplot(nonlinear_resamples, metric = "RMSE")

nonlinear_test_results <- data.frame(
  Model = c("MARS", "SVM", "Random Forest", "GBM"),
  RMSE = c(
    caret::RMSE(as.vector(predict(mars_fit, test_x_f)), testsplit_y),
    caret::RMSE(as.vector(predict(svm_fit, test_x_f)), testsplit_y),
    caret::RMSE(as.vector(predict(rf_fit, test_x_f)), testsplit_y),
    caret::RMSE(as.vector(predict(gbm_fit, test_x_f)), testsplit_y)
  ),
  Rsquared = c(
    caret::R2(as.vector(predict(mars_fit, test_x_f)), testsplit_y),
    caret::R2(as.vector(predict(svm_fit, test_x_f)), testsplit_y),
    caret::R2(as.vector(predict(rf_fit, test_x_f)), testsplit_y),
    caret::R2(as.vector(predict(gbm_fit, test_x_f)), testsplit_y)
  )
)

nonlinear_test_results
##           Model     RMSE  Rsquared
## 1          MARS 1.164975 0.5940395
## 2           SVM 1.283718 0.5269012
## 3 Random Forest 1.266489 0.5143470
## 4           GBM 1.158034 0.5929873
#B
#Best model
best_nonlinear <- svm_fit   

nonlinear_imp <- varImp(best_nonlinear)

plot(nonlinear_imp, top = 20)

nonlinear_imp_df <- nonlinear_imp$importance |>
  as.data.frame() |>
  tibble::rownames_to_column("Predictor") |>
  arrange(desc(Overall))

head(nonlinear_imp_df, 10)
##                 Predictor   Overall
## 1  ManufacturingProcess32 100.00000
## 2  ManufacturingProcess13  90.39975
## 3    BiologicalMaterial06  89.74252
## 4  ManufacturingProcess36  81.53643
## 5    BiologicalMaterial12  78.41002
## 6    BiologicalMaterial02  78.16844
## 7  ManufacturingProcess09  76.38185
## 8  ManufacturingProcess17  74.88325
## 9    BiologicalMaterial03  73.17861
## 10 ManufacturingProcess31  63.57876

Among top ten predictors 6 are manufacturing process variables and 4 are biological material variables suggesting that process variables slightly hold more influence. Biological variables still play a significant role suggesting that both process conditions and raw material characteristics hold a critical importance for predicting yield. Comparing these results to the optimal linear model there is a strong overlap. 8 of the top 10 predictors appear in both models. This indicates that many predictors have strong main effects that are consistently identified in every model.

The nonlinear model identifies two additional predictors ManufacturingProcess31 and BiologicalMaterial12 that were not among the top predictors in the linear model. This suggests that these variables may have some additional influence or collinearity being applied. It also may suggest there is a nonlinear relationship that isn’t captured by a linear model.

#C
#extract nonlinear top 10
nonlinear_top10 <- head(nonlinear_imp_df, 10)$Predictor

#Top10 from 6.3 results hw
linear_top10 <- c(
  "ManufacturingProcess32",
  "ManufacturingProcess36",
  "ManufacturingProcess13",
  "ManufacturingProcess09",
  "ManufacturingProcess17",
  "ManufacturingProcess33",
  "BiologicalMaterial02",
  "BiologicalMaterial03",
  "BiologicalMaterial08",
  "BiologicalMaterial06"
)

#find unique nonlinear predictors
unique_nonlinear <- setdiff(nonlinear_top10, linear_top10)

unique_nonlinear
## [1] "BiologicalMaterial12"   "ManufacturingProcess31"
#Make Plotting data
plot_data <- training_x_f |>
  as.data.frame() |>
  mutate(Yield = trainsplit_y)

#Plot unique nonlinear predictors
for (var in unique_nonlinear) {
  p <- ggplot(plot_data, aes(x = .data[[var]], y = Yield)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "loess", se = FALSE) +
    labs(
      title = paste("Yield vs", var),
      x = var,
      y = "Yield"
    )
  
  print(p)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

The predictors that were unique to the nonlinear model, ManufacturingProcess31 and BiologicalMaterial12, were plotted against yield to better understand their relationships. The plot for ManufacturingProcess31 shows a nonlinear downward trend, where yield decreases as the predictor increases, but the rate of decrease is not constant. In particular, the relationship becomes steeper in certain regions and then levels off, indicating that an optimal range exists and around -0.5 we see a decrease.

The plot for BiologicalMaterial12 shows yield initially increases as the predictor increases then reaches a peak and then decreases at higher values. This also suggests the presence of an optimal range for this biological variable where yield is maximized with a penalty for too little or too much. The nonlinear patterns help explain why these predictors were not identified as highly important in the linear model that assumes a constant linear relationship. The SVM model is able to capture these more complex relationships, indicating that both process and biological variables can influence yield in a nonlinear way.