Data_HW8

library(mlbench)
library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.3.3

## Loading required package: lattice

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
 ## We convert the 'x' data from a matrix to a data frame
 ## One reason is that this will give the columns names.
 trainingData$x <- data.frame(trainingData$x)
 ## Look at the data using
 featurePlot(trainingData$x, trainingData$y)

 ## or other methods.
 ## This creates a list with a vector 'y' and a matrix
 ## of predictors 'x'. Also simulate a large test set to
 ## estimate the true error rate with good precision:
 testData <- mlbench.friedman1(5000, sd = 1)
 testData$x <- data.frame(testData$x)

# Train a k-NN model
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)

# Print the model details
print(knnModel)

## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.

# Predict on test data
knnPred <- predict(knnModel, newdata = testData$x)

# Evaluate the model performance
postResample(pred = knnPred, obs = testData$y)

##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

plot(knnModel)

library(caret)
library(mlbench) 
library(earth)

## Warning: package 'earth' was built under R version 4.3.3

## Loading required package: Formula

## Loading required package: plotmo

## Warning: package 'plotmo' was built under R version 4.3.3

## Loading required package: plotrix

trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

# Train a MARS model
marsModel <- train(x = trainingData$x, y = trainingData$y, method = "earth",
                   preProc = c("center", "scale"),
                   tuneLength = 10)

# Print the MARS model details
print(marsModel)

## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##    2      4.227779  0.3439281  3.472885
##    3      3.667327  0.5056307  2.921129
##    4      2.954499  0.6800392  2.337076
##    6      2.513412  0.7699536  1.993898
##    7      2.076450  0.8416729  1.641145
##    8      1.968359  0.8577336  1.520758
##   10      1.815065  0.8793842  1.391116
##   11      1.804839  0.8816600  1.386985
##   12      1.800057  0.8821750  1.389060
##   14      1.837364  0.8781616  1.417508
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 1.

# Predict on test data
marsPred <- predict(marsModel, newdata = testData$x)

# Evaluate the MARS model performance
marsPerformance <- postResample(pred = marsPred, obs = testData$y)
print(marsPerformance)

##      RMSE  Rsquared       MAE 
## 1.8041213 0.8693145 1.4033565

plot(marsModel)

library(caret)
library(kernlab)  # For SVM training

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

# Train an SVM model
svmModel <- train(x = trainingData$x, y = trainingData$y, method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 10)

# Print the SVM model details
print(svmModel)

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.878715  0.7174967  2.240240
##     0.50  2.633289  0.7439245  2.006067
##     1.00  2.494958  0.7626134  1.881132
##     2.00  2.421590  0.7749178  1.818691
##     4.00  2.396770  0.7793750  1.803700
##     8.00  2.389620  0.7805680  1.807756
##    16.00  2.380454  0.7821085  1.797683
##    32.00  2.380454  0.7821085  1.797683
##    64.00  2.380454  0.7821085  1.797683
##   128.00  2.380454  0.7821085  1.797683
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06012792
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06012792 and C = 16.

# Predict on test data
svmPred <- predict(svmModel, newdata = testData$x)

# Evaluate the SVM model performance
svmPerformance <- postResample(pred = svmPred, obs = testData$y)
print(svmPerformance)

##      RMSE  Rsquared       MAE 
## 1.9373452 0.8478925 1.4816257

MARS has lowest RMSE

7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models. (a) Which nonlinear regression model gives the optimal resampling and test set performance? (b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model? (c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ kernlab::alpha() masks ggplot2::alpha()
## ✖ purrr::cross()   masks kernlab::cross()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ purrr::lift()    masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)
library(AppliedPredictiveModeling)

## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3

data("ChemicalManufacturingProcess")

# Setting up k-NN imputation
impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
# Applying the k-NN imputation
ChemicalManufacturingProcess <- predict(impute, ChemicalManufacturingProcess)

set.seed(27) 

# Remove features with near-zero variance from the dataset
ChemicalManufacturingProcess <- ChemicalManufacturingProcess %>% 
                                select(-nearZeroVar(.))

# Partition the data into training and testing sets with 80% of the data allocated for training
train_indices <- createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.8, list = FALSE)
train_set <- ChemicalManufacturingProcess[train_indices, ]
test_set <- ChemicalManufacturingProcess[-train_indices, ]

# Train a k-NN model
knnModel <- train(Yield ~ ., data = train_set, method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10,
                  trControl = trainControl(method = "cv", number = 10))

# Print the model details
print(knnModel)

## k-Nearest Neighbors 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 131, 130, 130, 129, 130, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    5  0.6997785  0.5387560  0.5523811
##    7  0.7006560  0.5446173  0.5649727
##    9  0.7108708  0.5336977  0.5748528
##   11  0.7084606  0.5378014  0.5688690
##   13  0.7307448  0.5002814  0.5876982
##   15  0.7295459  0.5119678  0.5892702
##   17  0.7269048  0.5270187  0.5856231
##   19  0.7326932  0.5223252  0.5880881
##   21  0.7299442  0.5384177  0.5839868
##   23  0.7452009  0.5123186  0.5981073
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.

# Predict on test data
knnPred <- predict(knnModel, newdata = test_set)

# Evaluate the k-NN model performance
knnPerformance <- postResample(pred = knnPred, obs = test_set$Yield)
print(knnPerformance)

##      RMSE  Rsquared       MAE 
## 0.8178698 0.3601269 0.7282600

# Train a MARS model using the 'earth' method
marsModel <- train(Yield ~ ., data = train_set, method = "earth",
                   preProc = c("center", "scale"),
                   tuneLength = 10,
                   trControl = trainControl(method = "cv", number = 10))

# Print the MARS model details
print(marsModel)

## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 129, 129, 129, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE       Rsquared   MAE      
##    2      0.7441366  0.4668953  0.5827163
##    3      0.6819378  0.5600650  0.5445381
##    5      0.6079391  0.6566517  0.4981536
##    7      0.6038160  0.6741141  0.4892117
##    8      0.5814847  0.6898357  0.4736876
##   10      0.5674119  0.7007906  0.4670387
##   12      0.5779343  0.6950919  0.4738279
##   13      0.5740130  0.6967953  0.4684200
##   15      0.5627214  0.7069651  0.4512807
##   17      0.5636383  0.7059825  0.4522115
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 1.

# Predict on the test data using the MARS model
marsPred <- predict(marsModel, newdata = test_set)

# Evaluate the MARS model performance
marsPerformance <- postResample(pred = marsPred, obs = test_set$Yield)
print(marsPerformance)

##      RMSE  Rsquared       MAE 
## 0.7891474 0.4046041 0.6115062

# Train a neural network model
nnModel <- train(Yield ~ ., data = train_set, method = "nnet",
                 preProcess = c("center", "scale"),
                 tuneLength = 3,  # This can be adjusted for more detailed tuning
                 trControl = trainControl(method = "cv", number = 10, allowParallel = TRUE),
                 linout = TRUE,  # This ensures the output layer is linear
                 trace = FALSE,  # Turn off training verbosity
                 maxit = 1000)   # Maximum iterations

# Print the neural network model details
print(nnModel)

## Neural Network 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 128, 131, 130, 129, 131, 129, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE       Rsquared   MAE      
##   1     0e+00  0.9239346  0.3997760  0.7248345
##   1     1e-04  0.9055726  0.3740634  0.7166098
##   1     1e-01  0.7463082  0.5255808  0.6318330
##   3     0e+00  1.0845597  0.3577501  0.8697639
##   3     1e-04  1.3733806  0.2880356  1.1180445
##   3     1e-01  0.9039541  0.4355431  0.6950002
##   5     0e+00  1.1535052  0.2746253  0.9725682
##   5     1e-04  0.8763224  0.4241868  0.6858085
##   5     1e-01  0.7086386  0.5517979  0.5652947
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5 and decay = 0.1.

# Predict on the test data using the neural network model
nnPred <- predict(nnModel, newdata = test_set)

# Evaluate the neural network model performance
nnPerformance <- postResample(pred = nnPred, obs = test_set$Yield)
print(nnPerformance)

##      RMSE  Rsquared       MAE 
## 0.8497308 0.4285875 0.6298227

MARS has the lowest RMSE

Data_HW8

2024-05-15