library(caret)
library(glmnet)
library(AppliedPredictiveModeling)
library(e1071)
library(dplyr)
library(magrittr)
data(permeability)
str(fingerprints)
## num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
# Remove predictors with near-zero variance
fingerprints <- fingerprints[, -nearZeroVar(fingerprints)]
# Check the dimensions of the filtered matrix
dim(fingerprints)
## [1] 165 388
set.seed(123)
# Create a training-test split (80% training, 20% testing)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
trainData <- fingerprints[trainIndex, ]
testData <- fingerprints[-trainIndex, ]
trainResponse <- permeability[trainIndex]
testResponse <- permeability[-trainIndex]
trainSet <- data.frame(permeability = trainResponse, trainData)
preProc <- preProcess(trainSet[, -1], method = c("center", "scale"))
trainDataProcessed <- predict(preProc, trainSet[, -1])
# Apply preprocessing to the test set using the same preProc object
testDataProcessed <- predict(preProc, testData)
# Combine processed data with the response variable for training set
trainDataProcessed <- data.frame(permeability = trainResponse, trainDataProcessed)
# We’ll use cross-validation to find the optimal number of latent variables
control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Train the PLS model and tune the number of components
plsFit <- train(
permeability ~ .,
data = trainDataProcessed,
method = "pls",
tuneLength = 20,
trControl = control,
preProcess = c("center", "scale")
)
print(plsFit)
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.31894 0.3442124 10.254018
## 2 11.78898 0.4830504 8.534741
## 3 11.98818 0.4792649 9.219285
## 4 12.04349 0.4923322 9.448926
## 5 11.79823 0.5193195 9.049121
## 6 11.53275 0.5335956 8.658301
## 7 11.64053 0.5229621 8.878265
## 8 11.86459 0.5144801 9.265252
## 9 11.98385 0.5188205 9.218594
## 10 12.55634 0.4808614 9.610747
## 11 12.69674 0.4758068 9.702325
## 12 13.01534 0.4538906 9.956623
## 13 13.12637 0.4367362 9.878017
## 14 13.44865 0.4140715 10.065088
## 15 13.60135 0.4034269 10.188150
## 16 13.79361 0.3943904 10.247160
## 17 14.00756 0.3845119 10.412776
## 18 14.18113 0.3711378 10.587027
## 19 14.25674 0.3703610 10.575726
## 20 14.33121 0.3723176 10.679764
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
plsPredictions <- predict(plsFit, newdata = testDataProcessed)
# Create a data frame for observed and predicted values
plsAcc <- data.frame(obs = testResponse, pred = plsPredictions)
# Calculate the test set estimate of R^2
testR2 <- cor(plsAcc$obs, plsAcc$pred)^2
# Display the test set R^2
print(paste("Test set estimate of R^2 for the PLS model:", round(testR2, 4)))
## [1] "Test set estimate of R^2 for the PLS model: 0.3245"
set.seed(997)
lassoControl <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Train the Lasso model
lassoModel <- train(
permeability ~ .,
data = trainDataProcessed,
method = "lasso",
tuneLength = 10,
trControl = lassoControl,
preProcess = c("center", "scale")
)
# Predictions for Lasso
lassoPredictions <- predict(lassoModel, newdata = testDataProcessed)
# Calculate test set R^2 for Lasso
lassoAcc <- data.frame(obs = testResponse, pred = lassoPredictions)
lassoR2 <- cor(lassoAcc$obs, lassoAcc$pred)^2
# Print Lasso model performance
print(paste("Test set estimate of R^2 for Lasso model:", round(lassoR2, 4)))
## [1] "Test set estimate of R^2 for Lasso model: 0.3877"
set.seed(991)
larsControl <- trainControl(method = "cv", number = 10) # 10-fold cross-validation for LARS
# Train the LARS model
larsModel <- train(
permeability ~ ., # Formula interface, using processed training data
data = trainDataProcessed,
method = "lars", # Specify method as lars
tuneLength = 10, # Use 10 values for tuning the lambda
trControl = larsControl,
preProcess = c("center", "scale")
)
# Predictions for LARS
larsPredictions <- predict(larsModel, newdata = testDataProcessed)
# Calculate test set R^2 for LARS
larsAcc <- data.frame(obs = testResponse, pred = larsPredictions)
larsR2 <- cor(larsAcc$obs, larsAcc$pred)^2
# Print LARS model performance
print(paste("Test set estimate of R^2 for LARS model:", round(larsR2, 4)))
## [1] "Test set estimate of R^2 for LARS model: 0.3648"
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
chem <- ChemicalManufacturingProcess
sum(is.na(chem))
## [1] 106
nas <- preProcess(chem, method = "bagImpute")
chem_filled <- predict(nas, chem)
# Check if it worked
sum(is.na(chem_filled))
## [1] 0
####(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
# filtering low frequencies
chem_filled <- chem_filled[, -nearZeroVar(chem_filled)]
ctrl <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
set.seed(667)
index <- createDataPartition(chem_filled$Yield, p = .8, list = FALSE)
set.seed(667)
# Training set
train_chem <- chem_filled[index, ]
# Test set
test_chem <- chem_filled[-index, ]
# Train the PLS model with tuning
plsTune <- train(Yield ~ .,
data = train_chem,
method = "pls",
tuneLength = 20,
trControl = ctrl,
preProc = c("center", "scale"))
print(plsTune)
## Partial Least Squares
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 129, 129, 128, 131, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.410880 0.4465833 1.129983
## 2 1.408656 0.4790663 1.079502
## 3 1.600099 0.5325645 1.128815
## 4 2.046151 0.4953885 1.254460
## 5 2.230639 0.4861753 1.290749
## 6 2.341817 0.4924156 1.318533
## 7 2.403372 0.4926435 1.340414
## 8 2.507791 0.4861479 1.392677
## 9 2.487955 0.4841401 1.385031
## 10 2.420416 0.4693683 1.372123
## 11 2.337806 0.4676636 1.352844
## 12 2.302591 0.4680067 1.354070
## 13 2.363504 0.4801033 1.371043
## 14 2.457141 0.4798591 1.387208
## 15 2.492464 0.4744437 1.394941
## 16 2.502890 0.4727444 1.396571
## 17 2.482216 0.4740394 1.388951
## 18 2.471576 0.4753953 1.386554
## 19 2.481864 0.4782736 1.391768
## 20 2.483192 0.4773869 1.392706
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
plot(plsTune)
test_predictions <- predict(plsTune, newdata = test_chem)
# Step 2: Calculate performance metrics on the test set
# Create a data frame for observed and predicted values
performance_df <- data.frame(obs = test_chem$Yield, pred = test_predictions)
# Calculate RMSE
rmse_test <- sqrt(mean((performance_df$obs - performance_df$pred)^2))
# Calculate R^2
r_squared_test <- cor(performance_df$obs, performance_df$pred)^2
cat("Test Set RMSE:", rmse_test, "\n")
## Test Set RMSE: 3.317378
cat("Test Set R^2:", r_squared_test, "\n")
## Test Set R^2: 0.1187605
# Compare with training set performance metrics
train_rmse <- min(plsTune$results$RMSE)
train_r_squared <- max(plsTune$results$Rsquared)
cat("Training Set RMSE:", train_rmse, "\n")
## Training Set RMSE: 1.408656
cat("Training Set R^2:", train_r_squared, "\n")
## Training Set R^2: 0.5325645
importance <- varImp(plsTune, scale = FALSE)
## Warning: package 'pls' was built under R version 4.3.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
print(importance)
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 0.13004
## ManufacturingProcess13 0.12050
## ManufacturingProcess09 0.11409
## ManufacturingProcess36 0.10753
## ManufacturingProcess17 0.10724
## BiologicalMaterial02 0.09911
## BiologicalMaterial06 0.09718
## BiologicalMaterial03 0.09141
## BiologicalMaterial08 0.08380
## ManufacturingProcess06 0.08335
## ManufacturingProcess33 0.08332
## BiologicalMaterial12 0.08220
## BiologicalMaterial01 0.08055
## BiologicalMaterial11 0.07931
## ManufacturingProcess12 0.07887
## BiologicalMaterial04 0.07765
## ManufacturingProcess11 0.06901
## ManufacturingProcess28 0.06407
## ManufacturingProcess04 0.05419
## BiologicalMaterial10 0.04837
library(corrplot)
## corrplot 0.94 loaded
##
## Attaching package: 'corrplot'
## The following object is masked from 'package:pls':
##
## corrplot
topPr <- varImp(plsTune)$importance %>%
arrange(-Overall) %>%
head(10)
# Select the yield and the top predictors
top_predictors_data <- chem_filled %>%
select(c("Yield", row.names(topPr)))
# Calculate the correlation matrix
cor_matrix <- cor(top_predictors_data, use = "complete.obs")
corrplot(cor_matrix,
method = "circle",
type = "full",
tl.col = "black",
tl.srt = 29,
tl.cex = 0.7,
cex.axis = 0.7,
font.lab = 2,
font.axis = 2,
diag = TRUE)