library(AppliedPredictiveModeling)
data(permeability)
# The matrix `fingerprints` contains the 1,107 binary
# molecular predictors for the 165 compounds, while
# `permeability` contains the permeability response.
# Filter out the low-frequency predictors using the `nearZeroVar`
# function from the caret package. How many predictors are
# left for modeling?
library(caret)
library(dplyr)
# Drop near-zero-variance fingerprint columns.
# Guard the empty case: `x[, -integer(0)]` would select *no* columns rather
# than all of them, so only subset when something was actually flagged.
nzv_cols <- nearZeroVar(fingerprints)
modeling <- if (length(nzv_cols) > 0) {
  fingerprints[, -nzv_cols, drop = FALSE]
} else {
  fingerprints
}

# Number of predictors left for modeling.
ncol(modeling)

# Modeling frame: retained predictors plus the response as a plain numeric
# column named `permeability` (the formula interface below relies on that name).
model_df <- as.data.frame(modeling)
model_df$permeability <- as.numeric(permeability)
# Split the data into training and test sets and tune a
# PLS model. How many latent variables are optimal
# and what is the corresponding resampled estimate of R^2?
set.seed(123)
# 80/20 split on the response. `list = FALSE` returns row indices as a matrix
# that can be used directly for row subsetting.
train_index <- createDataPartition(
  model_df$permeability,
  p = 0.8,
  list = FALSE
)
# NOTE: these objects are named `train`/`test` because later code refers to
# them; calling `train(...)` still resolves to caret's function.
train <- model_df[train_index, ]
test <- model_df[-train_index, ]

# Tune a PLS model over up to 25 candidate values of `ncomp` using 10-fold CV.
# Centering is requested through train()'s `preProcess` argument; a bare
# `center = TRUE` is not a train() parameter and is merely forwarded to the
# underlying fitting routine.
pls_model <- train(
  permeability ~ .,
  data = train,
  method = "pls",
  preProcess = "center",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 25
)

# Resampling profile across the number of components.
plot(pls_model)

# Resampled performance at the selected number of components.
# The join key is given explicitly instead of being guessed.
inner_join(pls_model$results, pls_model$bestTune, by = "ncomp")

# Hold-out performance of the tuned PLS model.
postResample(pred = predict(pls_model, test), obs = test$permeability)
# Output recorded from the author's run:
#       RMSE   Rsquared        MAE
# 11.9762238  0.3497131  8.4615399
# Resampling scheme shared by every model in this section: 10-fold CV.
# Building the control object does not draw random numbers, so hoisting it
# leaves the random-number stream used by the fits below unchanged.
cv_ctrl <- trainControl(method = "cv", number = 10)

# --- Ridge regression -------------------------------------------------------
# Candidate penalties: 15 evenly spaced values in [0, 0.1].
ridgeGrid <- data.frame(.lambda = seq(0, 0.1, length.out = 15))

# The seed is set once here; the later fits continue from the resulting
# RNG state.
set.seed(100)
# Centering goes through `preProcess`; `center = TRUE` is not a train()
# argument and was only being forwarded to the underlying fit function.
ridgeRegFit <- train(
  permeability ~ .,
  data = train,
  method = "ridge",
  preProcess = "center",
  tuneGrid = ridgeGrid,
  trControl = cv_ctrl
)

# --- Elastic net ------------------------------------------------------------
# 3 penalties x 20 fractions of the full solution path.
enetGrid <- expand.grid(
  .lambda = c(0, 0.01, 0.1),
  .fraction = seq(0.05, 1, length.out = 20)
)
enetTune <- train(
  permeability ~ .,
  data = train,
  method = "enet",
  preProcess = "center",
  tuneGrid = enetGrid,
  trControl = cv_ctrl
)

# --- Principal component regression -----------------------------------------
pcr_model <- train(
  permeability ~ .,
  data = train,
  method = "pcr",
  preProcess = "center",
  trControl = cv_ctrl,
  tuneLength = 25
)

# --- glmnet (caret's default tuning grid) -----------------------------------
glmnet_model <- train(
  permeability ~ .,
  data = train,
  method = "glmnet",
  preProcess = "center",
  trControl = cv_ctrl
)
# Hold-out performance of every fitted model, collected into one table
# (one row per model, one column per metric returned by postResample()).
fitted_models <- list(
  ridge = ridgeRegFit,
  enet = enetTune,
  pcr = pcr_model,
  glmnet = glmnet_model,
  pls = pls_model
)

test_metrics <- do.call(
  rbind,
  lapply(fitted_models, function(fit) {
    postResample(pred = predict(fit, test), obs = test$permeability)
  })
)
test_metrics

# Test-set metrics recorded from the author's run (values depend on the RNG
# state and installed package versions):
#               RMSE   Rsquared        MAE
# ridge   12.8430320  0.4004427  9.2116952
# enet    11.5871592  0.3519218  7.2484765
# pcr     11.7605501  0.3336618  7.5132196
# glmnet  11.0800848  0.3581228  7.3179572
# pls     11.9762238  0.3497131  8.4615399
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
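# The matrix `processPredictors` contains the 57 predictors
# (12 describing the input biological material and 45 describing the
# process predictors) for the 176 manufacturing runs. `yield`
# contains the percent yield for each run.
# (In this script the data are accessed as the single data frame
# `ChemicalManufacturingProcess`, with the response in its `Yield` column.)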
# Print the raw data for inspection.
ChemicalManufacturingProcess

# RANN supplies the nearest-neighbour search used by caret's knnImpute.
library(RANN)

# Impute missing predictor values with k-nearest neighbours.
# The imputation model is fitted on the predictors only: per the caret
# documentation knnImpute also centers and scales its inputs, so including
# `Yield` would both standardise the response and let it inform the imputed
# predictor values.
# NOTE(review): the imputation is still estimated on all rows before the
# train/test split; fit it on the training rows only if a strictly
# leak-free test estimate is required.
predictor_cols <- setdiff(names(ChemicalManufacturingProcess), "Yield")
impute <- preProcess(
  ChemicalManufacturingProcess[, predictor_cols],
  method = "knnImpute"
)
imputed_predictors <- predict(
  impute,
  ChemicalManufacturingProcess[, predictor_cols]
)
chem_data <- cbind(ChemicalManufacturingProcess["Yield"], imputed_predictors)

# Drop near-zero-variance columns by name; `any_of()` is a no-op when nothing
# is flagged.
nzv_names <- nearZeroVar(chem_data, names = TRUE)
chem_data <- chem_data %>% select(-any_of(nzv_names))

# 80/20 split on the response.
set.seed(123)
train_index_chen <- createDataPartition(
  chem_data$Yield,
  p = 0.8,
  list = FALSE
)
train_chem <- chem_data[train_index_chen, ]
test_chem <- chem_data[-train_index_chen, ]

# Tune PLS over up to 25 candidate values of `ncomp` using 10-fold CV.
pls_model_chem <- train(
  Yield ~ .,
  data = train_chem,
  method = "pls",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 25
)

# Resampled performance at the selected number of components, then the full
# tuning profile.
inner_join(pls_model_chem$results, pls_model_chem$bestTune, by = "ncomp")
plot(pls_model_chem)

# Hold-out performance.
postResample(
  pred = predict(pls_model_chem, test_chem),
  obs = test_chem$Yield
)
# Output recorded from the author's original run, in which `Yield` had been
# passed through the imputation step together with the predictors; re-run to
# obtain metrics on the original yield scale:
#      RMSE  Rsquared       MAE
# 0.7313840 0.4838632 0.6282700
# Variable importance of the tuned PLS model; show the 10 highest-ranked
# predictors.
pls_importance <- varImp(pls_model_chem)
plot(pls_importance, top = 10)

library(DataExplorer)

# Take the same top-10 predictors straight from the importance table instead
# of hard-coding column-name suffixes: a suffix such as "09" can match more
# than one column, and a hand-maintained list drifts when the model is refit.
importance_tbl <- pls_importance$importance
ranked_names <- rownames(importance_tbl)[
  order(importance_tbl$Overall, decreasing = TRUE)
]
# Strip any backticks added around names by the formula interface.
top_predictors <- gsub("`", "", head(ranked_names, 10), fixed = TRUE)

# Correlation of the response with those predictors.
chem_data %>%
  select(Yield, all_of(top_predictors)) %>%
  plot_correlation()