library(caret)
library(tidyverse)
library(tidymodels)
library(pls)
library(glmnet)
library(AppliedPredictiveModeling)
data(permeability)
# Filter out the predictors that have low frequencies
nzv_labels <- nearZeroVar(fingerprints, names = TRUE)
filtered_fingerprints <- as_tibble(fingerprints) %>% select(-nzv_labels)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(nzv_labels)
##
## # Now:
## data %>% select(all_of(nzv_labels))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
dim(filtered_fingerprints)
## [1] 165 388
Without changing the default values fof the parameters freqcut or uniquecut, the nearZeroVar function filtered out 719 predictors leaving 388.
# Bind the response variable to the fingerprints
filtered_fingerprints <- cbind(filtered_fingerprints, permeability)
# Set prop to 0.7 so 70% goes into our training data and 30% goes into our testing set.
set.seed(123)
fp_split <- initial_split(filtered_fingerprints,
prop = 0.75)
# Create the training data
fp_training <- fp_split %>%
training()
fpXtrain <- fp_training %>% select(-permeability)
fpYtrain <- fp_training %>% select(permeability)
# Create the test data
fp_test <- fp_split %>%
testing()
fpXtest <- fp_test %>% select(-permeability)
fpYtest <- fp_test %>% select(permeability)
set.seed(124)
ctrl <- trainControl(method = "cv", number = 10)
plsFit <- train(fpXtrain, fpYtrain$permeability, method = "pls",
## The default tuning grid evaluates
## components 1... tuneLength
tuneLength = 20,
trControl = ctrl,
preProc = c("center", "scale"))
# Obtain report
summary(plsFit)
## Data: X dimension: 123 388
## Y dimension: 123 1
## Fit method: oscorespls
## Number of components considered: 2
## TRAINING: % variance explained
## 1 comps 2 comps
## X 23.42 34.83
## .outcome 29.24 47.27
According to the output there were 6 components considered and the \(R^2\) of 47.27%.
plot(plsFit, metric ="Rsquared", main = "PLS Tuning Parameter for Permeability Data")
I used 2 components to predict the values in the test set
# Predict permeability in the test set
set.seed(124)
plsPredict <- predict(plsFit, ncomp = 2, newdata = fpXtest)
# Calculate RMSEP
rmse_pls = sqrt(mean((plsPredict-fpYtest$permeability)^2 ))
# Calculate R^2 using R2()
r2_pls = cor(plsPredict,fpYtest$permeability,method="pearson")^2
The test set estimate of RMSE is 9.7610595
The test set estimate of the R_squared is 54.5152162
Ridege Regression
# Fit ridge model with glmnet function using cross validation
set.seed(125)
ridgeModel <- cv.glmnet(x = as.matrix(fpXtrain), y = fpYtrain$permeability, type.measure="mse", alpha=0)
summary(ridgeModel)
## Length Class Mode
## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 5 -none- call
## name 1 -none- character
## glmnet.fit 12 elnet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
## index 2 -none- numeric
# Plot lambda value against the MSE
plot(ridgeModel)
# Predict with largest value of lambda that provides cross-validated error within one standard error of the minimum.
ridgePred <- predict(ridgeModel, ridgeModel$lambda.1se, new = as.matrix(fpXtest))
# Compute MSE
rmse_ridge <- sqrt(mean((fpYtest$permeability - ridgePred)^2))
Compared to the RMSE of the PLS model: 9.7610595, the RMSE of the ridge regression was higher: 11.1569029.
Lasso Regression
# Fit a Lasso regression model
set.seed(126)
lassoModel <- cv.glmnet(x = as.matrix(fpXtrain), y = fpYtrain$permeability,
type.measure="mse",
alpha=1)
summary(lassoModel)
## Length Class Mode
## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 5 -none- call
## name 1 -none- character
## glmnet.fit 12 elnet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
## index 2 -none- numeric
plot(lassoModel)
# Predict values
lassoPred <- predict(lassoModel, lassoModel$lambda.1se, new=as.matrix(fpXtest))
# RMSE
rmse_lasso <- sqrt(mean((fpYtest$permeability - lassoPred)^2))
rmse_lasso
## [1] 12.4413
Out of the three regression models, the pls model had the lowest RMSE so in this case I would not recommend either model over the pls model.
# Load data
data("ChemicalManufacturingProcess")
chemical_df <- ChemicalManufacturingProcess
# head(chemical_df)
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
# Find number of missing values in each predictor
chemical_df %>% summarise_all(~ sum(is.na(.))) %>% pivot_longer(cols = colnames(chemical_df), names_to = "Variable", values_to = "Counts") %>%
filter(Counts > 0)
## # A tibble: 28 × 2
## Variable Counts
## <chr> <int>
## 1 ManufacturingProcess01 1
## 2 ManufacturingProcess02 3
## 3 ManufacturingProcess03 15
## 4 ManufacturingProcess04 1
## 5 ManufacturingProcess05 1
## 6 ManufacturingProcess06 2
## 7 ManufacturingProcess07 1
## 8 ManufacturingProcess08 1
## 9 ManufacturingProcess10 9
## 10 ManufacturingProcess11 10
## # ℹ 18 more rows
There are 28 predictors with missing values. Of the 28 predictors all are from predictors that describe the manufacturing process.
# Create a function that imputes the median of the predictor for the missing values
for(i in 1:ncol(chemical_df)) {
chemical_df[ , i][is.na(chemical_df[ , i])] <- median(chemical_df[ , i], na.rm=TRUE)
}
set.seed(129)
chemical_split <- initial_split(chemical_df,
prop = 0.7)
# Create the training data
chemical_train <- chemical_split %>%
training()
chemXtrain <- chemical_train %>% select(-Yield)
chemYtrain <- chemical_train %>% select(Yield)
# Create the test data
chemical_test <- chemical_split %>%
testing()
chemXtest <- chemical_test %>% select(-Yield)
chemYtest <- chemical_test %>% select(Yield)
I choose to fit an elasticnet model. The elastic net model assists with determining which variables are most useful by employing both lasso and ridge regularization strategies.
Rather than using two different values of lambda for the lasso and ridge component, the glmnet function uses an alpha parameter. When \(0 < \alpha < 1\) we get a mix of the two regularization methods. Cross validation will also be used to find the best lambda value. The first model fitted with glmnet is with an alpha value of 0.5.
# Fit an elastic net model
alpha0_5_fit <- cv.glmnet(as.matrix(chemXtrain), chemYtrain$Yield,
type.measure="mse",
alpha=0.5, family="gaussian")
summary(alpha0_5_fit)
## Length Class Mode
## lambda 100 -none- numeric
## cvm 100 -none- numeric
## cvsd 100 -none- numeric
## cvup 100 -none- numeric
## cvlo 100 -none- numeric
## nzero 100 -none- numeric
## call 6 -none- call
## name 1 -none- character
## glmnet.fit 12 elnet list
## lambda.min 1 -none- numeric
## lambda.1se 1 -none- numeric
## index 2 -none- numeric
What is the optimal value of the performance metric?
#alpha0_5_fit$glmnet.fit
print(alpha0_5_fit)
##
## Call: cv.glmnet(x = as.matrix(chemXtrain), y = chemYtrain$Yield, type.measure = "mse", alpha = 0.5, family = "gaussian")
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.0702 39 1.112 0.1485 27
## 1se 0.4110 20 1.246 0.1632 9
plot(alpha0_5_fit)
With an alpha of 0.5, and lambda of 0.4110, the Mean square error is 1.246.
alpha0_5_pred <-
predict(alpha0_5_fit, s=alpha0_5_fit$lambda.1se, newx=as.matrix(chemXtest))
mean((chemYtest$Yield - alpha0_5_pred)^2)
## [1] 3.467167
The model does fairly well on the test set, producing a mean square error of 3.467. This is higher and other values of alpha might be better. Next steps would be to compare models using different values of alpha and see which preforms best.
Based on the plot above we should have between 9-10 non zero. Looking at what predictors are most important based on their coefficients we ses that manufacturing processes dominate the list, with only two biologic predictor making the list.
alpha0_5_fit_coef <- coef(alpha0_5_fit)
predictor_name <- alpha0_5_fit_coef@Dimnames[[1]][alpha0_5_fit_coef@i+1]
predictor_coeff <- alpha0_5_fit_coef@x
top_predictors <- tibble(Predictor_Name = predictor_name, Predictor_Coefficient = predictor_coeff) %>% arrange(desc(Predictor_Coefficient))
top_predictors
## # A tibble: 10 × 2
## Predictor_Name Predictor_Coefficient
## <chr> <dbl>
## 1 (Intercept) 23.5
## 2 ManufacturingProcess09 0.124
## 3 ManufacturingProcess32 0.117
## 4 ManufacturingProcess06 0.0770
## 5 BiologicalMaterial02 0.0133
## 6 BiologicalMaterial03 0.0106
## 7 ManufacturingProcess31 -0.125
## 8 ManufacturingProcess17 -0.148
## 9 ManufacturingProcess13 -0.211
## 10 ManufacturingProcess36 -192.
Looking the correlations between the response variable and the top predictors in the correlation plot we can see that there are some moderate relationships between biological material #3, manufacturing processes numbers 9, 13, 32, and 36. Manufacturing processes numbers 9 and 32 have positive correlations so optimizing those would increase the yield. On the other hand manufacturing processes 13 and 36 have negative correlations so minimizing the outputs from these would also increase the yield. Looking the correlations between the top predictors also reveal that minimizing the output of those that have a negative correlation with the predictors that increase yield might also be important. For example, manufacturing process #32 has a negative correlation with #36. As mention before #32 is positively correlated with Yield.
library(corrplot)
## corrplot 0.92 loaded
##
## Attaching package: 'corrplot'
## The following object is masked from 'package:pls':
##
## corrplot
predictor_name <- append(predictor_name, "Yield")
#chemical_df %>% select(predictor_name[c(2:length(predictor_name))])
corr <- cor(chemical_df %>% select(predictor_name[c(2:length(predictor_name))]))
corrplot(corr, method = 'square', type = 'lower', diag = FALSE)