library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(glmnet)
library(RANN)
data(permeability)
data(ChemicalManufacturingProcess)
nzv_fingerprints <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv_fingerprints]
num_predictors_left <- ncol(filtered_fingerprints)
num_predictors_left
## [1] 388
The number of predictors left after filtering is 388.
set.seed(1048)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
trainData <- filtered_fingerprints[trainIndex, ]
testData <- filtered_fingerprints[-trainIndex, ]
trainY <- permeability[trainIndex]
testY <- permeability[-trainIndex]
# Preprocessing: Centering & Scaling
preProc <- preProcess(trainData, method = c("center", "scale"))
trainTransformed <- predict(preProc, trainData)
testTransformed <- predict(preProc, testData)
# Train PLS model
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
plsFit <- train(trainTransformed, trainY,
method = "pls",
tuneLength = 20,
trControl = ctrl,
metric = "Rsquared")
plsFit$bestTune
## ncomp
## 2 2
max(plsFit$results$Rsquared)
## [1] 0.4869915
The optimal number of latent variables is 2. The resampled estimate of R^2 is 0.4869915.
plsPred <- predict(plsFit, testTransformed)
testR2 <- caret::R2(plsPred, testY, form = "traditional")
testR2
## [1] 0.2852142
The test set estimate of R^2 is 0.2852142.
# Linear Regression (with PCA to reduce multicollinearity)
preProcPCA <- preProcess(trainTransformed, method = c("center", "scale", "pca"), thresh = 0.95)
trainPCA <- predict(preProcPCA, trainTransformed)
testPCA <- predict(preProcPCA, testTransformed)
lmFit <- train(trainPCA, trainY,
method = "lm",
trControl = ctrl)
lmPred <- predict(lmFit, testPCA)
lmR2 <- caret::R2(lmPred, testY, form = "traditional")
lmR2
## [1] 0.3404494
The estimate of R^2 for linear regression model is 0.3404494.
# Ridge Regression (use glmnet with alpha = 0)
lambdaGrid <- 10^seq(2, -2, length = 100)
ridgeGrid <- expand.grid(alpha = 0, lambda = lambdaGrid)
ridgeFit <- train(trainTransformed, trainY,
method = "glmnet",
trControl = ctrl,
tuneGrid = ridgeGrid,
preProcess = c("center", "scale"))
ridgePred <- predict(ridgeFit, testTransformed)
ridgeR2 <- caret::R2(ridgePred, testY, form = "traditional")
ridgeR2
## [1] 0.3879057
The estimate of R^2 for ridge regression model is 0.3879057. The ridge regression model has better predictive performance.
Yes, the ridge regression model has higher R^2 than the permeability laboratory experiment. This means that the ridge regression model has better predictive performance.
# Impute missing values using KNN, with scaling and centering
processed <- preProcess(ChemicalManufacturingProcess,
method = c("knnImpute", "center", "scale"))
imputedData <- predict(processed, ChemicalManufacturingProcess)
set.seed(1225)
trainIndex2 <- createDataPartition(imputedData$Yield, p = 0.8, list = FALSE)
trainData2 <- imputedData[trainIndex2, ]
testData2 <- imputedData[-trainIndex2, ]
# Prepare x and y matrices for glmnet
x <- model.matrix(Yield ~ ., trainData2)[, -1] # Remove intercept column
y <- trainData2$Yield
# Set up cross-validation
ctrl2 <- trainControl(method = "cv", number = 10)
# Tune ridge regression (alpha = 0 for ridge, alpha = 1 for lasso)
ridgeTune2 <- train(x = x, y = y,
method = "glmnet",
tuneGrid = expand.grid(alpha = 0, # Pure ridge regression
lambda = 10^seq(-3, 3, length = 100)),
trControl = ctrl2)
# View results
ridgeTune2$bestTune # Optimal lambda value
## alpha lambda
## 46 0 0.5336699
ridgeTune2$results$Rsquared # Performance metrics
## [1] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
## [8] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
## [15] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
## [22] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
## [29] 0.5392414 0.5392414 0.5404528 0.5417727 0.5428218 0.5435560 0.5440543
## [36] 0.5443117 0.5443889 0.5443055 0.5440833 0.5437533 0.5433368 0.5428677
## [43] 0.5423155 0.5416253 0.5407493 0.5395484 0.5378202 0.5353941 0.5320763
## [50] 0.5280050 0.5233821 0.5185832 0.5136991 0.5088727 0.5040066 0.4991455
## [57] 0.4941804 0.4891816 0.4840871 0.4789875 0.4738457 0.4687579 0.4637139
## [64] 0.4588072 0.4540358 0.4494818 0.4451549 0.4411105 0.4373426 0.4338860
## [71] 0.4307316 0.4278884 0.4253430 0.4230793 0.4210890 0.4193418 0.4178241
## [78] 0.4164987 0.4153578 0.4143631 0.4135114 0.4127678 0.4121329 0.4115767
## [85] 0.4111024 0.4106855 0.4103338 0.4100202 0.4097523 0.4095155 0.4093137
## [92] 0.4091351 0.4089830 0.4088481 0.4087332 0.4086602 NaN NaN
## [99] NaN NaN
# Prepare test data for prediction
testX2 <- model.matrix(Yield ~ ., testData2)[, -1] # Remove intercept
testY2 <- testData2$Yield
# Predict on test set
ridgePred2 <- predict(ridgeTune2, newx = testX2) # For glmnet, use newx=
# Calculate performance
testResults2 <- postResample(ridgePred2, testY)
testResults2 # Shows RMSE, R-squared, and MAE
## RMSE Rsquared MAE
## NA 0.0004047611 NA
The test performance is worse.
# Extract coefficients for the best lambda
coefs <- coef(ridgeTune2$finalModel, ridgeTune2$bestTune$lambda)
coefs <- as.matrix(coefs)
coefs <- data.frame(Predictor = rownames(coefs), Coefficient = coefs[, 1])
coefs <- coefs[coefs$Predictor != "(Intercept)", ] # Remove intercept
# Sort by absolute magnitude of coefficients
coefs$AbsCoefficient <- abs(coefs$Coefficient)
coefs <- coefs[order(-coefs$AbsCoefficient), ]
# Top 10 predictors
head(coefs, 10)
## Predictor Coefficient AbsCoefficient
## ManufacturingProcess32 ManufacturingProcess32 0.18101209 0.18101209
## ManufacturingProcess09 ManufacturingProcess09 0.13784359 0.13784359
## ManufacturingProcess36 ManufacturingProcess36 -0.11752078 0.11752078
## ManufacturingProcess34 ManufacturingProcess34 0.10722632 0.10722632
## ManufacturingProcess13 ManufacturingProcess13 -0.10396974 0.10396974
## ManufacturingProcess17 ManufacturingProcess17 -0.09185220 0.09185220
## ManufacturingProcess37 ManufacturingProcess37 -0.08085014 0.08085014
## BiologicalMaterial03 BiologicalMaterial03 0.07811426 0.07811426
## ManufacturingProcess33 ManufacturingProcess33 0.06988847 0.06988847
## ManufacturingProcess11 ManufacturingProcess11 0.06925365 0.06925365
# Check if biological or process predictors dominate
# Biological predictors are named 'BiologicalMaterialXX', process predictors are 'ManufacturingProcessXX'
sum(grepl("Biological", coefs$Predictor[1:10])) # Count bio predictors in top 10
## [1] 1
sum(grepl("Manufacturing", coefs$Predictor[1:10])) # Count process predictors in top 10
## [1] 9
Process predictors dominate the top of the list because they are more directly controllable and likely have stronger relationships with yield.
# Plot top 3 predictors vs. yield
top3 <- coefs$Predictor[1:3]
# Biological vs. Process predictors may need different interpretation
plot_list <- list()
for (pred in top3) {
plot_list[[pred]] <- ggplot(imputedData, aes(x = .data[[pred]], y = Yield)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE, color = "red") +
labs(title = paste("Yield vs.", pred))
}
# Display plots
library(gridExtra)
grid.arrange(grobs = plot_list, ncol = 1)
Process parameters (e.g., temperature, reaction time) often show direct, actionable trends—yield may peak at specific values, suggesting optimal settings for future runs.
Biological inputs (e.g., raw material properties) help screen batches; if yield consistently drops for certain measurements, those materials can be flagged early.
Actionable insight: Prioritize adjusting controllable process variables for immediate yield gains, while using biological data for quality control of raw materials.