Homework7

library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(glmnet)
library(RANN)
data(permeability)
data(ChemicalManufacturingProcess)

6.2

(b) How many predictors are left for modeling?

nzv_fingerprints <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv_fingerprints]
num_predictors_left <- ncol(filtered_fingerprints)
num_predictors_left

## [1] 388

The number of predictors left after filtering is 388.

(c) How many latent variables are optimal and what is the orresponding resampled estimate of R^2?

set.seed(1048)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
trainData <- filtered_fingerprints[trainIndex, ]
testData <- filtered_fingerprints[-trainIndex, ]
trainY <- permeability[trainIndex]
testY <- permeability[-trainIndex]

# Preprocessing: Centering & Scaling
preProc <- preProcess(trainData, method = c("center", "scale"))
trainTransformed <- predict(preProc, trainData)
testTransformed <- predict(preProc, testData)

# Train PLS model
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
plsFit <- train(trainTransformed, trainY,
                method = "pls",
                tuneLength = 20,
                trControl = ctrl,
                metric = "Rsquared")

plsFit$bestTune

##   ncomp
## 2     2

max(plsFit$results$Rsquared)

## [1] 0.4869915

The optimal number of latent variables is 2. The resampled estimate of R^2 is 0.4869915.

(d) What is the test set estimate of R^2?

plsPred <- predict(plsFit, testTransformed)
testR2 <- caret::R2(plsPred, testY, form = "traditional")
testR2

## [1] 0.2852142

The test set estimate of R^2 is 0.2852142.

(e) Do any have better predictive performance?

# Linear Regression (with PCA to reduce multicollinearity)
preProcPCA <- preProcess(trainTransformed, method = c("center", "scale", "pca"), thresh = 0.95)
trainPCA <- predict(preProcPCA, trainTransformed)
testPCA <- predict(preProcPCA, testTransformed)

lmFit <- train(trainPCA, trainY,
               method = "lm",
               trControl = ctrl)
lmPred <- predict(lmFit, testPCA)
lmR2 <- caret::R2(lmPred, testY, form = "traditional")

lmR2

## [1] 0.3404494

The estimate of R^2 for linear regression model is 0.3404494.

# Ridge Regression (use glmnet with alpha = 0)
lambdaGrid <- 10^seq(2, -2, length = 100)
ridgeGrid <- expand.grid(alpha = 0, lambda = lambdaGrid)

ridgeFit <- train(trainTransformed, trainY,
                  method = "glmnet",
                  trControl = ctrl,
                  tuneGrid = ridgeGrid,
                  preProcess = c("center", "scale"))
ridgePred <- predict(ridgeFit, testTransformed)
ridgeR2 <- caret::R2(ridgePred, testY, form = "traditional")

ridgeR2

## [1] 0.3879057

The estimate of R^2 for ridge regression model is 0.3879057. The ridge regression model has better predictive performance.

6.3

(b) Using an imputation function to fill in the missing values

# Impute missing values using KNN, with scaling and centering
processed <- preProcess(ChemicalManufacturingProcess, 
                       method = c("knnImpute", "center", "scale"))
imputedData <- predict(processed, ChemicalManufacturingProcess)

(c) What is the optimal value of the performance metric?

set.seed(1225)
trainIndex2 <- createDataPartition(imputedData$Yield, p = 0.8, list = FALSE)
trainData2 <- imputedData[trainIndex2, ]
testData2 <- imputedData[-trainIndex2, ]

# Prepare x and y matrices for glmnet
x <- model.matrix(Yield ~ ., trainData2)[, -1]  # Remove intercept column
y <- trainData2$Yield

# Set up cross-validation
ctrl2 <- trainControl(method = "cv", number = 10)

# Tune ridge regression (alpha = 0 for ridge, alpha = 1 for lasso)
ridgeTune2 <- train(x = x, y = y,
                  method = "glmnet",
                  tuneGrid = expand.grid(alpha = 0,  # Pure ridge regression
                                       lambda = 10^seq(-3, 3, length = 100)),
                  trControl = ctrl2)

# View results
ridgeTune2$bestTune  # Optimal lambda value

##    alpha    lambda
## 46     0 0.5336699

ridgeTune2$results$Rsquared   # Performance metrics

##   [1] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
##   [8] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
##  [15] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
##  [22] 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414 0.5392414
##  [29] 0.5392414 0.5392414 0.5404528 0.5417727 0.5428218 0.5435560 0.5440543
##  [36] 0.5443117 0.5443889 0.5443055 0.5440833 0.5437533 0.5433368 0.5428677
##  [43] 0.5423155 0.5416253 0.5407493 0.5395484 0.5378202 0.5353941 0.5320763
##  [50] 0.5280050 0.5233821 0.5185832 0.5136991 0.5088727 0.5040066 0.4991455
##  [57] 0.4941804 0.4891816 0.4840871 0.4789875 0.4738457 0.4687579 0.4637139
##  [64] 0.4588072 0.4540358 0.4494818 0.4451549 0.4411105 0.4373426 0.4338860
##  [71] 0.4307316 0.4278884 0.4253430 0.4230793 0.4210890 0.4193418 0.4178241
##  [78] 0.4164987 0.4153578 0.4143631 0.4135114 0.4127678 0.4121329 0.4115767
##  [85] 0.4111024 0.4106855 0.4103338 0.4100202 0.4097523 0.4095155 0.4093137
##  [92] 0.4091351 0.4089830 0.4088481 0.4087332 0.4086602       NaN       NaN
##  [99]       NaN       NaN

(d) What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

# Prepare test data for prediction
testX2 <- model.matrix(Yield ~ ., testData2)[, -1]  # Remove intercept
testY2 <- testData2$Yield

# Predict on test set
ridgePred2 <- predict(ridgeTune2, newx = testX2)  # For glmnet, use newx=

# Calculate performance
testResults2 <- postResample(ridgePred2, testY)
testResults2  # Shows RMSE, R-squared, and MAE

##         RMSE     Rsquared          MAE 
##           NA 0.0004047611           NA

The test performance is worse.

(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

# Extract coefficients for the best lambda
coefs <- coef(ridgeTune2$finalModel, ridgeTune2$bestTune$lambda)
coefs <- as.matrix(coefs)
coefs <- data.frame(Predictor = rownames(coefs), Coefficient = coefs[, 1])
coefs <- coefs[coefs$Predictor != "(Intercept)", ]  # Remove intercept

# Sort by absolute magnitude of coefficients
coefs$AbsCoefficient <- abs(coefs$Coefficient)
coefs <- coefs[order(-coefs$AbsCoefficient), ]

# Top 10 predictors
head(coefs, 10)

##                                     Predictor Coefficient AbsCoefficient
## ManufacturingProcess32 ManufacturingProcess32  0.18101209     0.18101209
## ManufacturingProcess09 ManufacturingProcess09  0.13784359     0.13784359
## ManufacturingProcess36 ManufacturingProcess36 -0.11752078     0.11752078
## ManufacturingProcess34 ManufacturingProcess34  0.10722632     0.10722632
## ManufacturingProcess13 ManufacturingProcess13 -0.10396974     0.10396974
## ManufacturingProcess17 ManufacturingProcess17 -0.09185220     0.09185220
## ManufacturingProcess37 ManufacturingProcess37 -0.08085014     0.08085014
## BiologicalMaterial03     BiologicalMaterial03  0.07811426     0.07811426
## ManufacturingProcess33 ManufacturingProcess33  0.06988847     0.06988847
## ManufacturingProcess11 ManufacturingProcess11  0.06925365     0.06925365

# Check if biological or process predictors dominate
# Biological predictors are named 'BiologicalMaterialXX', process predictors are 'ManufacturingProcessXX'
sum(grepl("Biological", coefs$Predictor[1:10]))  # Count bio predictors in top 10

## [1] 1

sum(grepl("Manufacturing", coefs$Predictor[1:10]))  # Count process predictors in top 10

## [1] 9

Process predictors dominate the top of the list because they are more directly controllable and likely have stronger relationships with yield.

(f) How ould this information be helpful in improving yield in future runs of the manufacturing process?

# Plot top 3 predictors vs. yield
top3 <- coefs$Predictor[1:3]

# Biological vs. Process predictors may need different interpretation
plot_list <- list()
for (pred in top3) {
  plot_list[[pred]] <- ggplot(imputedData, aes(x = .data[[pred]], y = Yield)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "loess", se = FALSE, color = "red") +
    labs(title = paste("Yield vs.", pred))
}

# Display plots
library(gridExtra)
grid.arrange(grobs = plot_list, ncol = 1)

Process parameters (e.g., temperature, reaction time) often show direct, actionable trends—yield may peak at specific values, suggesting optimal settings for future runs.

Biological inputs (e.g., raw material properties) help screen batches; if yield consistently drops for certain measurements, those materials can be flagged early.

Actionable insight: Prioritize adjusting controllable process variables for immediate yield gains, while using biological data for quality control of raw materials.

Homework7

Yanyi Li

2025-04-06

6.2

(b) How many predictors are left for modeling?

(c) How many latent variables are optimal and what is the orresponding resampled estimate of R^2?

(d) What is the test set estimate of R^2?

(e) Do any have better predictive performance?

6.3

(b) Using an imputation function to fill in the missing values

(c) What is the optimal value of the performance metric?

(d) What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

(f) How ould this information be helpful in improving yield in future runs of the manufacturing process?

Homework7

Yanyi Li

2025-04-06

6.2

(b) How many predictors are left for modeling?

(c) How many latent variables are optimal and what is the orresponding resampled estimate of R^2?

(d) What is the test set estimate of R^2?

(e) Do any have better predictive performance?

(f) Would you recommend any of your models to replace the permeability laboratory experiment?

6.3

(b) Using an imputation function to fill in the missing values

(c) What is the optimal value of the performance metric?

(d) What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

(f) How ould this information be helpful in improving yield in future runs of the manufacturing process?