data(permeability)
low_freq <- nearZeroVar(fingerprints, saveMetrics = TRUE)
fingerprints_filtered <- fingerprints[, !low_freq$nzv]
ncol(fingerprints_filtered)
## [1] 388
\(R^2\)
set.seed(123)
train_idx <- createDataPartition(permeability, p = 0.8, list = FALSE)
train_data <- fingerprints_filtered[train_idx, ]
test_data <- fingerprints_filtered[-train_idx, ]
train_labels <- permeability[train_idx]
test_labels <- permeability[-train_idx]
preProc <- preProcess(train_data, method = c("center", "scale"))
train_trans <- predict(preProc, train_data)
test_trans <- predict(preProc, test_data)
plsFit <- train(train_trans, train_labels, method = "pls", tuneLength = 10)
plsFit$bestTune
## ncomp
## 3 3
\(R^2\) ?
pls_preds <- predict(plsFit, test_trans)
pls_r2 <- cor(test_labels, pls_preds)^2
paste("PLS R^2:", round(pls_r2, 4))
## [1] "PLS R^2: 0.2695"
Tried to do a Linear model for this but I failed to get it working.
lm_model <- lm(train_labels ~ ., data = as.data.frame(train_trans))
lm_preds <- predict(lm_model, as.data.frame(test_trans))
lm_r2 <- cor(test_labels, lm_preds)^2
pcr_model <- train(train_trans, train_labels, method = "pcr", tuneLength = 10)
pcr_preds <- predict(pcr_model, test_trans)
pcr_r2 <- cor(test_labels, pcr_preds)^2
cat("PLS R^2:", round(pls_r2, 4), "\n")
## PLS R^2: 0.2695
cat("PCR R^2:", round(pcr_r2, 4), "\n")
## PCR R^2: 0.2687
cat("Linear Model R^2:", round(lm_r2, 4), "\n")
## Linear Model R^2: 0.0785
No, Although the Partial Least Squares model shows the best performance with an \(R^2\) of about 0.296, which is less than 30% of the variance in permeability data. This level of predictive accuracy is not enough to fully replace the detailed and highly accurate data obtained from laboratory experiments.
data(ChemicalManufacturingProcess)
predictors <- ChemicalManufacturingProcess[, 2:58]
yield <- ChemicalManufacturingProcess$Yield
predictors <- predictors %>%
mutate(across(everything(), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
zero_var_cols <- nearZeroVar(predictors)
predictors <- predictors[, -zero_var_cols, drop = FALSE]
cat("Number of predictors after preprocessing:", ncol(predictors), "\n")
## Number of predictors after preprocessing: 56
cat("Any NAs? ", any(is.na(predictors)), "\n")
## Any NAs? FALSE
cat("Any columns with zero variance remaining? ", any(apply(predictors, 2, var) == 0), "\n")
## Any columns with zero variance remaining? FALSE
set.seed(0)
train_idx <- createDataPartition(yield, p = 0.8, list = FALSE)
x_train <- predictors[train_idx, ]
x_test <- predictors[-train_idx, ]
y_train <- yield[train_idx]
y_test <- yield[-train_idx]
preProc <- preProcess(x_train, method = c("center", "scale"))
x_train_scaled <- predict(preProc, x_train)
x_test_scaled <- predict(preProc, x_test)
pls_model <- train(x_train_scaled, y_train, method = "pls",
tuneLength = 10,
trControl = trainControl(method = "repeatedcv", number = 10, repeats = 3))
best_ncomp <- pls_model$bestTune$ncomp
optimal_r2 <- max(pls_model$results$R2)
cat("Optimal ncomp:", best_ncomp, "\n")
## Optimal ncomp: 4
cat("Optimal R^2 on training set:", round(optimal_r2, 4), "\n")
## Optimal R^2 on training set: -Inf
pls_test_preds <- predict(pls_model, x_test_scaled)
test_r2 <- cor(pls_test_preds, y_test)^2
test_rmse <- RMSE(pls_test_preds, y_test)
optimal_r2 <- max(pls_model$results$R2, na.rm = TRUE)
if (!is.na(optimal_r2) && !is.infinite(optimal_r2)) {
r2_diff <- optimal_r2 - test_r2
} else {
r2_diff <- NA
}
cat("Test R²:", round(test_r2, 4), "\n")
## Test R²: 0.6538
cat("Test RMSE:", round(test_rmse, 4), "\n")
## Test RMSE: 1.0308
cat("Difference in R²:", round(r2_diff, 4), "\n")
## Difference in R²: NA
enet_model <- enet(x = as.matrix(x_train), y = y_train, lambda = 0.5, normalize = TRUE)
enet_coefs <- predict(enet_model, s = 0.35, type = "coefficients", mode = "fraction")$coefficients
non_zero_coefs <- enet_coefs[enet_coefs != 0]
sort(non_zero_coefs, decreasing = TRUE)
## ManufacturingProcess34 ManufacturingProcess09 ManufacturingProcess32
## 6.316599e-01 2.025896e-01 1.104772e-01
## BiologicalMaterial06 ManufacturingProcess06 BiologicalMaterial03
## 2.679143e-02 2.100535e-02 1.435462e-02
## BiologicalMaterial02 ManufacturingProcess12 ManufacturingProcess37
## 9.790877e-03 2.105723e-05 -3.550973e-02
## ManufacturingProcess17 ManufacturingProcess13 ManufacturingProcess36
## -1.732512e-01 -2.830565e-01 -3.955832e+02
there is strong positive relationship between Manufacturing Process and yield. This may mean that the variable is essential to output in the chemical manufacturing process. This relationship can help focus on monitoring and optimizing Manufacturing Process during production. This can be maintained at the higher end of its operational range, the yield may be consistently improved.
top_feature <- names(sort(abs(non_zero_coefs), decreasing = TRUE))[1]
value_range <- seq(min(predictors[[top_feature]]), max(predictors[[top_feature]]), length.out = 100)
newdata <- data.frame(matrix(rep(colMeans(predictors), each = 100), nrow = 100))
colnames(newdata) <- colnames(predictors)
newdata[[top_feature]] <- value_range
pred_yield <- predict(enet_model, newx = as.matrix(newdata), s = 0.35, mode = "fraction", type = "fit")$fit
ggplot(data.frame(value_range, pred_yield), aes(x = value_range, y = pred_yield)) +
geom_line(color = "blue") +
labs(title = paste("Effect of", top_feature, "on Yield"),
x = top_feature, y = "Predicted Yield") +
theme_minimal()