library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(tidyverse)
data(permeability)
nzv <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv]
num_predictors_left <- ncol(filtered_fingerprints)
num_predictors_left
## [1] 388
set.seed(123)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
X_train <- filtered_fingerprints[trainIndex, ]
X_test <- filtered_fingerprints[-trainIndex, ]
y_train <- permeability[trainIndex]
y_test <- permeability[-trainIndex]
preproc <- preProcess(X_train)
X_train_pp <- predict(preproc, X_train)
X_test_pp <- predict(preproc, X_test)
ctrl <- trainControl(method = "cv", number = 10)
pls_model <- train(X_train_pp, y_train,
method = "pls",
tuneLength = 20,
trControl = ctrl,
metric = "Rsquared")
pls_model
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.36940 0.3397706 10.315252
## 2 11.78929 0.4816147 8.575238
## 3 11.90097 0.4849485 9.105076
## 4 12.00362 0.4962113 9.407429
## 5 11.76339 0.5219280 9.024291
## 6 11.55813 0.5310320 8.646010
## 7 11.56404 0.5294165 8.761596
## 8 11.83939 0.5159611 9.222387
## 9 11.97478 0.5174448 9.199224
## 10 12.57694 0.4792439 9.678215
## 11 12.73620 0.4725734 9.714152
## 12 13.03145 0.4514959 9.968887
## 13 13.06366 0.4396748 9.834200
## 14 13.38089 0.4168034 10.057344
## 15 13.60993 0.4011096 10.200724
## 16 13.74283 0.3964039 10.201961
## 17 13.97149 0.3844652 10.412445
## 18 14.21149 0.3695810 10.612168
## 19 14.23455 0.3709200 10.593149
## 20 14.31085 0.3732814 10.675872
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 6.
pls_pred <- predict(pls_model, X_test_pp)
test_R2 <- cor(pls_pred, y_test)^2
test_R2
## [1] 0.3244542
set.seed(123)
rf_model <- train(X_train_pp, y_train, method = "rf", trControl = ctrl, metric = "Rsquared")
svm_model <- train(X_train_pp, y_train, method = "svmRadial", trControl = ctrl, metric = "Rsquared")
rf_model
## Random Forest
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 11.99516 0.5315122 9.116588
## 195 10.42245 0.5809743 7.487867
## 388 10.59277 0.5611845 7.612958
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 195.
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 120, 119, 120, 120, 119, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 12.85182 0.5586969 8.422283
## 0.50 11.94997 0.5669842 7.962381
## 1.00 11.02712 0.5849114 7.547417
##
## Tuning parameter 'sigma' was held constant at a value of 0.001587463
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.001587463 and C = 1.
While the Random Forest and SVM models provide reasonable predictive power (with R^2 ≈ 0.58), they are not yet accurate enough to fully replace the permeability lab experiment.
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
ls()
## [1] "ChemicalManufacturingProcess" "ctrl"
## [3] "filtered_fingerprints" "fingerprints"
## [5] "num_predictors_left" "nzv"
## [7] "permeability" "pls_model"
## [9] "pls_pred" "preproc"
## [11] "rf_model" "svm_model"
## [13] "test_R2" "trainIndex"
## [15] "X_test" "X_test_pp"
## [17] "X_train" "X_train_pp"
## [19] "y_test" "y_train"
str(ChemicalManufacturingProcess)
## 'data.frame': 176 obs. of 58 variables:
## $ Yield : num 38 42.4 42 41.4 42.5 ...
## $ BiologicalMaterial01 : num 6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
## $ BiologicalMaterial02 : num 49.6 61 61 61 63.3 ...
## $ BiologicalMaterial03 : num 57 67.5 67.5 67.5 72.2 ...
## $ BiologicalMaterial04 : num 12.7 14.7 14.7 14.7 14 ...
## $ BiologicalMaterial05 : num 19.5 19.4 19.4 19.4 17.9 ...
## $ BiologicalMaterial06 : num 43.7 53.1 53.1 53.1 54.7 ...
## $ BiologicalMaterial07 : num 100 100 100 100 100 100 100 100 100 100 ...
## $ BiologicalMaterial08 : num 16.7 19 19 19 18.2 ...
## $ BiologicalMaterial09 : num 11.4 12.6 12.6 12.6 12.8 ...
## $ BiologicalMaterial10 : num 3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
## $ BiologicalMaterial11 : num 138 154 154 154 148 ...
## $ BiologicalMaterial12 : num 18.8 21.1 21.1 21.1 21.1 ...
## $ ManufacturingProcess01: num NA 0 0 0 10.7 12 11.5 12 12 12 ...
## $ ManufacturingProcess02: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess03: num NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
## $ ManufacturingProcess04: num NA 917 912 911 918 924 933 929 928 938 ...
## $ ManufacturingProcess05: num NA 1032 1004 1015 1028 ...
## $ ManufacturingProcess06: num NA 210 207 213 206 ...
## $ ManufacturingProcess07: num NA 177 178 177 178 178 177 178 177 177 ...
## $ ManufacturingProcess08: num NA 178 178 177 178 178 178 178 177 177 ...
## $ ManufacturingProcess09: num 43 46.6 45.1 44.9 45 ...
## $ ManufacturingProcess10: num NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
## $ ManufacturingProcess11: num NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
## $ ManufacturingProcess12: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess13: num 35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
## $ ManufacturingProcess14: num 4898 4869 4878 4897 4992 ...
## $ ManufacturingProcess15: num 6108 6095 6087 6102 6233 ...
## $ ManufacturingProcess16: num 4682 4617 4617 4635 4733 ...
## $ ManufacturingProcess17: num 35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
## $ ManufacturingProcess18: num 4865 4867 4877 4872 4886 ...
## $ ManufacturingProcess19: num 6049 6097 6078 6073 6102 ...
## $ ManufacturingProcess20: num 4665 4621 4621 4611 4659 ...
## $ ManufacturingProcess21: num 0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
## $ ManufacturingProcess22: num NA 3 4 5 8 9 1 2 3 4 ...
## $ ManufacturingProcess23: num NA 0 1 2 4 1 1 2 3 1 ...
## $ ManufacturingProcess24: num NA 3 4 5 18 1 1 2 3 4 ...
## $ ManufacturingProcess25: num 4873 4869 4897 4892 4930 ...
## $ ManufacturingProcess26: num 6074 6107 6116 6111 6151 ...
## $ ManufacturingProcess27: num 4685 4630 4637 4630 4684 ...
## $ ManufacturingProcess28: num 10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
## $ ManufacturingProcess29: num 21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
## $ ManufacturingProcess30: num 9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
## $ ManufacturingProcess31: num 69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
## $ ManufacturingProcess32: num 156 169 173 171 171 173 159 161 160 164 ...
## $ ManufacturingProcess33: num 66 66 66 68 70 70 65 65 65 66 ...
## $ ManufacturingProcess34: num 2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ ManufacturingProcess35: num 486 508 509 496 468 490 475 478 491 488 ...
## $ ManufacturingProcess36: num 0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
## $ ManufacturingProcess37: num 0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
## $ ManufacturingProcess38: num 3 2 2 2 2 2 2 2 3 3 ...
## $ ManufacturingProcess39: num 7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
## $ ManufacturingProcess40: num NA 0.1 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess41: num NA 0.15 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess42: num 11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
## $ ManufacturingProcess43: num 3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
## $ ManufacturingProcess44: num 1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
## $ ManufacturingProcess45: num 2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
processPredictors <- ChemicalManufacturingProcess[, -1]
yield <- ChemicalManufacturingProcess[, 1]
sum(is.na(processPredictors))
## [1] 106
preproc2 <- preProcess(processPredictors, method = "knnImpute")
processPredictors_imp <- predict(preproc2, processPredictors)
set.seed(123)
trainIndex2 <- createDataPartition(yield, p = 0.8, list = FALSE)
X_train2 <- processPredictors_imp[trainIndex2, ]
X_test2 <- processPredictors_imp[-trainIndex2, ]
y_train2 <- yield[trainIndex2]
y_test2 <- yield[-trainIndex2]
ctrl2 <- trainControl(method = "cv", number = 10)
ridge_model <- train(X_train2, y_train2,
method = "ridge",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = ctrl2,
metric = "Rsquared")
ridge_model
## Ridge Regression
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 131, 130, 130, 129, 131, 129, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.0000000000 6.742444 0.3775102 2.664958
## 0.0001000000 5.991244 0.3985266 2.427099
## 0.0002371374 5.392968 0.4254371 2.228818
## 0.0005623413 4.839187 0.4227875 2.092529
## 0.0013335214 4.477744 0.4089676 2.003610
## 0.0031622777 4.199031 0.4081150 1.918281
## 0.0074989421 3.842863 0.4184952 1.797845
## 0.0177827941 3.323832 0.4410475 1.626668
## 0.0421696503 2.716389 0.4781883 1.433809
## 0.1000000000 2.171541 0.5257725 1.263974
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was lambda = 0.1.
ridge_pred <- predict(ridge_model, X_test2)
test_R2_chem <- cor(ridge_pred, y_test2)^2
test_R2_chem
## [1] 0.4997636
importance <- varImp(ridge_model, scale = TRUE)
plot(importance, top = 15)
top_predictors <- rownames(importance$importance)[order(importance$importance$Overall, decreasing = TRUE)][1:5]
top_predictors
## [1] "ManufacturingProcess32" "BiologicalMaterial06" "BiologicalMaterial03"
## [4] "ManufacturingProcess13" "ManufacturingProcess36"
The top five predictors most strongly associated with product yield were ManufacturingProcess32, BiologicalMaterial06, BiologicalMaterial03, ManufacturingProcess13, and ManufacturingProcess36. This suggests that both process parameters and raw material quality play important roles in determining yield. While biological predictors like BiologicalMaterial06 and BiologicalMaterial03 cannot be modified during production, they can be used to screen and select higher-quality raw materials. On the other hand, process variables such as ManufacturingProcess32 offer opportunities for optimization and control to enhance yield in future manufacturing runs.