Taking a look at the data:
data(permeability)
summary(permeability)
## permeability
## Min. : 0.06
## 1st Qu.: 1.55
## Median : 4.91
## Mean :12.24
## 3rd Qu.:15.47
## Max. :55.60
head(permeability)
## permeability
## 1 12.520
## 2 1.120
## 3 19.405
## 4 1.730
## 5 1.680
## 6 0.510
# summary(fingerprints)
hist(permeability)
summary(apply(fingerprints, 2, mean))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.006061 0.024242 0.154767 0.181818 1.000000
initial_num_predictors <- ncol(fingerprints)
initial_num_predictors
## [1] 1107
near_zero_variables <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -near_zero_variables]
num_predictors_left <- ncol(filtered_fingerprints)
num_predictors_left
## [1] 388
It looks like 719 predictors got removed. We have 388 predictors left.
set.seed(123)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
train_fingerprints <- filtered_fingerprints[trainIndex, ]
test_fingerprints <- filtered_fingerprints[-trainIndex, ]
train_permeability <- permeability[trainIndex]
test_permeability <- permeability[-trainIndex]
preProcess_params <- preProcess(train_fingerprints, method = c("center", "scale"))
train_fingerprints <- predict(preProcess_params, train_fingerprints)
test_fingerprints <- predict(preProcess_params, test_fingerprints)
train_control <- trainControl(method = "cv", number = 10)
pls_model <- train(
x = train_fingerprints,
y = train_permeability,
method = "pls",
tuneLength = 20,
trControl = train_control,
preProcess = c("center", "scale")
)
optimal_components <- pls_model$bestTune$ncomp
The number of optimal components is 6, and the resulting resampled R-squared is 0.53.
We can look at the rmsep as we add components:
best_r2 <- max(pls_model$results$Rsquared)
list(optimal_components = optimal_components, best_r2 = best_r2)
## $optimal_components
## [1] 6
##
## $best_r2
## [1] 0.5335956
rmsep_values <- pls_model$results
ggplot(rmsep_values, aes(x = ncomp, y = RMSE)) +
geom_line() +
geom_point() +
labs(
title = "RMSEP vs Number of Components in PLS Model",
x = "Number of Components",
y = "RMSEP"
) +
theme_minimal()
test_predictions <- predict(pls_model, newdata = test_fingerprints)
test_r2 <- cor(test_predictions, test_permeability)^2
test_r2
## [1] 0.3244542
The test estimate R-squared is 0.3244542.
train_control <- trainControl(method = "cv", number = 10)
ridge_model <- train(
x = train_fingerprints,
y = train_permeability,
method = "ridge",
trControl = train_control,
preProcess = c("center", "scale")
)
## Warning: model fit failed for Fold02: lambda=0e+00 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
lasso_model <- train(
x = train_fingerprints,
y = train_permeability,
method = "lasso",
trControl = train_control,
preProcess = c("center", "scale")
)
## Warning: model fit failed for Fold03: fraction=0.9 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: There were missing values in resampled performance measures.
elastic_net_model <- train(
x = train_fingerprints,
y = train_permeability,
method = "glmnet",
tuneGrid = expand.grid(alpha = seq(0, 1, length = 10), lambda = seq(0.01, 0.1, length = 10)),
trControl = train_control,
preProcess = c("center", "scale")
)
model_results <- resamples(list(
PLS = pls_model,
Ridge = ridge_model,
Lasso = lasso_model,
ElasticNet = elastic_net_model
))
summary(model_results)
##
## Call:
## summary.resamples(object = model_results)
##
## Models: PLS, Ridge, Lasso, ElasticNet
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 5.940748 7.855195 8.101540 8.658301 8.587837 12.53980 0
## Ridge 5.151345 7.914686 9.297293 9.609113 11.384128 15.00864 0
## Lasso 5.580568 8.240237 9.799450 14.693278 11.754338 43.53765 1
## ElasticNet 6.805424 7.517214 7.980207 8.463982 8.901325 11.12721 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 7.479049 10.66227 11.18215 11.53275 11.66271 16.49145 0
## Ridge 7.919648 10.69651 12.44402 12.91785 15.43402 18.02992 0
## Lasso 7.170588 11.62398 12.19772 20.47861 17.05290 63.21441 1
## ElasticNet 10.030656 10.12274 10.49953 11.44026 11.49829 15.68488 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 2.072535e-01 0.3875424 0.5626977 0.5335956 0.6291758 0.8326590 0
## Ridge 1.520028e-01 0.4317830 0.5265775 0.5272785 0.7257137 0.8134886 0
## Lasso 4.994135e-06 0.2185169 0.3885725 0.4089517 0.6773989 0.7810474 1
## ElasticNet 1.828886e-01 0.2909709 0.5741940 0.5211502 0.6512159 0.8751188 0
r2_summary <- summary(model_results)$statistics$Rsquared
r2_means <- r2_summary[, "Mean"]
best_model <- names(which.max(r2_means))
best_model
## [1] "PLS"
PLS is the best model based on R-squared.
Based on R-squared the lasso model is the best of the bunch, and that’s with a score of 0.57, which is fairly low indicates weak predictive power. The best model we have won’t capture enough variability in permeability to justify using it over experiments, and being a scientific setting we would need to be as precise and accurate as possible.
A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), 6.5 Computing 139 measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch: a. Start R and use these commands to load the data:
data(ChemicalManufacturingProcess)
# str(ChemicalManufacturingProcess)
# summary(ChemicalManufacturingProcess)
head(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 38.00 6.25 49.58 56.97
## 2 42.44 8.01 60.97 67.48
## 3 42.03 8.01 60.97 67.48
## 4 41.42 8.01 60.97 67.48
## 5 42.49 7.47 63.33 72.25
## 6 43.57 6.12 58.36 65.31
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 12.74 19.51 43.73
## 2 14.65 19.36 53.14
## 3 14.65 19.36 53.14
## 4 14.65 19.36 53.14
## 5 14.02 17.91 54.66
## 6 15.17 21.79 51.23
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1 100 16.66 11.44
## 2 100 19.04 12.55
## 3 100 19.04 12.55
## 4 100 19.04 12.55
## 5 100 18.22 12.80
## 6 100 18.30 12.13
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1 3.46 138.09 18.83
## 2 3.46 153.67 21.05
## 3 3.46 153.67 21.05
## 4 3.46 153.67 21.05
## 5 3.05 147.61 21.05
## 6 3.78 151.88 20.76
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1 NA NA NA
## 2 0.0 0 NA
## 3 0.0 0 NA
## 4 0.0 0 NA
## 5 10.7 0 NA
## 6 12.0 0 NA
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1 NA NA NA
## 2 917 1032.2 210.0
## 3 912 1003.6 207.1
## 4 911 1014.6 213.3
## 5 918 1027.5 205.7
## 6 924 1016.8 208.9
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1 NA NA 43.00
## 2 177 178 46.57
## 3 178 178 45.07
## 4 177 177 44.92
## 5 178 178 44.96
## 6 178 178 45.32
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1 NA NA NA
## 2 NA NA 0
## 3 NA NA 0
## 4 NA NA 0
## 5 NA NA 0
## 6 NA NA 0
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1 35.5 4898 6108
## 2 34.0 4869 6095
## 3 34.8 4878 6087
## 4 34.8 4897 6102
## 5 34.6 4992 6233
## 6 34.0 4985 6222
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1 4682 35.5 4865
## 2 4617 34.0 4867
## 3 4617 34.8 4877
## 4 4635 34.8 4872
## 5 4733 33.9 4886
## 6 4786 33.4 4862
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1 6049 4665 0.0
## 2 6097 4621 0.0
## 3 6078 4621 0.0
## 4 6073 4611 0.0
## 5 6102 4659 -0.7
## 6 6115 4696 -0.6
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1 NA NA NA
## 2 3 0 3
## 3 4 1 4
## 4 5 2 5
## 5 8 4 18
## 6 9 1 1
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1 4873 6074 4685
## 2 4869 6107 4630
## 3 4897 6116 4637
## 4 4892 6111 4630
## 5 4930 6151 4684
## 6 4871 6128 4687
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1 10.7 21.0 9.9
## 2 11.2 21.4 9.9
## 3 11.1 21.3 9.4
## 4 11.1 21.3 9.4
## 5 11.3 21.6 9.0
## 6 11.4 21.7 10.1
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1 69.1 156 66
## 2 68.7 169 66
## 3 69.3 173 66
## 4 69.3 171 68
## 5 69.4 171 70
## 6 68.2 173 70
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1 2.4 486 0.019
## 2 2.6 508 0.019
## 3 2.6 509 0.018
## 4 2.5 496 0.018
## 5 2.5 468 0.017
## 6 2.5 490 0.018
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1 0.5 3 7.2
## 2 2.0 2 7.2
## 3 0.7 2 7.2
## 4 1.2 2 7.2
## 5 0.2 2 7.3
## 6 0.4 2 7.2
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1 NA NA 11.6
## 2 0.1 0.15 11.1
## 3 0.0 0.00 12.0
## 4 0.0 0.00 10.6
## 5 0.0 0.00 11.0
## 6 0.0 0.00 11.5
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1 3.0 1.8 2.4
## 2 0.9 1.9 2.2
## 3 1.0 1.8 2.3
## 4 1.1 1.8 2.1
## 5 1.1 1.7 2.1
## 6 2.2 1.8 2.0
rows_with_na <- ChemicalManufacturingProcess[!complete.cases(ChemicalManufacturingProcess), ]
NROW(rows_with_na)
## [1] 24
data(ChemicalManufacturingProcess)
predictors <- ChemicalManufacturingProcess[, -1]
preProcess_params <- preProcess(predictors, method = "medianImpute")
predictors_imputed <- predict(preProcess_params, predictors)
sum(is.na(predictors_imputed))
## [1] 0
ChemicalManufacturingProcess_imputed <- cbind(Yield = ChemicalManufacturingProcess$Yield, predictors_imputed)
rows_with_na_after <- ChemicalManufacturingProcess_imputed[!complete.cases(ChemicalManufacturingProcess_imputed), ]
NROW(rows_with_na_after)
## [1] 0
set.seed(123)
trainIndex <- createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.8, list = FALSE)
train_data <- ChemicalManufacturingProcess[trainIndex, ]
test_data <- ChemicalManufacturingProcess[-trainIndex, ]
train_predictors <- train_data[, -1]
train_yield <- train_data$Yield
test_predictors <- test_data[, -1]
test_yield <- test_data$Yield
preProcess_params <- preProcess(train_predictors, method = c("center", "scale"))
train_predictors <- predict(preProcess_params, train_predictors)
test_predictors <- predict(preProcess_params, test_predictors)
train_control <- trainControl(method = "cv", number = 10)
pls_model <- train(
x = train_predictors,
y = train_yield,
method = "pls",
tuneLength = 20,
trControl = train_control,
preProcess = c("center", "scale")
)
optimal_components <- pls_model$bestTune$ncomp
optimal_rmse <- min(pls_model$results$RMSE)
optimal_r2 <- max(pls_model$results$Rsquared)
list(
optimal_components = optimal_components,
optimal_rmse = optimal_rmse,
optimal_r2 = optimal_r2
)
## $optimal_components
## [1] 3
##
## $optimal_rmse
## [1] 1.131543
##
## $optimal_r2
## [1] 0.6499904
The optimal number of components (3) gives us an RMSE of 1.13.
rmse_values <- pls_model$results
ggplot(rmse_values, aes(x = ncomp, y = RMSE)) +
geom_line() +
geom_point() +
labs(
title = "RMSEP vs Number of Components in PLS Model",
x = "Number of Components",
y = "RMSEP"
) +
theme_minimal()
sum(is.na(test_predictions))
## [1] 0
sum(is.na(test_yield))
## [1] 0
complete_cases <- !is.na(test_predictions)
test_predictions <- test_predictions[complete_cases]
test_yield <- test_yield[complete_cases]
test_rmse <- RMSE(test_predictions, test_yield)
ss_total <- sum((test_yield - mean(test_yield))^2)
ss_residual <- sum((test_yield - test_predictions)^2)
test_r2 <- 1 - (ss_residual / ss_total)
train_rmse <- min(pls_model$results$RMSE)
train_r2 <- max(pls_model$results$Rsquared)
list(
test_rmse = test_rmse,
test_r2 = test_r2,
train_rmse = train_rmse,
train_r2 = train_r2
)
## $test_rmse
## [1] 30.72849
##
## $test_r2
## [1] -285.1475
##
## $train_rmse
## [1] 1.131543
##
## $train_r2
## [1] 0.6499904
The test RMSE is higher than the train RMSE.
vip_scores <- vip(pls_model$finalModel)
vip_scores
library(ggplot2)
top_predictors <- c("ManufacturingProcess09", "ManufacturingProcess17",
"ManufacturingProcess13", "ManufacturingProcess32",
"ManufacturingProcess36", "BiologicalMaterial03",
"BiologicalMaterial06", "BiologicalMaterial02",
"ManufacturingProcess06", "ManufacturingProcess11")
selected_data <- ChemicalManufacturingProcess[, c("Yield", top_predictors)]
long_data <- melt(selected_data, id.vars = "Yield", variable.name = "Predictor", value.name = "Value")
ggplot(long_data, aes(x = Value, y = Yield)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
facet_wrap(~ Predictor, scales = "free_x") +
labs(title = "Relationships between Top Predictors and Yield",
x = "Predictor Value",
y = "Yield") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
I would definitely address some of these- for example
ManufacturingProcess36 seems to be categorical. We could also look into
removing some of the outliers that might be skering
ManufacturingProcess06, 09, 17, and 13.
library(ggplot2)
library(reshape2)
cor_data <- ChemicalManufacturingProcess[, c("Yield", top_predictors)]
cor_matrix <- cor(cor_data, use = "complete.obs")
melted_cor_matrix <- melt(cor_matrix)
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name="Correlation") +
theme_minimal() +
labs(title = "Correlation Matrix of Top Predictors and Yield",
x = "Predictors",
y = "Predictors") +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
hjust = 1))
Some of these also correlate with each other like BiologicalMaterial02 and 3, and 02 and 06.